In [3]:
import pandas as pd
df = pd.read_csv("/content/dataset.csv")
print(df)

          id                                          title  \
0        278                       The Shawshank Redemption   
1      19404                    Dilwale Dulhania Le Jayenge   
2        238                                  The Godfather   
3        424                               Schindler's List   
4        240                         The Godfather: Part II   
...      ...                                            ...   
9995   10196                             The Last Airbender   
9996  331446                       Sharknado 3: Oh Hell No!   
9997   13995                                Captain America   
9998    2312  In the Name of the King: A Dungeon Siege Tale   
9999  455957                                         Domino   

                                                 genre original_language  \
0                                          Drama,Crime                en   
1                                 Comedy,Drama,Romance                hi   
2              

In [4]:
features = df[['id','title','genre','overview']]
features

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...
...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...


In [5]:
df['tags'] = df['overview'] + df['genre'] # define separate column called tags

In [6]:
#from the updated dataset, we do not need the overview and genre columns anymore
new_df = df[['id','title','tags']]
new_df

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000, stop_words = 'english')
vector = cv.fit_transform(new_df['tags'].values.astype('U')).toarray()
print(vector.shape)

(10000, 10000)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
print(similarity)

[[1.         0.05634362 0.12888482 ... 0.07559289 0.11065667 0.06388766]
 [0.05634362 1.         0.07624929 ... 0.         0.03636965 0.        ]
 [0.12888482 0.07624929 1.         ... 0.02273314 0.06655583 0.08645856]
 ...
 [0.07559289 0.         0.02273314 ... 1.         0.03253    0.02817181]
 [0.11065667 0.03636965 0.06655583 ... 0.03253    1.         0.0412393 ]
 [0.06388766 0.         0.08645856 ... 0.02817181 0.0412393  1.        ]]


In [11]:
index = new_df[new_df["title"] == "The Godfather"].index[0] # we access the index of a movie based on its title

In [14]:
# so if we access the index, we can find the similarity score of all the movies
similarity_values = sorted(list(enumerate(similarity[index])),reverse=True, key = lambda x:x[1]) #sort based on the score
print(similarity_values)

[(2, 1.0000000000000004), (4, 0.48976229911514363), (7419, 0.3521803625302496), (153, 0.3354968547317302), (2624, 0.3234983196103152), (9520, 0.3112864031823452), (2412, 0.3081578172139684), (330, 0.30499714066520933), (5010, 0.2995012465378748), (779, 0.29606845410646954), (7049, 0.29606845410646954), (9362, 0.2934836354418746), (4569, 0.29261523994305977), (3670, 0.2893456933022473), (4872, 0.28934569330224724), (1816, 0.2857953049377246), (4811, 0.28529870107872785), (6964, 0.2803652103289399), (4380, 0.2798845714165278), (734, 0.2758802939230217), (5605, 0.2758802939230217), (1223, 0.2756247308353552), (6788, 0.2756247308353552), (9245, 0.2756247308353552), (8555, 0.2744974265986884), (709, 0.2727977357881894), (3742, 0.2686124597780274), (519, 0.26687249808205815), (821, 0.26687249808205815), (6565, 0.26687249808205815), (250, 0.26622333025588873), (8503, 0.26622333025588873), (747, 0.2652335521937267), (233, 0.2641352718976872), (2272, 0.2641352718976872), (7866, 0.25890435250935

In [16]:
# so we'll access the top 5 movies
for value in similarity_values[:5]:
  print(value, new_df.iloc[value[0]].title) # get the top movies and print them

(2, 1.0000000000000004) The Godfather
(4, 0.48976229911514363) The Godfather: Part II
(7419, 0.3521803625302496) Blood Ties
(153, 0.3354968547317302) Joker
(2624, 0.3234983196103152) Bomb City


In [17]:
def recommend(movie): # so this will do the recommendation of the top 5 movies, can be leveraged based on the needs
  index = new_df[new_df["title"] == movie].index[0]
  similarity_values = sorted(list(enumerate(similarity[index])),reverse=True, key = lambda x:x[1])
  for value in similarity_values[:5]:
    print(new_df.iloc[value[0]].title)
recommend("Joker")

Joker
Blood Ties
The Big Heat
Dead Man Down
Chopper


In [22]:
# pickle the files for later uses like web dev
import pickle
pickle.dump(new_df, open("/content/movies.pkl","wb"))

In [23]:
pickle.dump(similarity, open("/content/similarity.pkl","wb"))

In [24]:
#read the pickle files
pickle.load(open("/content/movies.pkl","rb"))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...
