In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [10]:
# Read the movie metadata (low_memory=False parses the file in one pass rather
# than in internal chunks) and keep only the first 100 rows.
df = pd.read_csv("datasets/movies_metadata.csv", low_memory=False).head(100)


In [11]:
# Keep only the columns this notebook uses. The explicit .copy() makes the
# result an independent frame, so the later `df['overview'] = ...` assignment
# writes to its own data and does not trigger pandas' SettingWithCopyWarning.
df = df[['id', 'title', 'overview']].copy()
df.head(1)

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."


In [12]:
# Replace missing overviews with empty strings so every row holds text.
df['overview'] = df['overview'].fillna('')

# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words="english")

In [13]:
# Fit the vectorizer on the overview column and transform it in one step.
tfidf_matrix = tfidf.fit_transform(df['overview'])
# Display the matrix dimensions (one row per overview passed in).
tfidf_matrix.shape

(100, 1895)

In [14]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix. With a
# single argument, scikit-learn compares the rows of the matrix with themselves.
cosine_sim = cosine_similarity(tfidf_matrix)
# Similarities of the row at position 1 against every row.
cosine_sim[1]



array([0.01598601, 1.        , 0.05095125, 0.        , 0.        ,
       0.06184943, 0.        , 0.        , 0.12908223, 0.        ,
       0.01315689, 0.        , 0.        , 0.01316551, 0.        ,
       0.        , 0.        , 0.02874849, 0.        , 0.        ,
       0.        , 0.        , 0.0100649 , 0.        , 0.        ,
       0.0367933 , 0.02644778, 0.01683479, 0.        , 0.        ,
       0.        , 0.03348817, 0.        , 0.00931831, 0.00822231,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.0381127 , 0.        ,
       0.        , 0.0210013 , 0.01165518, 0.        , 0.01742351,
       0.0087352 , 0.        , 0.03310006, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02243943, 0.        , 0.02144823,
       0.01150284, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.0143346 , 0.        , 0.     

In [15]:
# Keep only the rows that have a title.
# NOTE(review): cosine_sim was computed before this filter. If any row is
# dropped here, positional lookups on df (.iloc) no longer line up with the
# rows of cosine_sim.
df = df[df['title'].notna()]


In [16]:
# Map each title to its index label in df. When a title occurs more than
# once, only the last occurrence is kept.
indices = (
    pd.Series(df.index, index=df['title'])
    .loc[lambda title_map: ~title_map.index.duplicated(keep='last')]
)
indices

title
Toy Story                         0
Jumanji                           1
Grumpier Old Men                  2
Waiting to Exhale                 3
Father of the Bride Part II       4
                                 ..
La Haine                         95
Shopping                         96
Heidi Fleiss: Hollywood Madam    97
City Hall                        98
Bottle Rocket                    99
Length: 100, dtype: int64

In [17]:
# Look up the index label stored for the title 'Toy Story'.
target_movie_index = indices['Toy Story']
target_movie_index

0

In [18]:
# Similarity of the target movie's row against every row of the matrix.
cosine_sim[target_movie_index]


array([1.        , 0.01598601, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05458107, 0.        , 0.        ,
       0.01154289, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.02818954, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01001783, 0.        , 0.        ,
       0.01153539, 0.        , 0.        , 0.        , 0.01564414,
       0.01719673, 0.0123872 , 0.        , 0.        , 0.02021326,
       0.        , 0.02899379, 0.02263477, 0.        , 0.03478363,
       0.        , 0.        , 0.01329742, 0.        , 0.01449816,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [19]:
# Wrap the target movie's similarity row in a one-column frame so it can be
# sorted while the original row positions are kept in the index.
similarity_scores = pd.DataFrame({"score": cosine_sim[target_movie_index]})
similarity_scores

Unnamed: 0,score
0,1.000000
1,0.015986
2,0.000000
3,0.000000
4,0.000000
...,...
95,0.000000
96,0.000000
97,0.000000
98,0.000000


In [20]:
# Index labels of the 11 highest-scoring rows, best first.
movie_indices = similarity_scores.sort_values("score", ascending=False).index[:11]

In [21]:
# Titles at the selected positions. .iloc is positional, so this assumes the
# rows of df are still in the same order and count as the rows of cosine_sim.
df['title'].iloc[movie_indices]


0                                    Toy Story
17                                  Four Rooms
59                  The Indian in the Cupboard
83                 Last Summer in the Hamptons
56                       Home for the Holidays
33                                        Babe
57                                 The Postman
82    Once Upon a Time... When We Were Colored
54                                     Georgia
76                                   Nico Icon
50                              Guardian Angel
Name: title, dtype: object

In [22]:
def get_films_by_name(movie_name, movie_indices):
    """Return the entries of ``movie_indices`` whose index label contains ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Pattern passed to ``Index.str.contains`` with its default settings
        (case-sensitive, interpreted as a regular expression).
    movie_indices : pandas.Series
        Series whose index holds the titles to search.

    Returns
    -------
    pandas.Series
        The matching subset of ``movie_indices``. Missing index labels are
        treated as non-matching. The result is empty when nothing matches.
    """
    title_matches = movie_indices.index.str.contains(movie_name, na=False)
    return movie_indices[title_matches]
# Search the title -> index Series for titles containing 'Lord'.
get_films_by_name('Lord', indices)

Series([], dtype: int64)

In [24]:
def get_recommended_movies(target_movie_index, movie_similarities, movies_df, top_n=11):
    """Return the titles of the rows most similar to a target row.

    Parameters
    ----------
    target_movie_index : int
        Row of ``movie_similarities`` to rank against.
    movie_similarities : array-like
        Indexable collection where ``movie_similarities[i]`` is the sequence of
        similarity scores of row ``i`` against every row.
    movies_df : pandas.DataFrame
        Frame with a ``'title'`` column. Rows are looked up by position, so
        they must be in the same order as the rows of ``movie_similarities``.
    top_n : int, default 11
        Number of rows to return. The target row is not filtered out, so it
        normally takes the first slot when its self-similarity is the maximum.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` highest-scoring rows, best first.
    """
    similarity_scores = pd.DataFrame(movie_similarities[target_movie_index], columns=["score"])
    ranked = similarity_scores.sort_values("score", ascending=False)
    movie_indices = ranked.index[:top_n]
    # Use the frame passed in by the caller; the previous version ignored
    # `movies_df` and read the notebook-global `df` instead.
    return movies_df['title'].iloc[movie_indices]
# NOTE(review): an earlier note here said index 2007 is "The Lord of the Rings".
# Only the first 100 rows are loaded in this notebook, so that index is out of
# range; row 77 is used instead (the top row of the output below shows its title).
get_recommended_movies(77,cosine_sim,df) 

77                         The Crossing Guard
54                                    Georgia
45              How To Make An American Quilt
71                      Kicking and Screaming
22                                  Assassins
57                                The Postman
94                         A Midwinter's Tale
59                 The Indian in the Cupboard
79                          The White Balloon
2                            Grumpier Old Men
80    Things to Do in Denver When You're Dead
Name: title, dtype: object