In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
df1.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [4]:
df2.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
df1.rename(columns={'movie_id':'id'}, inplace=True)

In [6]:
df3 = df2.merge(df1)

In [7]:
df3.shape

(4803, 22)

In [8]:
df3.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

In [9]:
df3['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

###### Import TFIdfVectorizer from scikit-learn

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

###### Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'

In [11]:
tfidf = TfidfVectorizer(stop_words='english')

###### Replace NaN with an empty string

In [17]:
df3['overview'] = df3['overview'].fillna('')

###### Construct the required TF-IDF matrix by fitting and transforming the data

In [20]:
tfidf_matrix = tfidf.fit_transform(df2['overview'].values.astype('U'))

###### Output the shape of tfidf_matrix

In [21]:
tfidf_matrix.shape

(4803, 20979)

In [22]:
tfidf_matrix

<4803x20979 sparse matrix of type '<class 'numpy.float64'>'
	with 125843 stored elements in Compressed Sparse Row format>

In [23]:
df3.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4793, 4794, 4795, 4796, 4797, 4798, 4799, 4800, 4801, 4802],
           dtype='int64', length=4803)

###### Import linear_kernel

In [24]:
from sklearn.metrics.pairwise import linear_kernel

###### Compute the cosine similarity matrix

In [26]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
cosine_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

###### Construct a reverse map of indices and movie titles

In [29]:
indices = pd.Series(df3.index, index=df3['title']).drop_duplicates()

In [33]:
indices

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [35]:
indices['Newlyweds']

4799

In [42]:
cosine_sim[indices['Newlyweds']]

array([0., 0., 0., ..., 0., 0., 0.])

In [47]:
list(enumerate(cosine_sim[indices['Newlyweds']]))

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.0),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.0),
 (78, 0.0),
 (79, 0.0),
 (80, 0.0),
 (81, 0.0),
 (82, 0.0),
 (83, 0.0),
 (

In [50]:
sorted(list(enumerate(cosine_sim[indices['Newlyweds']])), key=lambda x: x[1], reverse=True)

[(4799, 1.0000000000000002),
 (3969, 0.163196383455196),
 (616, 0.16038391928188536),
 (2689, 0.1559273345217069),
 (1576, 0.14360247193668715),
 (2290, 0.13569452086355288),
 (504, 0.13085786584044767),
 (866, 0.12008671226577512),
 (4576, 0.11860285818486715),
 (3025, 0.11588892984671503),
 (3155, 0.10906952890446106),
 (242, 0.1082101771218739),
 (3479, 0.10724969380917379),
 (869, 0.10387261895824548),
 (1071, 0.10387217759656119),
 (2869, 0.09679316909067763),
 (1223, 0.09642437836076359),
 (4641, 0.09405288245375722),
 (2962, 0.08901256793695153),
 (3559, 0.08391116487015626),
 (4591, 0.08326384994868444),
 (1970, 0.0820655909186915),
 (3638, 0.08059309461437097),
 (2688, 0.07872657682277753),
 (1110, 0.07842548044020586),
 (4616, 0.07625223076592522),
 (4584, 0.07555254150626438),
 (3610, 0.07091093890693127),
 (3253, 0.07001088918731717),
 (971, 0.06871964524993948),
 (2795, 0.06737489602719182),
 (1856, 0.06663821029251138),
 (1385, 0.06483418556448954),
 (1949, 0.064277106999

###### # Function that takes in movie title as input and outputs most similar movies

In [52]:
def get_recommendations(title, cosine_sim=cosine_sim):
    
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    similar_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on similarity scores in descending order
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    similar_scores = similar_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in similar_scores]
    
    # Return the top 10 most similar movies
    return df3['title'].iloc[movie_indices]

In [59]:
get_recommendations('Avatar')

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object