In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Read the two TMDB tables (credits first, then movies) from the local
# 'tmdb' directory, relative to the notebook's working directory.
credits, movies = map(
    pd.read_csv,
    ('tmdb/tmdb_5000_credits.csv', 'tmdb/tmdb_5000_movies.csv'),
)

In [3]:
# Show the column names of the credits table.
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [4]:
# Show the column names of the movies table.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
# The credits table calls its key 'movie_id'; rename it to 'id' so both tables
# share the join column, then attach cast/crew to every movie row.
# Both tables carry a 'title' column, so the merged frame keeps them as the
# pandas default suffixed pair 'title_x' / 'title_y'.
credits = credits.rename(columns={'movie_id': 'id'})
movies = pd.merge(movies, credits, on='id')

In [6]:
# Preview the first five plot overviews.
movies['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [7]:
# Replace missing overviews with an empty string so later text processing
# always receives a str; only the 'overview' column is touched.
movies = movies.fillna({'overview': ''})

In [8]:
def _soup_part(value):
    """Return one row field as plain text suitable for the soup."""
    # A string is already text; joining it would operate on single characters.
    if isinstance(value, str):
        return value
    # None and float NaN both mean "no data" for this field.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    # Any other iterable (e.g. a list of names) becomes space-separated words.
    return ' '.join(str(item) for item in value)


def create_soup(x):
    """Combine keywords, genres and overview of one row into a single string.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'keywords', 'genres' and 'overview'. Each value
        may be a string, an iterable of items, or missing (None / NaN).

    Returns
    -------
    str
        The non-empty fields separated by single spaces, so the last word of
        one field can never fuse with the first word of the next.
    """
    parts = (_soup_part(x[column]) for column in ('keywords', 'genres', 'overview'))
    return ' '.join(part for part in parts if part)
# Build the combined text for every movie, one row at a time, and store it in
# a new 'soup' column.
movies['soup'] = movies.apply(create_soup, axis='columns')

In [9]:
# Turn each movie's soup text into a TF-IDF weighted term vector, ignoring
# scikit-learn's built-in English stop-word list.
tfidf=TfidfVectorizer(stop_words='english')
# Learn the vocabulary and vectorise all soups in one pass.
tfidf_matrix=tfidf.fit_transform(movies['soup'])
# Displayed as (number of documents, number of vocabulary terms).
tfidf_matrix.shape

(4803, 32768)

In [10]:
# Pairwise dot products between all TF-IDF rows. The vectorizer above keeps
# its default L2 normalisation, so each dot product is a cosine similarity.
# With the second argument omitted, linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)


In [11]:
# Lookup table: original title -> row label in `movies`.
# NOTE(review): the labels are later used as positions into the similarity
# matrix, which assumes `movies` still has its default 0..n-1 index - confirm.
indices = pd.Series(movies.index, index=movies['original_title'])
# Several films can share a title. Series.drop_duplicates() would compare the
# *values* (row labels, already unique) and remove nothing, so de-duplicate on
# the index instead and keep the first row for each title. This guarantees a
# lookup returns one scalar rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [12]:
def get_recommendations(original_title,cosine_sim=cosine_sim,top_n=10):
    """Return the titles of the films most similar to ``original_title``.

    Parameters
    ----------
    original_title : str
        Title to look up in the module-level ``indices`` table.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of film ``i``
        against every film. Defaults to the matrix that existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` values of the best matches, most similar first,
        indexed by their row label in ``movies``.

    Raises
    ------
    KeyError
        If ``original_title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    idx = indices[original_title]
    # A title shared by several films yields several positions; use the first.
    if not np.isscalar(idx):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: equal scores keep ascending row order, exactly
    # like sorted(..., reverse=True) over enumerate().
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried film explicitly rather than assuming it sorts first;
    # a film with identical text, or an all-zero row, can tie with it.
    ranked = ranked[ranked != idx][:top_n]
    return movies['original_title'].iloc[ranked]

In [15]:
# Example: the films whose soup text is most similar to 'Avatar'.
get_recommendations('Avatar',cosine_sim)

2403                 Aliens
838                  Alien³
373         Mission to Mars
305         Treasure Planet
322       The Fifth Element
1531              Moonraker
278      Planet of the Apes
582     Battle: Los Angeles
3158                  Alien
95             Interstellar
Name: original_title, dtype: object