In [1]:
#Importing all the required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
#Reading the datasets downloaded from kaggle
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
#View Column first dataset
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [4]:
#View Column second dataset
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
#As the datasets are saperate so at first we need to cobine them
credits.columns = ['id', 'title', 'cast', 'crew']
movies = movies.merge(credits, on='id')

In [6]:
# The overview column contains description for each movie.
movies['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [7]:
#replace NaN values with empty strings
movies['overview'] = movies['overview'].fillna('')

In [8]:
#using movie descriptions, the keywords associated with the movie and the genre column
def create_join(x):
    return ''.join(x['keywords']) + '' + ''.join(x['genres']) + '' + ''.join(x['overview'])
movies['join'] = movies.apply(create_join, axis=1)

In [9]:
#Creating TF-IDF using scikit-learn
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrices = tfidf.fit_transform(movies['join'])
tfidf_matrices.shape

(4803, 32768)

In [10]:
# Using Cosine Similarity but we can use euclidean or any other method
# instead of cosine_similarity we are using linear_kernel because it is faster works exactly the same
cosine_similarity = linear_kernel(tfidf_matrices,tfidf_matrices)

In [12]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(movies.index, index=credits['title']).drop_duplicates()

In [28]:
def Recommendation_get(title, cosine_similarity = cosine_similarity):
    idx = indices[title]
    similar_scores = list(enumerate(cosine_similarity[idx]))
    similar_scores = sorted(similar_scores, key = lambda x: x[1], reverse = True)
    similar_scores = similar_scores[1:21]
    movie_indices = [i[0] for i in similar_scores]
    return credits['title'].iloc[movie_indices]

In [41]:
title = input(str('Enter a movie title that you watched recently: '))
print("\t")
print("If you like that then you might like these .........")
Recommendation_get(title, cosine_similarity)

Enter a movie title that you watched recently: Titanic
	
If you like that then you might like these .........


104                                        Poseidon
2902                                       Triangle
1269                              Raise the Titanic
818                                Captain Phillips
1          Pirates of the Caribbean: At World's End
310                         In the Heart of the Sea
770                                   Event Horizon
17      Pirates of the Caribbean: On Stranger Tides
2428                              Brooklyn's Finest
3696                    Four Weddings and a Funeral
1170                        The Talented Mr. Ripley
2442                                Southland Tales
106                                 Shrek the Third
2217                       Everyone Says I Love You
216                                      Life of Pi
2935                                       Brooklyn
2493                                  The Immigrant
3568                   20,000 Leagues Under the Sea
3746                              The Boy Next Door
1291        