### Content Based Recommendation System

### Read the Dataset `movies_metadata.csv`

In [19]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
movies_df = pd.read_csv('movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
movies_df.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
movies_df.shape

(45466, 24)

In [5]:
movies_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

### Create a new column with name 'description' combining `'overview' and 'tagline'` columns in the given dataset

In [6]:
movies_df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [7]:
movies_df['description'] = movies_df['tagline'] + movies_df['overview']

In [8]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'description'],
      dtype='object')

### Lets drop the null values in `description` column

In [9]:
movies_df['description'].isnull().sum()

25062

In [10]:
movies_df.dropna(subset=['description'], inplace=True)

In [11]:
movies_df.shape

(20404, 25)

### Keep the first occurance and drop duplicates of each title in column `title`

In [12]:
movies_df['title'].drop_duplicates(inplace=True)

In [13]:
movies_df.shape

(20404, 25)

### As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [14]:
movies_df.reset_index(inplace=True)

In [15]:
movies_df.head()

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
0,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,Roll the dice and unleash the excitement!When ...
1,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Still Yelling. Still Fighting. Still Ready for...
2,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,Friends are the people who let you be yourself...
3,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just When His World Is Back To Normal... He's ...
4,5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,...,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,A Los Angeles Crime SagaObsessive master thief...


### Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

In [17]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tf_idf_matrix = tf.fit_transform(movies_df['description'])
print(tf_idf_matrix)

  (0, 711707)	0.1036894980542005
  (0, 345709)	0.1036894980542005
  (0, 890723)	0.1036894980542005
  (0, 436387)	0.1036894980542005
  (0, 910839)	0.1036894980542005
  (0, 896240)	0.1036894980542005
  (0, 843113)	0.1036894980542005
  (0, 425366)	0.1036894980542005
  (0, 392956)	0.1036894980542005
  (0, 412348)	0.1036894980542005
  (0, 505022)	0.1036894980542005
  (0, 32533)	0.1036894980542005
  (0, 905169)	0.1036894980542005
  (0, 632355)	0.1036894980542005
  (0, 1192603)	0.1036894980542005
  (0, 7923)	0.1036894980542005
  (0, 424730)	0.1036894980542005
  (0, 536298)	0.1036894980542005
  (0, 1088910)	0.1036894980542005
  (0, 22346)	0.1036894980542005
  (0, 32490)	0.1036894980542005
  (0, 545151)	0.1036894980542005
  (0, 1120172)	0.1036894980542005
  (0, 1183850)	0.1036894980542005
  (0, 657943)	0.1036894980542005
  :	:
  (20403, 1188186)	0.1824791909744087
  (20403, 497555)	0.1824791909744087
  (20403, 1170264)	0.1824791909744087
  (20403, 425619)	0.1824791909744087
  (20403, 837315)	0.

In [18]:
tf_idf_matrix.shape

(20404, 1202356)

### create cosine similarity matrix

In [20]:
cosine_similarity = linear_kernel(tf_idf_matrix, tf_idf_matrix)

In [21]:
cosine_similarity.shape

(20404, 20404)

### Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

In [71]:
def recommend(title):
    id = movies_df.loc[movies_df['title']==title].index
    top_movies_index = np.flip(np.argsort(cosine_similarity[id[0],]),axis=0)[0:10]
    return movies_df['title'].iloc[top_movies_index]

### Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [72]:
recommend('The Godfather')

611               The Godfather
865      The Godfather: Part II
16565          Honor Thy Father
12436                The Family
13078                Blood Ties
3256                       Made
4047         Johnny Dangerously
19387             Live by Night
6008                       Fury
26               Shanghai Triad
Name: title, dtype: object

In [73]:
recommend('The Dark Knight')

8307                             The Dark Knight
10957                      The Dark Knight Rises
112                               Batman Forever
988                               Batman Returns
9772                  Batman: Under the Red Hood
451                                       Batman
14018                          Batman vs Dracula
19508                      The Lego Batman Movie
11806    Batman: The Dark Knight Returns, Part 2
6502          Batman Beyond: Return of the Joker
Name: title, dtype: object