# Project - Movie Recommendation

### Importing libraries

In [50]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading data

In [51]:
# Load the movie metadata from the CSV (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv("dataset/movies.csv")
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [52]:
# Report the frame's (rows, columns) so later shapes can be checked against it.
print("Shape of given data: {}".format(df.shape))

Shape of given data: (4803, 24)


### Displaying information about the dataset

In [53]:
# Summarise each column: name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

### Checking for any null or nan values in the dataset

In [54]:
# Count missing values per column across the whole frame.
df.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

**The feature 'homepage' has the most missing values.**

### Extracting relevant features from the given dataset

In [55]:
# Text columns used to describe each movie; they are later concatenated and
# vectorised to compute content similarity.
important_features = ['genres', 'keywords', 'cast', 'director', 'tagline']
print("Important features are: {}".format(important_features))

Important features are: ['genres', 'keywords', 'cast', 'director', 'tagline']


In [56]:
# Preview only the selected text columns.
df[important_features].head()

Unnamed: 0,genres,keywords,cast,director,tagline
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron,Enter the World of Pandora.
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski,"At the end of the world, the adventure begins."
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes,A Plan No One Escapes
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan,The Legend Ends
4,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton,"Lost in our world, found in another."


### Checking for any null values in the selected features

In [57]:
# Count missing values in each selected column; these must be filled before
# the columns can be concatenated as strings.
df[important_features].isna().sum()

genres       28
keywords    412
cast         43
director     30
tagline     844
dtype: int64

**Some null values are present among the selected features.**
### Fixing null values

In [58]:
# Replace missing text in every selected column with an empty string so the
# columns can be joined safely later. Re-running this cell is harmless.
df[important_features] = df[important_features].fillna('')

df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


**After fixing null values, let's check again for any missing values.**

In [59]:
# Verify the fill: every selected column should now report zero missing values.
df[important_features].isnull().sum()

genres      0
keywords    0
cast        0
director    0
tagline     0
dtype: int64

**There are no more missing values left.**
### Combining the important features

In [60]:
# Build one space-separated text "document" per movie from the selected
# columns (order: genres, keywords, tagline, cast, director).
combined_features = (
    df['genres'] + ' '
    + df['keywords'] + ' '
    + df['tagline'] + ' '
    + df['cast'] + ' '
    + df['director']
)

### Converting Text data into Vectors

In [61]:
# TF-IDF with default settings: learns the vocabulary from the combined text.
vectorizer = TfidfVectorizer()

# One row per movie, one column per vocabulary term.
feature_vectors = vectorizer.fit_transform(combined_features)

print("Shape of feature vectors is: {}".format(feature_vectors.shape))

Shape of feature vectors is: (4803, 17318)


### Cosine Similarity for getting similarity score

In [62]:
# Pairwise cosine similarity between all movie vectors. Row i / column j is the
# similarity of movie i to movie j, in the same row order as `df`.
similar_score = cosine_similarity(feature_vectors)
print("Similarity score is:\n{}".format(similar_score))

Similarity score is:
[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [63]:
# Display the shape of the similarity matrix; it should be square, with one
# row and one column per movie.

print("Shape of similarity score is: {}".format(similar_score.shape))

Shape of similarity score is: (4803, 4803)


### Taking the user's favourite movie as input

In [64]:
# Ask for a movie title; a later cell maps it to the closest dataset title.
# NOTE(review): input() blocks unattended "Restart & Run All" execution.
user_movie = input("Enter movie name: ")

Enter movie name: john carter


### List of all movies from the given dataset

In [65]:
# All titles in dataset row order; used as the candidate pool for fuzzy matching.
all_movies_list = df['title'].tolist()
all_movies_list[0:6]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3']

In [66]:
# Sanity check: the title list should have one entry per row of the dataset.
print("Length of movies list is: {}".format(len(all_movies_list)))

Length of movies list is: 4803


In [67]:
# Fuzzy-match the user's text against every title using difflib's defaults.
# The result may be an empty list when nothing is similar enough.
find_matching = difflib.get_close_matches(user_movie, all_movies_list)
print("Matching movie titles are: {}".format(find_matching))

Matching movie titles are: ['John Carter', 'Songcatcher', 'Coach Carter']


### Find the closest match and get the index of its title

In [68]:
# Use the first candidate returned by the fuzzy search as the matched title.
# Fail with a clear message instead of a bare IndexError when the search
# returned no candidates for the user's input.
if not find_matching:
    raise ValueError(
        "No movie title close to {!r} was found in the dataset.".format(user_movie)
    )
close_match = find_matching[0]
print("Close match is: {}".format(close_match))

Close match is: John Carter


In [69]:
# Locate the matched movie's row *position* in `df`. The rows of
# `similar_score` follow df's row order, so a positional lookup is used rather
# than the CSV's 'index' column, which only matches while the file is unsorted
# and unfiltered.
title_mask = (df['title'] == close_match).to_numpy()
# If several rows share the title, the first occurrence is used (as before).
id_matching_movie = int(np.flatnonzero(title_mask)[0])
print("Index of the close match is: {}".format(id_matching_movie))

Index of the close match is: 4


---

### Similarity Score

In [70]:
# Pair each movie's row position with its similarity to the chosen movie,
# producing a list of (position, score) tuples in dataset order.

sim_score = [
    (row_pos, score)
    for row_pos, score in enumerate(similar_score[id_matching_movie])
]
print("Similarity score is:\n{}".format(sim_score))

Similarity score is:
[(0, 0.10702574467235304), (1, 0.03305463079248628), (2, 0.05772600875531061), (3, 0.006717078317903558), (4, 1.0), (5, 0.12989685014764749), (6, 0.008020897515385296), (7, 0.060340258968822585), (8, 0.008184976474383678), (9, 0.03832298281222101), (10, 0.05721297692372175), (11, 0.012383886426888273), (12, 0.014233825346334616), (13, 0.012851166932020926), (14, 0.07529319100526179), (15, 0.0747883106730046), (16, 0.05721539202804143), (17, 0.013616306365014916), (18, 0.07706037615219369), (19, 0.012089070811125913), (20, 0.041694129432257705), (21, 0.011699385312773366), (22, 0.00715716767717989), (23, 0.03928703526113247), (24, 0.04857182590109369), (25, 0.010913900929666012), (26, 0.03631511936724096), (27, 0.10989340697547767), (28, 0.03486350003287453), (29, 0.025926729267293502), (30, 0.03636070766215349), (31, 0.04491082754059965), (32, 0.06711474644290759), (33, 0.05996534685041098), (34, 0.0), (35, 0.03670710779261637), (36, 0.033076033414895116), (37, 0.0

In [71]:
# Sanity check: there should be one (position, score) pair per movie.
print("Length of similarity score is: {}".format(len(sim_score)))

Length of similarity score is: 4803


### Sorting movies based on similarity score

In [72]:
# Rank the (position, score) pairs from most to least similar. Work on a copy
# so `sim_score` keeps its original dataset order.

sorted_similar_movies = list(sim_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(4, 1.0), (2904, 0.18017696511612652), (3158, 0.16107142159805013), (4395, 0.15210617290884296), (4401, 0.15062825815280348), (328, 0.14670677967731258), (111, 0.14617319309725305), (373, 0.1442405267812881), (1473, 0.14421212750841966), (3257, 0.14420846947181268), (2121, 0.1401527127089081), (1373, 0.13976314835137943), (3367, 0.13794651765642174), (4555, 0.1371397458723799), (270, 0.13532476682576292), (1748, 0.13237311082393202), (2407, 0.13149317745862238), (5, 0.12989685014764749), (916, 0.12791675874135178), (921, 0.1262990082329127), (816, 0.1230270378231185), (305, 0.12112629853579467), (3506, 0.12052007685003181), (3378, 0.1195317044310383), (1075, 0.1184241513313847), (2163, 0.1167647400431644), (122, 0.11642573498643403), (2157, 0.11449791498665905), (239, 0.11414981559949217), (1158, 0.11377447636173509), (857, 0.11233376783748897), (27, 0.10989340697547767), (2847, 0.10842726461716617), (2587, 0.10835307982511137), (1959, 0.10765674459223615), (565, 0.10737017065333719),

---

### Top 30 movies based on similarity score

In [74]:
# Number of recommendations to display.
TOP_N = 30

print("Top-{} movies recommended for you are: \n".format(TOP_N))

# Slice to the top entries *before* looping: the previous version walked all
# ranked movies and ran a full boolean-mask scan of `df` for each one, even
# after the last title had been printed.
# The queried movie itself ranks first because its self-similarity is 1.0.
for rank, (row_pos, _score) in enumerate(sorted_similar_movies[:TOP_N], start=1):
    # Similarity rows are positional, so look the title up by position.
    title_from_index = df['title'].iloc[row_pos]
    print("{}. {}".format(rank, title_from_index))

Top-30 movies recommended for you are: 

1. John Carter
2. Heaven is for Real
3. Alien
4. The Specials
5. The Helix... Loaded
6. Finding Nemo
7. Transformers
8. Mission to Mars
9. The Astronaut's Wife
10. American Psycho
11. Max
12. The English Patient
13. The Last Temptation of Christ
14. Enter Nowhere
15. The Martian
16. Notes on a Scandal
17. Sideways
18. Spider-Man 3
19. Daddy's Home
20. We Bought a Zoo
21. George of the Jungle
22. Treasure Planet
23. Don McKay
24. Auto Focus
25. Savages
26. The Covenant
27. X-Men Origins: Wolverine
28. Daybreakers
29. Gravity
30. Lone Survivor


---