Importing the dependencies

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [None]:
# loading the data from the csv file into a pandas DataFrame
movies_data = pd.read_csv('/content/drive/MyDrive/archive (1) 2/tmdb_5000_movies.csv')

In [None]:
# Expose the row position as an explicit 'index' column, then preview the first 5 rows.
# The guard keeps this cell safe to re-run: calling reset_index() again would add a
# 'level_0' column, and a third call would raise ValueError.
if 'index' not in movies_data.columns:
    movies_data = movies_data.reset_index()
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [None]:
# column names of the data frame

movies_data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
# selecting the relevant features for recommendation
# (the text columns that are concatenated into one string per movie further down)

selected_features = ['genres','keywords','tagline']
print(selected_features)

['genres', 'keywords', 'tagline']


In [None]:
# replacing the null values in the selected columns with empty strings,
# handling all of the columns in a single assignment

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [None]:
# combining the 3 selected features (genres, keywords, tagline) into one
# space-separated string per movie

combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']

In [None]:
# preview the combined text; pandas abbreviates long Series when printing
print(combined_features)

0       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
1       [{"id": 12, "name": "Adventure"}, {"id": 14, "...
2       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
3       [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
                              ...                        
4798    [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4799    [{"id": 35, "name": "Comedy"}, {"id": 10749, "...
4800    [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...
4801                       [] [] A New Yorker in Shanghai
4802    [{"id": 99, "name": "Documentary"}] [{"id": 15...
Length: 4803, dtype: object


In [None]:
# converting the text data to feature vectors
# (the TF-IDF vectorizer is created with its default settings)

vectorizer = TfidfVectorizer()

In [None]:
# learn the vocabulary from the combined text and build one TF-IDF row per movie
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# sparse output: each printed line is a (row, column) position followed by the stored weight
print(feature_vectors)

  (0, 15820)	0.13067136923706554
  (0, 15625)	0.042190581271209615
  (0, 19187)	0.06115902867437435
  (0, 18221)	0.034745900795906386
  (0, 12610)	0.11947677021689071
  (0, 7811)	0.07852455953648513
  (0, 5739)	0.07884757815388245
  (0, 17580)	0.09812989462371097
  (0, 10170)	0.051270797096986086
  (0, 15132)	0.09231570917121786
  (0, 5458)	0.12412294859965346
  (0, 16645)	0.11947677021689071
  (0, 16235)	0.08169455710568298
  (0, 5012)	0.13067136923706554
  (0, 10213)	0.09308492085598188
  (0, 2739)	0.1353175476198283
  (0, 10026)	0.0943121230774711
  (0, 14775)	0.054074000786992364
  (0, 1066)	0.13067136923706554
  (0, 10570)	0.08003190699608953
  (0, 1030)	0.09708757217654103
  (0, 17540)	0.08614731530249232
  (0, 836)	0.09348371624094821
  (0, 14916)	0.0915814739862989
  (0, 499)	0.13067136923706554
  :	:
  (4800, 18127)	0.1829383057392132
  (4800, 12382)	0.05454819765741068
  (4800, 4006)	0.054671196139331954
  (4800, 10374)	0.1549226921988739
  (4800, 14775)	0.10535660066722898
 

Cosine Similarity

In [None]:
# getting the similarity scores using cosine similarity
# (called with a single argument, so every row is compared with every row, itself included)

similarity = cosine_similarity(feature_vectors)

In [None]:
# show the (abbreviated) pairwise similarity matrix
print(similarity)

[[1.         0.33622781 0.26096609 ... 0.26614764 0.         0.18793274]
 [0.33622781 1.         0.25454511 ... 0.25652912 0.         0.17182781]
 [0.26096609 0.25454511 1.         ... 0.19299489 0.         0.14120104]
 ...
 [0.26614764 0.25652912 0.19299489 ... 1.         0.         0.14646562]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.18793274 0.17182781 0.14120104 ... 0.14646562 0.         1.        ]]


In [None]:
# dimensions of the similarity matrix: (rows, columns)
print(similarity.shape)

(4803, 4803)


Getting the movie name from the user

Movie Recommendation System

In [None]:
import json


def _parse_keyword_names(raw):
    """Return the list of keyword names encoded in one raw 'keywords' cell.

    ``raw`` is expected to be a JSON string holding a list of objects with a
    'name' key. Values that are not strings are returned unchanged (they are
    taken to be already parsed, which makes re-running this cell harmless),
    and blank strings yield an empty list instead of a JSON decoding error.
    """
    if not isinstance(raw, str):
        return raw
    if not raw.strip():
        return []
    return [entry['name'] for entry in json.loads(raw)]


# Replace the whole column in one assignment. The element-wise chained form
# `movies_data['keywords'][i] = ...` triggers SettingWithCopyWarning and is not
# guaranteed to write through to the DataFrame.
movies_data['keywords'] = movies_data['keywords'].apply(_parse_keyword_names)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_data['keywords'][i]=json.loads(movies_data['keywords'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_data['keywords'][i] = [d['name'] for d in movies_data['keywords'][i]]


In [None]:
import difflib

def suggest_similar_movies(keyword, max_results=29):
    """Print the titles of the movies most similar to the best keyword match.

    The query is compared against every movie's keyword list with
    ``difflib.get_close_matches``. The movie whose keyword list matches best
    becomes the anchor, and all movies are then ranked by their similarity
    score to that anchor (the anchor itself is therefore listed first).

    Relies on two module-level objects: ``movies_data`` (a DataFrame with
    'keywords' and 'title' columns, where each 'keywords' value is a list of
    strings) and ``similarity`` (a square matrix whose rows and columns follow
    the row order of ``movies_data``).

    Parameters
    ----------
    keyword : list of str or str
        Keywords describing the wanted movie, e.g. ``['future']``. A single
        string is treated as a one-element list.
    max_results : int, default 29
        Maximum number of titles to print. The default matches the number of
        titles this function has always printed.

    Returns
    -------
    None
        Results are printed. If no keyword list is close enough to the query,
        a short message is printed instead of raising.
    """
    query = [keyword] if isinstance(keyword, str) else list(keyword)
    list_of_all_keywords = movies_data['keywords'].tolist()

    # The match does not depend on the row being visited, so compute it once
    # and keep only the best-scoring keyword list.
    close_matches = (
        difflib.get_close_matches(query, list_of_all_keywords, n=1) if query else []
    )
    if not close_matches:
        print(f'No movie with keywords close to {query!r} was found.')
        return

    # Anchor on the movie that owns the matched keyword list (the match itself,
    # not whichever row happened to be iterated first).
    index_of_the_movie = list_of_all_keywords.index(close_matches[0])

    similarity_score = enumerate(similarity[index_of_the_movie])
    sorted_similar_movies = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)

    print('Movies suggested for you: \n')

    # Similarity rows are positional, so look titles up by position.
    titles = movies_data['title']
    top_movies = sorted_similar_movies[:max(max_results, 0)]
    for rank, (movie_position, _score) in enumerate(top_movies, start=1):
        print(f"{rank}. {titles.iloc[movie_position]}")

# Example usage: the argument is a list of keyword strings (not a movie title).
suggest_similar_movies(['future',])


Movies suggested for you: 

1. Avatar
2. Star Trek Into Darkness
3. Alien³
4. Aliens
5. Alien
6. Planet of the Apes
7. The Fifth Element
8. Treasure Planet
9. Moonraker
10. Southland Tales
11. Cargo
12. Titan A.E.
13. Event Horizon
14. Interstellar
15. Hav Plenty
16. Meet Dave
17. Armageddon
18. Lockout
19. Brooklyn's Finest
20. Silent Running
21. Caravans
22. Battle: Los Angeles
23. Austin Powers: The Spy Who Shagged Me
24. Delgo
25. Tracker
26. A Monster in Paris
27. Star Trek
28. Spaceballs
29. Terminator Genisys


In [None]:
import pickle

# Persist the movie table and the similarity matrix for later reuse.
# The `with` blocks close each file handle as soon as its dump finishes,
# so the data is flushed to disk and no descriptor is left open.
with open('data.pkl', 'wb') as data_file:
    pickle.dump(movies_data, data_file)
with open('similarity1.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
pickle.dump(similarity,open('similarity1.pkl','wb'))