In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
mr_df = pd.read_csv("movies.csv")

In [3]:
selected_feature = ["genres","keywords","tagline","cast","director"]
print(selected_feature)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [4]:
for f in selected_feature:
  mr_df[f] = mr_df[f].fillna("")

In [5]:
combined_feature = mr_df["genres"]+' '+mr_df["keywords"]+' '+mr_df["tagline"]+' '+mr_df["cast"]+mr_df["director"]
print(combined_feature)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [6]:
vector = TfidfVectorizer()

In [7]:
feature_vector = vector.fit_transform(combined_feature)
print(feature_vector)

  (0, 208)	0.07667385108184376
  (0, 289)	0.08800104830707552
  (0, 6241)	0.1083630860656862
  (0, 16345)	0.10110130418423781
  (0, 6437)	0.10110130418423781
  (0, 4357)	0.2086788913449303
  (0, 3615)	0.21664083919579374
  (0, 6923)	0.1606391519765174
  (0, 17262)	0.33130376319063676
  (0, 19787)	0.12241864673585882
  (0, 3806)	0.24348426965339803
  (0, 17147)	0.2086788913449303
  (0, 5865)	0.23437015044361997
  (0, 18275)	0.06921925370543773
  (0, 20389)	0.12507033381973537
  (0, 13517)	0.08827534630310274
  (0, 13860)	0.26544401841252097
  (0, 16062)	0.15564527461690644
  (0, 20398)	0.23860058795491665
  (0, 20721)	0.19702892145320783
  (0, 16029)	0.21898867021233073
  (0, 16879)	0.20091311266965015
  (0, 19900)	0.19827148152003798
  (0, 17532)	0.16181995525842374
  (0, 10559)	0.22730069036139045
  :	:
  (4801, 423)	0.19056341009568284
  (4801, 5734)	0.2563183496274323
  (4801, 20694)	0.29933113097807007
  (4801, 16621)	0.28905348514055085
  (4801, 20559)	0.3138166592040448
  (4801, 

In [8]:
similarity = cosine_similarity(feature_vector)

In [9]:
print(similarity.shape)

(4803, 4803)


In [10]:
movie_name = input("Enter The Movie Name ")
list_of_movies = mr_df["title"].tolist()
find_close_match = difflib.get_close_matches(movie_name,list_of_movies)
close_match = find_close_match[0]
title_with_index = mr_df[mr_df.title == close_match]["index"].values[0]
similarity_score = list(enumerate(similarity[title_with_index]))
sort_similar_movie = sorted(similarity_score,key=lambda x:x[1],reverse=True)
print("movie suggested for you /n")
i=1
for movie in sort_similar_movie:
  index = movie[0]
  title_from_index = mr_df[mr_df.index == index]['title'].values[0]
  if(i<20):
    print(i,'.',title_from_index)
    i+=1

Enter The Movie Name  iron man


movie suggested for you /n
1 . Iron Man
2 . Iron Man 2
3 . Iron Man 3
4 . Avengers: Age of Ultron
5 . The Avengers
6 . Captain America: Civil War
7 . Captain America: The Winter Soldier
8 . Ant-Man
9 . X-Men
10 . X-Men: Apocalypse
11 . X2
12 . The Incredible Hulk
13 . The Helix... Loaded
14 . X-Men: Days of Future Past
15 . X-Men: First Class
16 . Captain America: The First Avenger
17 . Deadpool
18 . Guardians of the Galaxy
19 . Thor: The Dark World


In [11]:
import pickle

# Save
pickle.dump(similarity, open('similarity.pkl', 'wb'))
mr_df.to_pickle('movies.pkl')
