# Movie Recommendation System

In [1]:
#Importing the necessary libraries
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the four CSV tables from the working directory into DataFrames
movies, ratings, tags, links = (
    pd.read_csv(csv_name)
    for csv_name in ("movies.csv", "ratings.csv", "tags.csv", "links.csv")
)

In [3]:
# Summarise the movies table: column names, non-null counts and dtypes
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [4]:
# Summarise the ratings table: column names, non-null counts and dtypes
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
# Summarise the tags table: column names, non-null counts and dtypes
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [6]:
# Summarise the links table: column names, non-null counts and dtypes
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [7]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Preview the first rows of the tags table
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [10]:
# Preview the first rows of the links table
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [11]:
# Report the (rows, columns) shape of every loaded table
for label, frame in (('movies ', movies), ('ratings ', ratings),
                     ('tags ', tags), ('links ', links)):
    print(label, frame.shape)

movies  (9742, 3)
ratings  (100836, 4)
tags  (3683, 4)
links  (9742, 3)


# Data Preprocessing

In [12]:
# Count missing values per column of the links table
links.isnull().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [13]:
# Replace every missing value in links with 0
links = links.fillna(0)

In [14]:
# Re-count missing values per column of links after filling
links.isnull().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [15]:
# Cast the tmdbId column to an integer dtype
links = links.astype({'tmdbId': int})

In [16]:
# Preview links after the tmdbId cast
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [17]:
# Attach the imdbId / tmdbId columns to each movie (left join on movieId)
linked = movies.merge(links, how='left', on='movieId')
linked.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [18]:
# Turn the '|' genre separator into spaces so genres tokenise as words.
# '|' is a regex metacharacter and the default of `regex` differs between
# pandas versions, so request a literal replacement explicitly.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

In [19]:
# Number of distinct movie ids in the movies table
movies['movieId'].nunique()

9742

In [20]:
# Number of distinct movie ids that appear in the ratings table
ratings['movieId'].nunique()

9724

In [21]:
# Keep only ratings from users who have rated at least MIN_RATINGS_PER_USER movies
MIN_RATINGS_PER_USER = 50

# Per-row count of ratings made by that row's user (vectorised; no per-group lambda)
ratings_per_user = ratings['userId'].map(ratings['userId'].value_counts())

# .copy() gives an independent frame, so later column drops never touch `ratings`
ratings_f = ratings.loc[ratings_per_user >= MIN_RATINGS_PER_USER].copy()

# Distinct movie ids that still have at least one rating after the user filter
movie_list_rating = ratings_f['movieId'].unique().tolist()

In [22]:
# Percentage of users that survive the minimum-ratings filter
(ratings_f['userId'].nunique() / ratings['userId'].nunique()) * 100

63.114754098360656

In [23]:
# Keep only movies that still have ratings after the user filter
has_rating = movies['movieId'].isin(movie_list_rating)
movies = movies[has_rating]

In [24]:
# Preview the filtered movies table (genres are now space separated)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
# Left-join user tags onto movies: one row per (movie, tag); untagged movies get NaN

mixed = movies.merge(tags, how='left', on='movieId')
mixed.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


In [26]:
# Drop the tag timestamp. errors='ignore' keeps the cell re-runnable:
# an in-place drop raises KeyError once the column is already gone.
mixed = mixed.drop(columns=['timestamp'], errors='ignore')

In [27]:
# Preview the first 10 (movie, tag) rows
mixed.head(10)

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game
5,2,Jumanji (1995),Adventure Children Fantasy,62.0,Robin Williams
6,2,Jumanji (1995),Adventure Children Fantasy,474.0,game
7,3,Grumpier Old Men (1995),Comedy Romance,289.0,moldy
8,3,Grumpier Old Men (1995),Comedy Romance,289.0,old
9,4,Waiting to Exhale (1995),Comedy Drama Romance,,


In [28]:
# Build one text "document" per movie: its tags followed by its genres

# Untagged movies carry NaN after the left join; blank them so they can be joined
mixed = mixed.fillna("")

# Collapse to one row per movie with all of its tags space-joined
tags_per_movie = mixed.groupby('movieId')['tag'].apply(lambda tag_group: ' '.join(tag_group))
mixed = tags_per_movie.to_frame()

Final = movies.merge(mixed, how='left', on='movieId')

# metadata = "<tags> <genres>"
Final['metadata'] = Final['tag'] + ' ' + Final['genres']
Final[['movieId', 'metadata']].head()

Unnamed: 0,movieId,metadata
0,1,pixar pixar fun Adventure Animation Children C...
1,2,fantasy magic board game Robin Williams game A...
2,3,moldy old Comedy Romance
3,4,Comedy Drama Romance
4,5,pregnancy remake Comedy


In [29]:
# Preview the merged table including the tag and metadata columns
Final.head()

Unnamed: 0,movieId,title,genres,tag,metadata
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,pixar pixar fun Adventure Animation Children C...
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,fantasy magic board game Robin Williams game A...
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,moldy old Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,pregnancy remake Comedy


In [30]:
# (rows, columns) of the merged table
Final.shape

(9633, 5)

# Creating a content-based matrix from movie metadata using TF-IDF vectors

In [31]:
# Vectorise each movie's metadata text with TF-IDF, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(Final['metadata'])

# Dense copy of the TF-IDF matrix, rows labelled with Final's index
dense_tfidf = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_tfidf, index=Final.index.tolist())
print(tfidf_df.shape)

(9633, 1675)


In [32]:
# Reduce the TF-IDF features to 200 latent components.
# TruncatedSVD uses a randomized solver by default, so fix the seed to make
# the latent matrix (and everything pickled from it) reproducible across runs.
svd = TruncatedSVD(n_components=200, random_state=42)
latent_matrix = svd.fit_transform(tfidf_df)

In [33]:
# Content-based latent vectors, one row per movie, labelled by title
content_components = latent_matrix[:, :200]
movie_df = pd.DataFrame(content_components, index=Final['title'].tolist())

In [34]:
# (movies, latent components) of the content-based matrix
movie_df.shape

(9633, 200)

# Creating a collaborative-filtering matrix from user ratings using truncated SVD

In [35]:
# Preview the ratings kept after the user filter
ratings_f.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [36]:
# Drop the rating timestamp. errors='ignore' keeps the cell re-runnable:
# an in-place drop raises KeyError once the column is already gone.
ratings_f = ratings_f.drop(columns=['timestamp'], errors='ignore')

In [37]:
# Preview ratings_f after dropping the timestamp column
ratings_f.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [38]:
# Right-join the movie ids onto the filtered ratings (every rating row is kept)
movie_ids = movies[['movieId']]
ratings_f1 = movie_ids.merge(ratings_f, how='right', on='movieId')

In [39]:
# Movie x user rating matrix; a 0 marks "no rating from this user"
rating_table = ratings_f1.pivot(index='movieId', columns='userId', values='rating')
ratings_f2 = rating_table.fillna(0)

In [40]:
# Preview the movie x user rating matrix
ratings_f2.head()

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
4,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Number of distinct movies rated by the retained users
ratings_f['movieId'].nunique()

9633

In [42]:
# ratings_f2 rows are ordered by movieId (pivot sorts its index), while the
# titles come from Final. Put the rating rows into Final's row order by
# movieId instead of assuming the two orders coincide, so each latent vector
# gets the right title and lines up row-for-row with movie_df.
# .loc raises KeyError if a movie in Final has no rating row.
ratings_aligned = ratings_f2.loc[Final['movieId']]

# Fixed seed: the default randomized solver is otherwise non-deterministic
svd = TruncatedSVD(n_components=200, random_state=42)
latent_matrix2 = svd.fit_transform(ratings_aligned)
ratings_df = pd.DataFrame(latent_matrix2[:, 0:200], index=Final['title'].tolist())

In [43]:
# (movies, latent components) of the collaborative matrix
ratings_df.shape

(9633, 200)

# Hybrid recommendation system (content-based + collaborative)

In [44]:
# recommend(): print the movies most similar to the title typed by the user

def recommend(movie, n_results=21):
    """Print the titles most similar to ``movie`` under the hybrid score.

    The query is fuzzy-matched, ignoring case, against the titles that index
    ``movie_df``. Cosine similarities between the matched movie and every
    other movie are computed in the content-based space (``movie_df``) and
    the collaborative space (``ratings_df``), averaged into a hybrid score,
    and the best ``n_results`` titles are printed with their rank.

    The two score vectors are added position by position, so ``movie_df``
    and ``ratings_df`` must list the same movies in the same row order.

    Parameters
    ----------
    movie : str
        Free-text movie name; it does not need to match a title exactly.
    n_results : int, default 21
        How many ranked titles to print.

    Returns
    -------
    None
        Results are printed. If no title is close enough to the query, a
        message is printed and nothing else happens.
    """
    query = str(movie).strip().lower()

    # Map lower-cased title -> original title so the match ignores case.
    # difflib is case-sensitive, and str.capitalize() would lower-case every
    # letter after the first ("Iron Man" -> "Iron man"), weakening the match.
    # Titles are taken from movie_df so any match is a valid row label.
    title_by_lower = {}
    for title in movie_df.index:
        title_by_lower.setdefault(str(title).lower(), title)

    close_matches = difflib.get_close_matches(query, list(title_by_lower))
    if not close_matches:
        # Previously this case raised IndexError on find_close_match[0]
        print("No movie found matching:", movie)
        return
    name = title_by_lower[close_matches[0]]

    # .loc[[name]] always yields a frame; keep its first row so a duplicated
    # title still produces a single (1, n_components) query vector.
    a1 = movie_df.loc[[name]].iloc[[0]].to_numpy()
    a2 = ratings_df.loc[[name]].iloc[[0]].to_numpy()

    # Content-based similarity of every movie to the query movie
    score1 = cosine_similarity(movie_df, a1).reshape(-1)

    # Collaborative similarity of every movie to the query movie
    score2 = cosine_similarity(ratings_df, a2).reshape(-1)

    # Hybrid score: plain average of the two similarities
    hybrid = ((score1 + score2) / 2.0)

    dictDf = {'content': score1, 'collabarative': score2, 'hybrid': hybrid}
    similar = pd.DataFrame(dictDf, index=movie_df.index)
    similar = similar.sort_values('hybrid', ascending=False)

    # Print rank and title of the top matches
    for rank, title in enumerate(similar.index[:n_results]):
        print(rank, title)
     
    
# recommend(movie_name)        

In [45]:
# Ask for a movie title and print its hybrid recommendations.
# NOTE(review): input() waits for the user, so an unattended "Run All" stops here.
movie_name=input("Enter the name of movie: ")
recommend(movie_name)

Enter the name of movie: iron man
0 Iron Man (2008)
1 X-Men: Days of Future Past (2014)
2 Ant-Man (2015)
3 Avengers: Age of Ultron (2015)
4 Deadpool (2016)
5 Incredible Hulk, The (2008)
6 I, Robot (2004)
7 Iron Man 2 (2010)
8 Star Wars: Episode I - The Phantom Menace (1999)
9 Indiana Jones and the Kingdom of the Crystal Skull (2008)
10 Star Trek Beyond (2016)
11 War of the Worlds (2005)
12 Fifth Element, The (1997)
13 X-Men: First Class (2011)
14 Rogue One: A Star Wars Story (2016)
15 Guardians of the Galaxy 2 (2017)
16 Terminator 3: Rise of the Machines (2003)
17 Day After Tomorrow, The (2004)
18 Captain America: The First Avenger (2011)
19 Black Panther (2017)
20 Children of Men (2006)


In [46]:
# Persist the merged movie table; the with-block closes the file handle
with open('movie_dict.pkl', 'wb') as pkl_file:
    pickle.dump(Final.to_dict(), pkl_file)

In [47]:
# Persist the content-based latent matrix; the with-block closes the file handle
with open('movie_df.pkl', 'wb') as pkl_file:
    pickle.dump(movie_df.to_dict(), pkl_file)

In [48]:
# Persist the collaborative latent matrix; the with-block closes the file handle
with open('ratings_df.pkl', 'wb') as pkl_file:
    pickle.dump(ratings_df.to_dict(), pkl_file)

In [49]:

# Persist the movies + links table; the with-block closes the file handle
with open('linked_dict.pkl', 'wb') as pkl_file:
    pickle.dump(linked.to_dict(), pkl_file)