# Movie recommender with all approaches

In [33]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

Loading csv files

In [34]:
#Reading the .csv files into dataframe
movies=pd.read_csv("/content/drive/MyDrive/movies.csv")
tags=pd.read_csv("/content/drive/MyDrive/tags.csv")
ratings=pd.read_csv("/content/drive/MyDrive/ratings.csv")
links=pd.read_csv("/content/drive/MyDrive/links.csv")


In [35]:
#Printing first few lines of the dataframe so as to get an overview of the files
print(movies.head())
print(tags.head())
print(ratings.head())
print(links.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2  

Data Preprocessing

In [36]:
#Checking if there is any null values present
print(tags.isnull().sum())
print(movies.isnull().sum())
print(ratings.isnull().sum())
print(links.isnull().sum())

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [37]:
#Selecting relevant columns from tags dataframe and ratings dataframe
tags=tags[['userId','movieId','tag']]
ratings=ratings[['userId','movieId','rating']]


Data Cleaning

In [38]:
#converting genres column of movies dataframe from string into list by splitting on |
movies['genres']=movies['genres'].apply(lambda x:x.split('|'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [39]:
#Removing Whitespaces from the element of tag column
tags['tag']=tags['tag'].apply(lambda x: ''.join(x.split()))
tags.head()


Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highlyquotable
2,2,60756,willferrell
3,2,89774,Boxingstory
4,2,89774,MMA


In [40]:
#Combining all the tags given to a movie and creating a new dataframe
combined_tag=tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
combined_tag.head()

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magicboardgame RobinWilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [41]:
#Creating a new dataframe merged_df which contains movies,combined_tag dataframe
merged_df=pd.merge(movies, combined_tag, on='movieId', how='left')
merged_df.head()


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",pixar pixar fun
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",fantasy magicboardgame RobinWilliams game
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",moldy old
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",
4,5,Father of the Bride Part II (1995),[Comedy],pregnancy remake


In [42]:
#Merging the column genres and tag
merged_df['description'] = merged_df['genres'].apply(lambda x: ' '.join(x)) + ' ' + merged_df['tag'].fillna('')
merged_df.drop(columns=['genres', 'tag'], inplace=True)
merged_df.head()

Unnamed: 0,movieId,title,description
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...
2,3,Grumpier Old Men (1995),Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake


In [43]:
#Adding tmdb id to the dataframe
merged_df["tmdbId"]=links["tmdbId"]

In [44]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,862.0
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...,8844.0
2,3,Grumpier Old Men (1995),Comedy Romance moldy old,15602.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake,11862.0


Performing data cleaning on merged_df


In [45]:
#Converting description column data to lower case
merged_df["description"]=merged_df["description"].apply(lambda x:x.lower())
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),adventure animation children comedy fantasy pi...,862.0
1,2,Jumanji (1995),adventure children fantasy fantasy magicboardg...,8844.0
2,3,Grumpier Old Men (1995),comedy romance moldy old,15602.0
3,4,Waiting to Exhale (1995),comedy drama romance,31357.0
4,5,Father of the Bride Part II (1995),comedy pregnancy remake,11862.0


In [46]:
#Removing repetitive words from each description element
def remove_repetitive_words(description):
    words = description.split()
    unique_words = list(set(words))
    return ' '.join(unique_words)

# Apply the function to the description column
merged_df['description'] = merged_df['description'].apply(remove_repetitive_words)

In [47]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),fun comedy fantasy pixar adventure children an...,862.0
1,2,Jumanji (1995),robinwilliams magicboardgame fantasy game adve...,8844.0
2,3,Grumpier Old Men (1995),old comedy moldy romance,15602.0
3,4,Waiting to Exhale (1995),drama comedy romance,31357.0
4,5,Father of the Bride Part II (1995),pregnancy remake comedy,11862.0


In [48]:

def remove_year(title):
    return re.sub(r'\s*\(\d{4}\)', '', title).strip()

# Preprocess movie titles
merged_df['title']=merged_df['title'].apply(remove_year)
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story,fun comedy fantasy pixar adventure children an...,862.0
1,2,Jumanji,robinwilliams magicboardgame fantasy game adve...,8844.0
2,3,Grumpier Old Men,old comedy moldy romance,15602.0
3,4,Waiting to Exhale,drama comedy romance,31357.0
4,5,Father of the Bride Part II,pregnancy remake comedy,11862.0


Different Approaches

K means clustering

In [49]:
#Finding unique values present in genre columns of movies dataframe
all_genres = [genre for sublist in movies['genres'] for genre in sublist]

# Get unique values using set
unique_genres = set(all_genres)
print(unique_genres)

{'Adventure', 'Romance', 'Thriller', 'Drama', 'Crime', 'Documentary', 'Musical', 'Fantasy', 'IMAX', 'Animation', 'Film-Noir', 'Children', 'Sci-Fi', 'Comedy', 'Action', 'War', 'Horror', 'Western', '(no genres listed)', 'Mystery'}


In [50]:

# Convert genres column into binary matrix
mlb = MultiLabelBinarizer()
genres_matrix = mlb.fit_transform(movies['genres'])
genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)

# Number of clusters is the total number of unique genres
k = len(mlb.classes_)

# Apply k-means clustering
kmeans = KMeans(n_clusters=k,n_init=10)
kmeans.fit(genres_df)

# Assign cluster labels to the DataFrame
movies['cluster'] = kmeans.labels_

def recommend_top_movies(movie_title, df, top_n=5):
    # Find cluster label of the given movie
    cluster_label = df[df['title'] == movie_title]['cluster'].values[0]

    # Get other movies in the same cluster
    cluster_movies = df[df['cluster'] == cluster_label]

    # Compute cosine similarity between the given movie and other movies in the cluster
    movie_index = cluster_movies.index[0]
    similarities = cosine_similarity(genres_df.iloc[movie_index].values.reshape(1, -1), genres_df.iloc[cluster_movies.index])

    # Sort movies by similarity score (excluding the given movie itself)
    similar_movies = sorted(list(zip(cluster_movies['title'], similarities[0])), key=lambda x: x[1], reverse=True)[1:]

    # Recommend top N similar movies
    top_similar_movies = similar_movies[:top_n]

    return top_similar_movies

movie_title="Now and Then (1995)"
# Get the recommended movies
top_similar_movies = recommend_top_movies(movie_title, movies, top_n=5)

# Print the recommended movie titles
print("Top 5 recommended movies for '{}':".format(movie_title))
for idx, (title, _) in enumerate(top_similar_movies, start=1):
    print("{}. {}".format(idx, title))



Top 5 recommended movies for 'Now and Then (1995)':
1. Othello (1995)
2. Dangerous Minds (1995)
3. Cry, the Beloved Country (1995)
4. Restoration (1995)
5. Georgia (1995)


In [51]:
#Creating an object of porter stemmer class
ps=PorterStemmer()

In [52]:
#Function for performing stemming
def stem(text):
  y=[]
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join (y)

In [53]:
#Applying stemming to description column
merged_df["description"]=merged_df["description"].apply(stem)

In [54]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story,fun comedi fantasi pixar adventur children anim,862.0
1,2,Jumanji,robinwilliam magicboardgam fantasi game advent...,8844.0
2,3,Grumpier Old Men,old comedi moldi romanc,15602.0
3,4,Waiting to Exhale,drama comedi romanc,31357.0
4,5,Father of the Bride Part II,pregnanc remak comedi,11862.0


Using count vectorizer and cosine similarity

In [55]:
#Initialize CountVectorizer to convert text data into numerical vectors
cv=CountVectorizer(max_features=5000,stop_words="english") #stop words=eng removes english stop words


In [56]:
#The fit_transform() method converts the text data into a sparse matrix representation where each row corresponds to a movie
vectors=cv.fit_transform(merged_df["description"]).toarray()

In [57]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [58]:
cv.get_feature_names_out()

array(['06oscarnominatedbestmovie', '1900', '1920', ..., 'zoekazan',
       'zombi', 'zooeydeschanel'], dtype=object)

In [59]:
#Finding cosine similarity
similar=cosine_similarity(vectors)

In [60]:
#function to recommend movie
def recommend_movie(movie):
  movie_index=merged_df[merged_df['title']==movie].index[0]
  dist=similar[movie_index]
  movie_list=sorted(list(enumerate(dist)),reverse=True,key=lambda x:x[1])[1:6]
  for i in movie_list:
    print(merged_df.iloc[i[0]].title)

In [61]:
recommend_movie('Big Daddy')

Grown Ups 2
Four Rooms
Ace Ventura: When Nature Calls
Bio-Dome
Friday


Using DBSCAN

In [62]:

# Step 1: Vectorize movie descriptions
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(merged_df["description"]).toarray()

# Step 2: Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.25, min_samples=20)  # Adjust parameters as needed
cluster_labels = dbscan.fit_predict(vectors)

# Step 3: Define a function to recommend movies from the same cluster
def recommend_movie(movie):
    # Find the index of the movie in 'merged_df'
    movie_index = merged_df[merged_df['title'] == movie].index[0]

    # Get the cluster label for the given movie
    movie_cluster = cluster_labels[movie_index]

    # Find other movies in the same cluster
    cluster_movies = merged_df[cluster_labels == movie_cluster]

    # Exclude the given movie from recommendations
    cluster_movies = cluster_movies[cluster_movies['title'] != movie]

    # Print the titles of recommended movies
    for title in cluster_movies['title'].head(5):  # Print top 5 recommended movies
        print(title)

# Example usage: recommend movies similar to "Toy Story"
recommend_movie("Toy Story")


Jumanji
Grumpier Old Men
Father of the Bride Part II
Sabrina
American President, The


Using agglomerative clustering

In [63]:

# Vectorize movie descriptions
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(merged_df["description"]).toarray()

# Applying Agglomerative Clustering
agglomerative = AgglomerativeClustering(n_clusters=5)  # Adjust parameters as needed
cluster_labels = agglomerative.fit_predict(vectors)

# Defining a function to recommend movies from the same cluster
def recommend_movie(movie):
    # Find the index of the movie in 'merged_df'
    movie_index = merged_df[merged_df['title'] == movie].index[0]

    # Get the cluster label for the given movie
    movie_cluster = cluster_labels[movie_index]

    # Find other movies in the same cluster
    cluster_movies = merged_df[cluster_labels == movie_cluster]

    # Exclude the given movie from recommendations
    cluster_movies = cluster_movies[cluster_movies['title'] != movie]

    # Print the titles of recommended movies
    for title in cluster_movies['title'].head(5):  # Print top 5 recommended movies
        print(title)

# Example usage: recommend movies similar to "Toy Story"
recommend_movie("Toy Story")

Jumanji
Tom and Huck
Balto
Four Rooms
Ace Ventura: When Nature Calls


# Dumping using pickle.dump

This section is for dumping files which we will be using in making website

In [64]:
import pickle

In [65]:
pickle.dump(merged_df.to_dict(),open("movies_web.pkl","wb"))

In [66]:
pickle.dump(similar,open("similar_web.pkl","wb"))