# Web Demo




In [3]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

Loading csv files

In [4]:
#Reading the .csv files into dataframe
movies=pd.read_csv("/content/drive/MyDrive/movies.csv")
tags=pd.read_csv("/content/drive/MyDrive/tags.csv")
ratings=pd.read_csv("/content/drive/MyDrive/ratings.csv")
links=pd.read_csv("/content/drive/MyDrive/links.csv")


In [5]:
#Printing first few lines of the dataframe so as to get an overview of the files
print(movies.head())
print(tags.head())
print(ratings.head())
print(links.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2  

Data Preprocessing

In [6]:
#Checking if there is any null values present
print(tags.isnull().sum())
print(movies.isnull().sum())
print(ratings.isnull().sum())
print(links.isnull().sum())

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [7]:
#Selecting relevant columns from tags dataframe and ratings dataframe
tags=tags[['userId','movieId','tag']]
ratings=ratings[['userId','movieId','rating']]


Data Cleaning

In [8]:
#converting genres column of movies dataframe from string into list by splitting on |
movies['genres']=movies['genres'].apply(lambda x:x.split('|'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [9]:
#Removing Whitespaces from the element of tag column
tags['tag']=tags['tag'].apply(lambda x: ''.join(x.split()))
tags.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['tag']=tags['tag'].apply(lambda x: ''.join(x.split()))


Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highlyquotable
2,2,60756,willferrell
3,2,89774,Boxingstory
4,2,89774,MMA


In [10]:
#Combining all the tags given to a movie and creating a new dataframe
combined_tag=tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
combined_tag.head()

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magicboardgame RobinWilliams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [11]:
#Creating a new dataframe merged_df which contains movies,combined_tag dataframe
merged_df=pd.merge(movies, combined_tag, on='movieId', how='left')
merged_df.head()


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",pixar pixar fun
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",fantasy magicboardgame RobinWilliams game
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",moldy old
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",
4,5,Father of the Bride Part II (1995),[Comedy],pregnancy remake


In [12]:
#Merging the column genres and tag
merged_df['description'] = merged_df['genres'].apply(lambda x: ' '.join(x)) + ' ' + merged_df['tag'].fillna('')
merged_df.drop(columns=['genres', 'tag'], inplace=True)
merged_df.head()

Unnamed: 0,movieId,title,description
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...
2,3,Grumpier Old Men (1995),Comedy Romance moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake


In [13]:
#Adding tmdb id to the dataframe
merged_df["tmdbId"]=links["tmdbId"]

In [14]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,862.0
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magicboardg...,8844.0
2,3,Grumpier Old Men (1995),Comedy Romance moldy old,15602.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy pregnancy remake,11862.0


Performing data cleaning on merged_df


In [15]:
#Converting description column data to lower case
merged_df["description"]=merged_df["description"].apply(lambda x:x.lower())
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),adventure animation children comedy fantasy pi...,862.0
1,2,Jumanji (1995),adventure children fantasy fantasy magicboardg...,8844.0
2,3,Grumpier Old Men (1995),comedy romance moldy old,15602.0
3,4,Waiting to Exhale (1995),comedy drama romance,31357.0
4,5,Father of the Bride Part II (1995),comedy pregnancy remake,11862.0


In [16]:
#Removing repetitive words from each description element
def remove_repetitive_words(description):
    words = description.split()
    unique_words = list(set(words))
    return ' '.join(unique_words)

# Apply the function to the description column
merged_df['description'] = merged_df['description'].apply(remove_repetitive_words)

In [17]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story (1995),adventure fantasy fun pixar animation comedy c...,862.0
1,2,Jumanji (1995),adventure robinwilliams fantasy magicboardgame...,8844.0
2,3,Grumpier Old Men (1995),old romance comedy moldy,15602.0
3,4,Waiting to Exhale (1995),romance comedy drama,31357.0
4,5,Father of the Bride Part II (1995),pregnancy comedy remake,11862.0


In [18]:

def remove_year(title):
    return re.sub(r'\s*\(\d{4}\)', '', title).strip()

# Preprocess movie titles
merged_df['title']=merged_df['title'].apply(remove_year)
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story,adventure fantasy fun pixar animation comedy c...,862.0
1,2,Jumanji,adventure robinwilliams fantasy magicboardgame...,8844.0
2,3,Grumpier Old Men,old romance comedy moldy,15602.0
3,4,Waiting to Exhale,romance comedy drama,31357.0
4,5,Father of the Bride Part II,pregnancy comedy remake,11862.0


In [19]:
#Creating an object of porter stemmer class
ps=PorterStemmer()

In [20]:
#Function for performing stemming
def stem(text):
  y=[]
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join (y)

In [21]:
#Applying stemming to description column
merged_df["description"]=merged_df["description"].apply(stem)

In [22]:
merged_df.head()

Unnamed: 0,movieId,title,description,tmdbId
0,1,Toy Story,adventur fantasi fun pixar anim comedi children,862.0
1,2,Jumanji,adventur robinwilliam fantasi magicboardgam ch...,8844.0
2,3,Grumpier Old Men,old romanc comedi moldi,15602.0
3,4,Waiting to Exhale,romanc comedi drama,31357.0
4,5,Father of the Bride Part II,pregnanc comedi remak,11862.0


Using count vectorizer and cosine similarity

In [23]:
#Initialize CountVectorizer to convert text data into numerical vectors
cv=CountVectorizer(max_features=5000,stop_words="english") #stop words=eng removes english stop words


In [24]:
#The fit_transform() method converts the text data into a sparse matrix representation where each row corresponds to a movie
vectors=cv.fit_transform(merged_df["description"]).toarray()

In [25]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
cv.get_feature_names_out()

array(['06oscarnominatedbestmovie', '1900', '1920', ..., 'zoekazan',
       'zombi', 'zooeydeschanel'], dtype=object)

In [27]:
#Finding cosine similarity
similar=cosine_similarity(vectors)

In [28]:
#function to recommend movie
def recommend_movie(movie):
  movie_index=merged_df[merged_df['title']==movie].index[0]
  dist=similar[movie_index]
  movie_list=sorted(list(enumerate(dist)),reverse=True,key=lambda x:x[1])[1:6]
  for i in movie_list:
    print(merged_df.iloc[i[0]].title)

In [29]:
recommend_movie('Big Daddy')

Grown Ups 2
Four Rooms
Ace Ventura: When Nature Calls
Bio-Dome
Friday
