# Movie recommendation system using genre of the movie

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np

In [2]:
## Reading the data

In [3]:
movies_df = pd.read_csv('movies.csv')

In [4]:
## shape of the data
movies_df.shape

(62423, 3)

In [5]:
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
# checking for null values
movies_df.isnull().sum()

movieId      0
title        0
genres       0
year       410
dtype: int64

In [8]:
movies_df.title = movies_df.title.str.lower()

In [9]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,jumanji,Adventure|Children|Fantasy,1995
2,3,grumpier old men,Comedy|Romance,1995
3,4,waiting to exhale,Comedy|Drama|Romance,1995
4,5,father of the bride part ii,Comedy,1995


There is no null values in the dataset but we have to check the details in inside the genre column

In [10]:
movies_df.genres.value_counts()

Drama                                        9056
Comedy                                       5674
(no genres listed)                           5062
Documentary                                  4731
Comedy|Drama                                 2386
                                             ... 
Drama|Fantasy|Horror|Thriller|War               1
Crime|Drama|Fantasy|Mystery|Thriller            1
Children|Comedy|Drama|Western                   1
Action|Animation|Children|Fantasy|Sci-Fi        1
Adventure|Animation|Drama|Fantasy|Romance       1
Name: genres, Length: 1639, dtype: int64

In [11]:
movies_df[movies_df.genres=='(no genres listed)']

Unnamed: 0,movieId,title,genres,year
15881,83773,away with words (san tiao ren),(no genres listed),1999
16060,84768,glitterbug,(no genres listed),1994
16351,86493,"age of the earth, the (a idade da terra)",(no genres listed),1980
16491,87061,trails (veredas),(no genres listed),1978
17404,91246,milky way (tejút),(no genres listed),2007
...,...,...,...,...
62400,209101,hua yang de nian hua,(no genres listed),2001
62401,209103,tsar ivan the terrible,(no genres listed),1991
62407,209133,the riot and the dance,(no genres listed),2018
62415,209151,mao zedong 1949,(no genres listed),2019


The genres contain values like 'no genre listed' so we can remove those from the data set for now

In [10]:
movies_df= movies_df.drop(movies_df[movies_df.genres=='(no genres listed)'].index,axis=0)

In [19]:
movies_df.title.drop_duplicates(keep='first',inplace=True)

In [21]:
movies_df.title

0                          toy story
1                            jumanji
2                   grumpier old men
3                  waiting to exhale
4        father of the bride part ii
                    ...             
62416                   happy flight
62417            santosh subramaniam
62419             window of the soul
62420                      bad poems
62422        women of devil's island
Name: title, Length: 53818, dtype: object

In [22]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,jumanji,Adventure|Children|Fantasy,1995
2,3,grumpier old men,Comedy|Romance,1995
3,4,waiting to exhale,Comedy|Drama|Romance,1995
4,5,father of the bride part ii,Comedy,1995


In [23]:
#making into lower values to easy search 
movies_df.title= movies_df.title.str.lower()

In [24]:
movies_df['genres'] = movies_df.genres.str.split('|')

In [25]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,toy story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,jumanji,"[Adventure, Children, Fantasy]",1995
2,3,grumpier old men,"[Comedy, Romance]",1995
3,4,waiting to exhale,"[Comedy, Drama, Romance]",1995
4,5,father of the bride part ii,[Comedy],1995


In [26]:
from mlxtend.preprocessing import TransactionEncoder
genre_encoder = TransactionEncoder()
genres_df= genre_encoder.fit_transform(movies_df.genres)


In [27]:
genres_df

array([[False,  True,  True, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [ True,  True, False, ..., False, False, False]])

In [28]:
genres_df= pd.DataFrame(genres_df,columns=genre_encoder.columns_)

In [29]:
genres_df

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,False,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57356,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
57357,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
57358,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
57359,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False


In [30]:
genres_df.replace(to_replace=False,value=0,inplace=True)
genres_df.astype(int)

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57356,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
57357,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
57358,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
57359,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [31]:
x= genres_df

Now we apply k means clustering to cluster those movies into different 
first we will choose the number of clusters needed using elbow method

## clustering method for movies

In [32]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
SSE = []
for cluster in range(1,20):
    kmeans = KMeans(n_clusters = cluster, init='k-means++')
    kmeans.fit(x)
    SSE.append(kmeans.inertia_)

# converting the results into a dataframe and plotting them
frame = pd.DataFrame({'Cluster':range(1,20), 'SSE':SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')

Text(0, 0.5, 'Inertia')

In [33]:
model = KMeans(n_clusters = 9,n_init=40,random_state=100)

We will use the no.of clusters as 9

In [34]:
model.fit(x)

KMeans(n_clusters=9, n_init=40, random_state=100)

In [35]:
genres_df['clusters']= model.labels_
genres_df['moviename']= movies_df.title

In [36]:
genres_df.head(10)

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,clusters,moviename
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,toy story
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,jumanji
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,grumpier old men
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6,waiting to exhale
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,father of the bride part ii
5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,heat
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,sabrina
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,tom and huck
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,sudden death
9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,goldeneye


# Function to recommendation

In [121]:
def recome(movie):
    movie_s = genres_df[genres_df['moviename']==movie].iloc[:,0:-2]
    cluster= np.array(genres_df[genres_df['moviename']==movie].clusters)[0]
    similar= genres_df[genres_df['clusters']==cluster].iloc[:,0:-2]  
    dist= []
    from scipy.spatial import distance
    for i in similar.index:
        dis=distance.cosine(movie_s,similar.loc[i,:])
        dist.append((dis,i))

    sorted_dis= sorted(dist,reverse=False)[1:6]
    movies_name=[]
    for i in sorted_dis:
        movies_name.append(genres_df['moviename'].iloc[i[1]])
    return(movies_name)

In [122]:
recome('toy story')

['antz',
 'toy story 2',
 'adventures of rocky and bullwinkle, the',
 "emperor's new groove, the",
 'monsters, inc.']