In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [4]:
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144


Cleaning the genres column

In [5]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [7]:
movies.duplicated().sum(), ratings.duplicated().sum()

(0, 0)

In [8]:
movies.genres.head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

In [9]:
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [10]:
movies['tags'] = movies['genres'].apply(lambda x: [i.replace(' ','') for i in x])
movies['tags'] = movies['genres'].apply(lambda x: [i.replace('-','') for i in x])

In [11]:
movies['tags']

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9120                          [Adventure, Drama, Romance]
9121                  [Action, Adventure, Fantasy, SciFi]
9122                                        [Documentary]
9123                                             [Comedy]
9124                                        [Documentary]
Name: tags, Length: 9125, dtype: object

In [12]:
movies.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy],[Comedy]


In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# Content Based Reccomender

In [14]:
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x))

In [15]:
movies['tags'] = movies['tags'].apply(lambda x: x.lower())

In [16]:
movies.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",adventure animation children comedy fantasy
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",adventure children fantasy
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",comedy romance
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",comedy drama romance
4,5,Father of the Bride Part II (1995),[Comedy],comedy


In [17]:
# import nltk
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()

In [18]:
# def stem(text):
#   y = []

#   for i in text.split():
#     y.append(ps.stem(i))

#   return ' '.join(y)

In [19]:
# movies['tags'] = movies['tags'].apply(stem)

In [20]:
# movies.head()

In [21]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
vectors = cv.fit_transform(movies['tags']).toarray()

In [22]:
vectors.shape

(9125, 22)

In [23]:
cv.get_feature_names()



['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'genres',
 'horror',
 'imax',
 'listed',
 'musical',
 'mystery',
 'no',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
similarity = cosine_similarity(vectors)

In [28]:
similarity[0].shape

(9125,)

In [52]:
def genre_based_recc(movie,n):
  movie_index = movies[movies['title']==movie].index[0]
  dist = similarity[movie_index]
  movies_list = sorted(list(enumerate(dist)), reverse=True, key=lambda x:x[1])
  count = 0
  for i in movies_list:
    
    if (movie != movies.iloc[i[0]].title and count != n):
      print(movies.iloc[i[0]].title)
      count+=1

In [54]:
print(movies['title'][10])
genre_based_recc(movies['title'][10],10)

American President, The (1995)
Waiting to Exhale (1995)
Mighty Aphrodite (1995)
Postman, The (Postino, Il) (1994)
Beautiful Girls (1996)
Something to Talk About (1995)
Don Juan DeMarco (1995)
Eat Drink Man Woman (Yin shi nan nu) (1994)
Nobody's Fool (1994)
Corrina, Corrina (1994)
It Could Happen to You (1994)


# Collaborative Filtering System

In [55]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [60]:
new_df = ratings.pivot(index='movieId', columns='userId',values='rating')
new_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,3.0,,4.0,,...,,4.0,3.5,,,,,,4.0,5.0
2,,,,,,,,,,,...,5.0,,,3.0,,,,,,
3,,,,,4.0,,,,,,...,,,,3.0,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,3.0,,,,,,


In [61]:
new_df.fillna(0,inplace=True)

In [62]:
new_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
no_user_voted = ratings.groupby('movieId')['rating'].agg('count')
no_movies_voted = ratings.groupby('userId')['rating'].agg('count')

In [79]:
no_movies_voted

userId
1       20
2       76
3       51
4      204
5      100
      ... 
667     68
668     20
669     37
670     31
671    115
Name: rating, Length: 671, dtype: int64

In [80]:
no_user_voted

movieId
1         247
2         107
3          59
4          13
5          56
         ... 
161944      1
162376      1
162542      1
162672      1
163949      1
Name: rating, Length: 9066, dtype: int64

In [85]:
new_df = new_df.loc[:,no_movies_voted[no_movies_voted > 50].index]
new_df = new_df.loc[no_user_voted[no_user_voted > 10].index,:]

In [86]:
new_df.head()

userId,2,3,4,5,7,8,12,13,15,17,...,655,656,658,659,660,662,664,665,667,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,2.0,0.0,...,0.0,0.0,0.0,0.0,2.5,0.0,3.5,0.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,4.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,0.0
3,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


In [88]:
new_df.shape

(2083, 421)

In [89]:
from scipy.sparse import csr_matrix

In [90]:
csr_data = csr_matrix(new_df.values)
new_df.reset_index(inplace=True)

In [91]:
from sklearn.neighbors import NearestNeighbors

In [108]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [131]:
def collab_recc(movie,n):
  movie_list = movies[movies['title'].str.contains(movie)]

  if len(movie_list):
    movie_id = movie_list.iloc[0]['movieId']
    movie_id = new_df[new_df['movieId']==movie_id].index[0]

    dist , indices = knn.kneighbors(csr_data[movie_id], n_neighbors=n+1)
    rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(), dist.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    recommend_frame = []

    for val in rec_movie_indices:
      movie_id = new_df.iloc[val[0]]['movieId']
      id = movies[movies['movieId']==movie_id].index
      recommend_frame.append({'Title':movies.iloc[id]['title'].values[0],'Dist':val[1]})
    
    df = pd.DataFrame(recommend_frame, index=range(1,n+1))
    return df
  else:
    return 'No movies found. Please check your input.'



In [132]:
collab_recc('Iron Man',10)

Unnamed: 0,Title,Dist
1,Inception (2010),0.385641
2,"Dark Knight Rises, The (2012)",0.38558
3,WALL·E (2008),0.377571
4,300 (2007),0.364423
5,Guardians of the Galaxy (2014),0.363696
6,"Avengers, The (2012)",0.358591
7,Star Trek (2009),0.329935
8,Avatar (2009),0.329733
9,Batman Begins (2005),0.329124
10,"Dark Knight, The (2008)",0.210279
