# Recommendation system
This is a demonstration of a recommendation system built using IMDB ratings data to recommend new movies to users based on their behavior.

In [66]:
# import libraries
import pandas as pd
from scipy.sparse import csr_matrix

In [82]:
# Read the movie catalogue and the user ratings from the local dataset folder
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("IMDB-Dataset/movies.csv", "IMDB-Dataset/ratings.csv")
)

In [83]:
# Plain-text preview of the first rows of both tables, one after the other
print(movies.head(), ratings.head(), sep="\n")

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating   timestamp
0       1       16     4.0  1217897793
1       1       24     1.5  1217895807
2       1       32     4.0  1217896246
3       1       47     4.0  1217896556
4       1       50     4.0  1217896523


In [84]:
# Summary statistics for every column of movies; include='all' also covers
# the non-numeric title and genres columns (count, unique, top, freq)
movies.describe(include='all')

Unnamed: 0,movieId,title,genres
count,10329.0,10329,10329
unique,,10327,938
top,,War of the Worlds (2005),Drama
freq,,2,1385
mean,31924.282893,,
std,37734.741149,,
min,1.0,,
25%,3240.0,,
50%,7088.0,,
75%,59900.0,,


In [85]:
# Summary statistics for every column of ratings (count, mean, std, min, quartiles, max)
ratings.describe(include='all')

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [None]:
# One row per (movie, genre) pair: split the pipe-delimited genre string into a
# list and explode that list into separate rows. `movies` itself is left untouched.
movies_explode = (
    movies
    .assign(genres=movies.genres.str.split('|'))
    .explode('genres')
)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [87]:
# Number of movies tagged with each genre, most frequent first
movies_explode['genres'].value_counts()

genres
Drama                 5220
Comedy                3515
Thriller              2187
Romance               1788
Action                1737
Crime                 1440
Adventure             1164
Horror                1001
Sci-Fi                 860
Mystery                675
Fantasy                670
Children               540
War                    503
Documentary            415
Musical                409
Animation              401
Western                235
Film-Noir              195
IMAX                   152
(no genres listed)       7
Name: count, dtype: int64

In [88]:
# Peek at the exploded table: the original row index repeats once per genre of a movie
movies_explode.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [89]:
# One-hot encode the genres: build one indicator column per genre next to the
# first two columns (movieId, title), then collapse the exploded rows back to a
# single row per movie by summing the indicators.
movies_encode = (
    pd.concat(
        [movies_explode.iloc[:, :2], pd.get_dummies(movies_explode.genres)],
        axis=1,
    )
    .groupby(['movieId', 'title'], as_index=False)
    .sum()
)
movies_encode.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Attach the title and genre indicator columns to every rating via the shared movieId key
full_data = ratings.merge(movies_encode, on='movieId')
full_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,16,4.0,1217897793,Casino (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,24,1.5,1217895807,Powder (1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


# User Based Collaborative Filtering
Here we will use k-nearest neighbors (KNN) to find, for a given movie, the movies that are closest to it based on the ratings users have given them.

In [91]:
# Only the first three columns (userId, movieId, rating) are needed here.
# Reshape them into a movie x user table of ratings; pairs without a rating become 0.
dataset = (
    ratings.iloc[:, :3]
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [92]:
# Keep only movies with more than 10 ratings and users who rated more than 50
# movies (both thresholds are strict, so exactly 10 / exactly 50 is dropped).
movie_rate_cnt = ratings.groupby('movieId').size()
user_rate_cnt = ratings.groupby('userId').size()

kept_movie_ids = movie_rate_cnt[movie_rate_cnt > 10].index
kept_user_ids = user_rate_cnt[user_rate_cnt > 50].index
dataset = dataset.loc[kept_movie_ids, kept_user_ids]

In [93]:
# Most entries are zero (no rating), so store the ratings as a sparse matrix.
# Only the user columns go into the matrix, and the index is reset only while it
# is still the movieId index. That makes the cell safe to re-run: a second run no
# longer feeds the movieId column into csr_data or adds a stray 'index' column.
user_columns = dataset.columns.drop('movieId', errors='ignore')
csr_data = csr_matrix(dataset[user_columns].values)
if dataset.index.name == 'movieId':
    # Turn movieId into a regular column; row i of dataset matches row i of csr_data
    dataset = dataset.reset_index()

In [94]:
# Show the stored (row, column) -> rating entries of the sparse matrix
print(csr_data)

  (0, 3)	4.0
  (0, 6)	5.0
  (0, 8)	4.0
  (0, 11)	5.0
  (0, 20)	3.0
  (0, 21)	4.0
  (0, 22)	4.5
  (0, 23)	4.0
  (0, 26)	5.0
  (0, 29)	1.5
  (0, 30)	4.0
  (0, 33)	4.0
  (0, 34)	3.0
  (0, 35)	3.0
  (0, 37)	3.0
  (0, 39)	2.0
  (0, 43)	4.0
  (0, 44)	4.5
  (0, 45)	5.0
  (0, 46)	2.0
  (0, 50)	3.5
  (0, 51)	3.0
  (0, 53)	3.0
  (0, 55)	4.5
  (0, 56)	4.0
  :	:
  (2158, 104)	4.5
  (2158, 105)	3.5
  (2158, 151)	5.0
  (2158, 155)	3.5
  (2158, 191)	4.5
  (2158, 249)	4.0
  (2158, 275)	3.5
  (2158, 330)	3.5
  (2158, 377)	4.0
  (2158, 389)	4.0
  (2159, 26)	3.5
  (2159, 34)	4.0
  (2159, 57)	4.5
  (2159, 104)	4.0
  (2159, 108)	4.5
  (2159, 141)	5.0
  (2159, 151)	4.0
  (2159, 155)	5.0
  (2159, 178)	2.5
  (2159, 186)	4.5
  (2159, 191)	4.5
  (2159, 249)	3.5
  (2159, 324)	4.0
  (2159, 354)	5.0
  (2159, 389)	3.0


In [95]:
# Build the nearest-neighbour model over the rows of csr_data (one row per movie),
# comparing rows by cosine distance with a brute-force search
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(csr_data)

In [142]:
# Function that takes a (partial) movie title and returns the recommendations
def get_movie_recommendation(movie_name, number_of_rec):
    """Recommend movies whose ratings are closest to the first title matching ``movie_name``.

    Relies on these notebook-level objects: ``movies`` (title catalogue),
    ``dataset`` (filtered rating table with a ``movieId`` column), ``csr_data``
    (the rating matrix, one row per ``dataset`` row) and ``knn`` (the fitted
    nearest-neighbour model).

    Parameters
    ----------
    movie_name : str
        Text searched for inside ``movies['title']``. It is matched literally
        (not as a regular expression) and case-sensitively; the first matching
        row of ``movies`` is used.
    number_of_rec : int
        Maximum number of recommendations to return. Fewer rows come back when
        ``csr_data`` does not hold enough other movies.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title`` and ``distance``, ordered from the largest distance
        to the smallest (the closest movie is the last row). ``None`` is
        returned, after printing a message, when no title matches or when the
        matched movie is not present in ``dataset``.
    """
    # regex=False: titles are full of regex metacharacters such as "(" and ")",
    # so a name like "(500) Days" must be searched for literally.
    matches = movies[movies['title'].str.contains(movie_name, regex=False, na=False)]
    if matches.empty:
        print('Movie not in database.')
        return None

    movie_id = matches.iloc[0]['movieId']
    rows = dataset[dataset['movieId'] == movie_id]
    if rows.empty:
        print('Movie does not have enough data.')
        return None
    query_idx = rows.index[0]

    # Ask for one extra neighbour because the query movie is normally returned as
    # its own nearest neighbour, but never for more rows than the matrix holds.
    n_neighbors = min(number_of_rec + 1, csr_data.shape[0])
    distance, indexes = knn.kneighbors(csr_data[query_idx], n_neighbors=n_neighbors)

    # ravel() (unlike squeeze()) still yields a 1-D sequence for a single neighbour.
    # The query movie is removed by row index rather than by its position after
    # sorting, which stays correct when another movie is tied at the same distance.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indexes.ravel().tolist(), distance.ravel().tolist())
        if idx != query_idx
    ]
    neighbours.sort(key=lambda pair: pair[1], reverse=True)
    # Farthest first, so the closest `number_of_rec` entries are at the tail.
    neighbours = neighbours[-number_of_rec:] if number_of_rec > 0 else []

    rec_movie_list = []
    for idx, dist in neighbours:
        rec_movie_id = dataset.loc[idx, 'movieId']
        title = movies.loc[movies['movieId'] == rec_movie_id, 'title'].iloc[0]
        rec_movie_list.append({'title': title, 'distance': dist})

    return pd.DataFrame(rec_movie_list, columns=['title', 'distance'])


In [144]:
# Ten recommendations for the first title containing "Iron Man"
results = get_movie_recommendation(movie_name='Iron Man', number_of_rec=10)
results

Unnamed: 0,title,distance
0,Batman Begins (2005),0.374727
1,WALL·E (2008),0.370822
2,Watchmen (2009),0.362019
3,300 (2007),0.355342
4,"Avengers, The (2012)",0.346966
5,Inception (2010),0.340948
6,"Bourne Ultimatum, The (2007)",0.33574
7,Casino Royale (2006),0.307617
8,Star Trek (2009),0.293528
9,"Dark Knight, The (2008)",0.246059


In [139]:
# Manual check of the lookup done inside get_movie_recommendation: take the movieId
# of the first title containing 'The Dark Knight', then select its row(s) in the
# filtered dataset (an empty frame means the movie was filtered out).
dark_knight_id = movies.loc[movies['title'].str.contains('The Dark Knight'), 'movieId'].iloc[0]
movie_idx = dataset[dataset['movieId'] == dark_knight_id]
movie_idx

userId,movieId,1,3,4,5,6,7,8,9,11,...,656,657,659,661,662,664,665,666,667,668
