In [2]:
import numpy as np
import pandas as pd

In [3]:
#Loading movielens data

#User's data
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv("ml-100k\\u.user", sep='|', names=users_cols, parse_dates=True) 
#Ratings
rating_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv("ml-100k\\u.data", sep='\t', names=rating_cols)
#Movies
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv("ml-100k\\u.item", sep='|', names=movie_cols, usecols=range(5),encoding='latin-1')


In [4]:
#Merging movie data with their ratings
movie_ratings = pd.merge(movies, ratings)
#merging movie_ratings data with the User's dataframe
df = pd.merge(movie_ratings, users)
 #pre-processing
 #dropping colums that aren't needed
df.drop(df.columns[[3,4,7]], axis=1, inplace=True)
ratings.drop( "unix_timestamp", inplace = True, axis = 1 ) 
movies.drop(movies.columns[[3,4]], inplace = True, axis = 1 )

In [5]:
df.head(2)

Unnamed: 0,movie_id,title,release_date,user_id,rating,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,308,4,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,308,5,60,M,retired,95076


In [6]:
#Pivot Table(This creates a matrix of users and movie_ratings)
ratings_matrix = ratings.pivot_table(index=['movie_id'],columns=['user_id'],values='rating').reset_index(drop=True)
ratings_matrix.fillna( 0, inplace = True )
ratings_matrix.head(3)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#Cosine Similarity(Creates a cosine matrix of similaraties ..... which is the pairwise distances
# between two items )
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

movie_similarity = 1 - pairwise_distances( ratings_matrix.as_matrix(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) 
ratings_matrix = pd.DataFrame( movie_similarity )
ratings_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.000000,0.000000,0.000000,0.035387,0.0,0.0,0.0,0.047183,0.047183
1,0.402382,0.000000,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.078299,0.078299
2,0.330245,0.273069,0.000000,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.000000,0.000000,0.000000,0.000000,0.032292,0.0,0.0,0.0,0.000000,0.096875
3,0.454938,0.502571,0.324866,0.000000,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.000000,0.000000,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
4,0.286714,0.318836,0.212957,0.334239,0.000000,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.094211
5,0.116344,0.083563,0.106722,0.090308,0.037299,0.000000,0.139617,0.083876,0.151064,0.203097,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
6,0.620979,0.383403,0.372921,0.489283,0.334769,0.139617,0.000000,0.423515,0.527462,0.318623,...,0.000000,0.051498,0.000000,0.000000,0.051498,0.0,0.0,0.0,0.051498,0.051498
7,0.481114,0.337002,0.200794,0.490236,0.259161,0.083876,0.423515,0.000000,0.424429,0.267764,...,0.000000,0.082033,0.065627,0.065627,0.082033,0.0,0.0,0.0,0.082033,0.000000
8,0.496288,0.255252,0.273669,0.419044,0.272448,0.151064,0.527462,0.424429,0.000000,0.288514,...,0.000000,0.000000,0.057360,0.057360,0.071700,0.0,0.0,0.0,0.057360,0.071700
9,0.273935,0.171082,0.158104,0.252561,0.055453,0.203097,0.318623,0.267764,0.288514,0.000000,...,0.000000,0.000000,0.080264,0.080264,0.000000,0.0,0.0,0.0,0.000000,0.000000


In [9]:
#Recommender System in Action

try:
    user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
    #user_inp="Speed (1994)"
    inp=movies[movies['title']==user_inp].index.tolist()
    inp=inp[0]
    
    movies['similarity'] = ratings_matrix.iloc[inp]
    movies.columns = ['movie_id', 'title', 'release_date','similarity']
    movies.head(5)
    
except:
    print("Sorry, the movie is not in the database!")
    
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False )[1:10])


Enter the reference movie title based on which recommendations are to be made: Avengers-The Age of Ultron
Sorry, the movie is not in the database!
Recommended movies based on your choice of  Avengers-The Age of Ultron : 
      movie_id                                         title release_date  \
180       181                     Return of the Jedi (1983)  14-Mar-1997   
120       121                 Independence Day (ID4) (1996)  03-Jul-1996   
116       117                              Rock, The (1996)  07-Jun-1996   
404       405                    Mission: Impossible (1996)  22-May-1996   
150       151  Willy Wonka and the Chocolate Factory (1971)  01-Jan-1971   
221       222               Star Trek: First Contact (1996)  22-Nov-1996   
99        100                                  Fargo (1996)  14-Feb-1997   
236       237                          Jerry Maguire (1996)  13-Dec-1996   
173       174                Raiders of the Lost Ark (1981)  01-Jan-1981   

     similarity  