In [51]:
import numpy as np
import pandas as pd
import warnings

In [52]:
# Silence every Python warning for the rest of the session. This keeps cell output
# tidy, but it also hides deprecation and parser warnings raised by pandas/numpy.
warnings.filterwarnings('ignore')

In [53]:
# Column labels supplied by hand for the tab-separated ratings file.
column_names = ['User_ID', 'Item_ID', 'Ratings', 'Timestamp']
user_rating = pd.read_csv(
    'u.data',
    names=column_names,
    sep="\t",
)
user_rating.head()

Unnamed: 0,User_ID,Item_ID,Ratings,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [54]:
# Load the movie catalogue, keeping only the first two pipe-separated fields.
column_names2 = ['Item_ID', 'Title']
# Raw string: "\|" in a normal literal is an invalid escape sequence and triggers a
# SyntaxWarning on newer Python versions. A regex separator is only handled by the
# python parsing engine, so request it explicitly instead of relying on the fallback.
movies = pd.read_csv('u.item', sep=r"\|", header=None, engine='python')[[0, 1]]
movies.columns = column_names2
movies.head()

Unnamed: 0,Item_ID,Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [55]:
# Attach each rating's movie title by matching rows on the shared Item_ID column.
working_dataset = user_rating.merge(movies, on='Item_ID')
working_dataset.head()

Unnamed: 0,User_ID,Item_ID,Ratings,Timestamp,Title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [56]:
# Mean rating per title. 'Ratings' is selected before aggregating so pandas does not
# also average User_ID, Item_ID and Timestamp only for those results to be discarded.
movie_ratings = working_dataset.groupby('Title')['Ratings'].mean().to_frame()
movie_ratings.head()

Unnamed: 0_level_0,Ratings
Title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [57]:
# Number of ratings each title received. Assigned as a Series (not a one-column
# DataFrame) so it aligns with movie_ratings on the Title index.
movie_ratings['Number_of_Ratings'] = working_dataset.groupby('Title')['Ratings'].count()
# Rebind rather than mutate in place so re-running the cell is harmless.
movie_ratings = movie_ratings.dropna()
movie_ratings.head()

Unnamed: 0_level_0,Ratings,Number_of_Ratings
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


Movie Recommendation System

In [58]:
# Show the merged ratings/titles table again as a reminder of its columns.
working_dataset.head()

Unnamed: 0,User_ID,Item_ID,Ratings,Timestamp,Title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [59]:
# Build the rating matrix: one row per user, one column per movie title.
user_rating2 = working_dataset.pivot_table(index='User_ID', columns='Title', values='Ratings')

# Column of ratings that users gave to Titanic.
titanic_ratings = user_rating2['Titanic (1997)']

# Correlate every movie's rating column with Titanic's, then drop movies for which
# no correlation could be computed (NaN). Built as a new frame instead of mutating
# in place so the cell can be re-run safely.
titanic_correlation = (
    user_rating2.corrwith(titanic_ratings)
    .to_frame(name='Correlation')
    .dropna()
)

# Candidate recommendations: the 10 titles most correlated with Titanic.
# Only this final expression is rendered; intermediate previews were removed because
# a notebook cell displays nothing for expressions that are not its last line.
titanic_correlation.sort_values('Correlation', ascending=False).head(10)

Unnamed: 0_level_0,Correlation
Title,Unnamed: 1_level_1
Nadja (1994),1.0
"Pest, The (1997)",1.0
"Savage Nights (Nuits fauves, Les) (1992)",1.0
For Ever Mozart (1996),1.0
"Jerky Boys, The (1994)",1.0
"Newton Boys, The (1998)",1.0
Hearts and Minds (1996),1.0
"Simple Wish, A (1997)",1.0
Purple Noon (1960),1.0
Year of the Horse (1997),1.0


In [60]:
# Correlation alone ignores how many people rated each movie, so attach the number
# of ratings per title; a minimum-ratings threshold can then be applied to it.
# Selecting only 'Correlation' first keeps this cell re-runnable: joining onto a
# frame that already contains 'Number_of_Ratings' raises a column-overlap ValueError.
titanic_correlation = titanic_correlation[['Correlation']].join(movie_ratings['Number_of_Ratings'])

In [61]:
# Preview the correlation table now that it carries the per-title rating count.
titanic_correlation.head()

Unnamed: 0_level_0,Correlation,Number_of_Ratings
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),-0.062017,9
101 Dalmatians (1996),0.120113,109
12 Angry Men (1957),0.0777,125
187 (1997),0.315654,41
2 Days in the Valley (1996),0.017295,93


In [64]:
# Keep only titles with more than 100 ratings, then rank them by correlation.
(
    titanic_correlation
    .loc[lambda frame: frame['Number_of_Ratings'] > 100]
    .sort_values(by='Correlation', ascending=False)
)

Unnamed: 0_level_0,Correlation,Number_of_Ratings
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Titanic (1997),1.000000,350
"River Wild, The (1994)",0.497600,146
"Abyss, The (1989)",0.472103,151
Bram Stoker's Dracula (1992),0.443560,120
True Lies (1994),0.435104,208
...,...,...
Raging Bull (1980),-0.223660,116
Jackie Brown (1997),-0.236841,126
Brazil (1985),-0.243532,208
Cold Comfort Farm (1995),-0.307150,125


In [66]:
def recommend_movies(movie_name, min_ratings=100, exclude_self=False,
                     ratings_matrix=None, rating_counts=None):
  """Rank movies by how strongly their user ratings correlate with one movie.

  Parameters
  ----------
  movie_name : str
      Title to find similar movies for; must be a column of the ratings matrix.
  min_ratings : int, default 100
      Only movies with strictly more than this many ratings are returned.
  exclude_self : bool, default False
      When True, drop ``movie_name`` itself from the result (it otherwise shows
      up as a match with correlation 1.0).
  ratings_matrix : pandas.DataFrame, optional
      User x movie rating matrix. Defaults to the notebook-level ``user_rating2``.
  rating_counts : pandas.Series, optional
      Number of ratings per title, indexed by title. Defaults to
      ``movie_ratings['Number_of_Ratings']`` from the notebook.

  Returns
  -------
  pandas.DataFrame
      Indexed by title with columns ``Correlation`` and ``Number_of_Ratings``,
      sorted by ``Correlation`` in descending order.

  Raises
  ------
  KeyError
      If ``movie_name`` is not a column of the ratings matrix.
  """
  # Fall back to the notebook-level tables so existing one-argument calls keep working.
  if ratings_matrix is None:
    ratings_matrix = user_rating2
  if rating_counts is None:
    rating_counts = movie_ratings['Number_of_Ratings']

  if movie_name not in ratings_matrix.columns:
    raise KeyError(f"Movie title not found in ratings matrix: {movie_name!r}")

  movie_user_rating = ratings_matrix[movie_name]
  similar_to_movie = ratings_matrix.corrwith(movie_user_rating)

  # Movies whose correlation could not be computed come back as NaN and are dropped.
  # The counts are renamed so the filter below works whatever the Series was called.
  movie_correlation = (
    similar_to_movie.to_frame(name='Correlation')
    .dropna()
    .join(rating_counts.rename('Number_of_Ratings'))
  )

  popular_enough = movie_correlation['Number_of_Ratings'] > min_ratings
  recommended_movies = movie_correlation[popular_enough].sort_values('Correlation', ascending=False)

  if exclude_self:
    # errors='ignore': the queried title may already have been filtered out above.
    recommended_movies = recommended_movies.drop(index=movie_name, errors='ignore')

  return recommended_movies

In [67]:
# Rank movies by rating correlation with Titanic and preview the top five rows.
my_next_movie_recommendation = recommend_movies('Titanic (1997)')
my_next_movie_recommendation.head()

Unnamed: 0_level_0,Correlation,Number_of_Ratings
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Titanic (1997),1.0,350
"River Wild, The (1994)",0.4976,146
"Abyss, The (1989)",0.472103,151
Bram Stoker's Dracula (1992),0.44356,120
True Lies (1994),0.435104,208


In [68]:
# Same ranking for Star Wars, previewing the top five rows.
my_next_movie_recommendation2 = recommend_movies('Star Wars (1977)')
my_next_movie_recommendation2.head()

Unnamed: 0_level_0,Correlation,Number_of_Ratings
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Austin Powers: International Man of Mystery (1997),0.377433,130
