In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import NMF
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error as mse


In [2]:
users = pd.read_csv('inputData/users.dat', sep='::',names = ['UserID','Gender','Age','Occupation','Zipcode'])
users[:5]

  users = pd.read_csv('inputData/users.dat', sep='::',names = ['UserID','Gender','Age','Occupation','Zipcode'])


Unnamed: 0,UserID,Gender,Age,Occupation,Zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
movies = pd.read_csv('inputData/movies.dat', sep='::',names = ['MovieID','Title','Genre'])
movies_map = dict(movies[['MovieID','Title']].values)
movies[:5]

  movies = pd.read_csv('inputData/movies.dat', sep='::',names = ['MovieID','Title','Genre'])


Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
'''
- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings
'''

data = pd.read_csv('inputData/ratings.dat',sep='::',names = ['UserID','MovieID','Rating','Timestamp'])

data[:5]

  data = pd.read_csv('inputData/ratings.dat',sep='::',names = ['UserID','MovieID','Rating','Timestamp'])


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
def get_pivoted_df(df):
    all_movies = pd.DataFrame({'UserID':[0 for _ in range(3952)],
                           'MovieID':range(1,3953)})
    all_users = pd.DataFrame({'UserID':range(1,6041),
                           'MovieID':[0 for _ in range(6040)]})
    return pd.concat([df,all_movies,all_users]).pivot('UserID','MovieID',"Rating").loc[1:,1:]

ratingsDf = get_pivoted_df(data)
ratingsDf[:5]

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [6]:
def get_rmse(pred, actual):
    pred = np.minimum(np.maximum(pred[actual.nonzero()].flatten(),1),5)    # Ignore nonzero terms
    actual = actual[actual.nonzero()].flatten() # Ignore nonzero terms
    return np.sqrt(mse(pred, actual))

In [7]:
all_models = list()
kf = KFold(4, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(data):
    print("*"*10)
    train_set = get_pivoted_df(data.loc[train_idx]).values.astype(np.int8)
    val_set = get_pivoted_df(data.loc[val_idx]).values.astype(np.int8)
    
    for n_component in [20, 24,30]:
        print ("Fitting Model {} !".format(len(all_models)))
        model = NMF(n_components=n_component, init='random')
        model.fit(train_set)
        all_models.append(model)
        
        user_features = model.transform(train_set)
        movie_features = model.components_.T
        predictions = user_features.dot(movie_features.T)

        print("n_component : {}\t\t\tRMSE : {}".format(n_component, get_rmse(predictions, val_set)))

**********
Fitting Model 0 !




n_component : 20			RMSE : 2.4894226613369095
Fitting Model 1 !




n_component : 24			RMSE : 2.484150821500668
Fitting Model 2 !




n_component : 30			RMSE : 2.4895621168179605
**********
Fitting Model 3 !
n_component : 20			RMSE : 2.4946999758819572
Fitting Model 4 !




n_component : 24			RMSE : 2.4911736769630815
Fitting Model 5 !




n_component : 30			RMSE : 2.4892985980115556
**********
Fitting Model 6 !




n_component : 20			RMSE : 2.488323719363591
Fitting Model 7 !




n_component : 24			RMSE : 2.4872706990767655
Fitting Model 8 !




n_component : 30			RMSE : 2.492048673681675
**********
Fitting Model 9 !




n_component : 20			RMSE : 2.4910369321231633
Fitting Model 10 !




n_component : 24			RMSE : 2.485809917071742
Fitting Model 11 !




n_component : 30			RMSE : 2.4934132772557915


In [8]:
all_predictions = list()
for model in all_models:
    user_features = model.transform(ratingsDf.values.astype(np.int8))
    movie_features = model.components_.T
    predictions = user_features.dot(movie_features.T)
    all_predictions.append(predictions)
len(all_predictions)

12

In [9]:
final_scores = np.mean(all_predictions, axis=0)
final_scores.shape

(6040, 3952)

In [10]:
final_scores_corrected = pd.DataFrame(np.minimum(np.maximum(final_scores, 1), 5) , columns=range(1,3953), index=range(1,6041))
final_scores_corrected[:5]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,3.436621,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.007652,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.557068,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.459799,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.325213,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
recommendation_scores = final_scores.copy()
recommendation_scores[ratingsDf.values.astype(np.int8).nonzero()]=-1


In [12]:
num_recommendations = 10
top_recommendations = pd.DataFrame(((-recommendation_scores).argsort(axis=1)+1)[:,:num_recommendations], index=range(1,6041)).applymap(lambda x: movies_map.get(x, 'NA'))
top_recommendations

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,"Lion King, The (1994)","Shawshank Redemption, The (1994)",Babe (1995),"Little Mermaid, The (1989)",Fantasia (1940),Lady and the Tramp (1955),Peter Pan (1953),"Jungle Book, The (1967)",Sleeping Beauty (1959),"Silence of the Lambs, The (1991)"
2,Schindler's List (1993),Men in Black (1997),"Rock, The (1996)",Star Wars: Episode IV - A New Hope (1977),Speed (1994),Good Will Hunting (1997),Air Force One (1997),Apollo 13 (1995),Face/Off (1997),Fargo (1996)
3,Ghostbusters (1984),"Matrix, The (1999)",Ferris Bueller's Day Off (1986),Airplane! (1980),Toy Story (1995),E.T. the Extra-Terrestrial (1982),"Terminator, The (1984)",Star Wars: Episode I - The Phantom Menace (1999),Big (1988),When Harry Met Sally... (1989)
4,Indiana Jones and the Last Crusade (1989),"Princess Bride, The (1987)",Aliens (1986),Terminator 2: Judgment Day (1991),"Matrix, The (1999)",Braveheart (1995),Lethal Weapon (1987),Back to the Future (1985),Batman (1989),"Fugitive, The (1993)"
5,Clerks (1994),Elizabeth (1998),Shakespeare in Love (1998),Magnolia (1999),Trainspotting (1996),American Pie (1999),Boogie Nights (1997),"Crying Game, The (1992)",Sling Blade (1996),Eyes Wide Shut (1999)
...,...,...,...,...,...,...,...,...,...,...
6036,Fight Club (1999),"Ice Storm, The (1997)",Dead Man Walking (1995),Trainspotting (1996),Pulp Fiction (1994),Glengarry Glen Ross (1992),Mad Max (1979),"Sex, Lies, and Videotape (1989)",Blue Velvet (1986),"Rocky Horror Picture Show, The (1975)"
6037,"Graduate, The (1967)","Clockwork Orange, A (1971)","Maltese Falcon, The (1941)",Star Wars: Episode V - The Empire Strikes Back...,Young Frankenstein (1974),Brazil (1985),Bonnie and Clyde (1967),Lawrence of Arabia (1962),Gone with the Wind (1939),Raging Bull (1980)
6038,Star Wars: Episode V - The Empire Strikes Back...,"Princess Bride, The (1987)",Star Wars: Episode IV - A New Hope (1977),"Close Shave, A (1995)",Shakespeare in Love (1998),Young Frankenstein (1974),Toy Story (1995),Groundhog Day (1993),Back to the Future (1985),Raiders of the Lost Ark (1981)
6039,Some Like It Hot (1959),North by Northwest (1959),"Philadelphia Story, The (1940)",Double Indemnity (1944),Star Wars: Episode V - The Empire Strikes Back...,"African Queen, The (1951)",Snow White and the Seven Dwarfs (1937),"Manchurian Candidate, The (1962)",Strangers on a Train (1951),Gone with the Wind (1939)
