In [92]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from IPython.display import Image
import itertools
import sklearn

In [93]:
df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab4-recommender/jester-data-1.csv", header = None)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24983 entries, 0 to 24982
Columns: 101 entries, 0 to 100
dtypes: float64(100), int64(1)
memory usage: 19.3 MB


In [94]:
# Replace 99.00 with Nan values
df = df.replace(99.00, np.NaN)
# Drop number of jokes rated by the user; we don't need this for our review matrix
df = df.drop(columns = [0])

In [95]:
mean = []
for column in df:
    mean.append(df[column].mean())
Best_Rated = (max(mean), mean.index(max(mean))) 
Worst_Rated = (min(mean), mean.index(min(mean)))

print(Best_Rated, Worst_Rated)

(3.6650848950824924, 49) (-3.8338796373689963, 57)


In [96]:
arr = df.values
rated = np.where(arr!=99)

In [97]:
def replace(orig, percentage=0.1):
  """
  Replaces 'percentage'% of the original values in 'orig' with 99's
  :param orig: original data array
  :param percentage: percentage of values to replace (0<percentage<1)
  """
  new_data = orig.copy()
  rated = np.where(arr!=99)
  n_rated = len(rated[0])
  idx = np.random.choice(n_rated, size=int(percentage*n_rated), replace=False)
  new_data[rated[0][idx], rated[1][idx]] = 99
  return new_data, (rated[0][idx], rated[1][idx])

In [98]:
new_arr, idx = replace(arr, 0.1)

In [99]:
# Checking the old array doesn't have a 99
arr[idx[0][0], idx[1][0]]

-0.92

In [100]:
# Checking the new array does have a 99 (this is repeated for 10% of the data)
new_arr[idx[0][0], idx[1][0]]

99.0

In [102]:
n_latent_factors = 2
user_ratings = arr
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))

In [106]:
def predict_rating(user_row, item_row):
    user_values = latent_user_preferences[user_row]
    item_values = latent_item_features[item_row]
    return user_values.dot(item_values)

In [107]:
def train(user_row, item_row, rating, alpha=0.0001):
    err = rating - predict_rating(user_row, item_row)
    temp = latent_user_preferences[user_row]
    latent_user_preferences[user_row] += alpha * err * latent_item_features[item_row]
    latent_item_features[item_row] += alpha * err * temp


In [108]:
def sgd(iterations):
    mse_history = []
    for iteration in range(iterations):
        error = []
        for user_row in range(latent_user_preferences.shape[0]):
            for item_row in range(latent_item_features.shape[0]):
                rating = user_ratings[user_row][item_row]
                if not np.isnan(rating) and rating!=99:
                    err = train(user_row, item_row, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        if (iteration % 10) == 0:
            print('Iteration %d/%d:    MSE=%.3f' % (iteration, iterations, mse))
            mse_history.append(mse)
    return mse_history

In [110]:
hist = sgd_svd(10)

In [111]:
predictions = latent_user_preferences.dot(latent_item_features.T)
predictions

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [113]:
predict_rating()

4.56

In [200]:
df_movies = pd.read_excel('movies_latent_factors.xlsx')

In [201]:
df_movies

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
0,11,Star Wars: Episode IV - A New Hope (1977),-1.521848,-1.038507,2.027269,0.247933,-0.594548,2.513260,-1.848910,0.476710,-0.224146,-0.760681,-0.973915,0.862379,0.403861,1.129616,-0.248806
1,12,Finding Nemo (2003),-0.342185,-0.296586,-0.385962,2.443297,-1.097015,-0.619465,0.572887,0.329516,-0.712228,0.323554,-0.561948,-0.142405,-0.564415,0.506876,1.274993
2,13,Forrest Gump (1994),-2.240888,-0.438815,-2.275177,0.614548,0.904469,0.711919,-0.420876,1.302036,-0.868418,-0.160122,0.633667,0.133138,-0.330276,-2.209004,-0.419092
3,14,American Beauty (1999),-0.634531,2.186059,-0.066681,0.086197,0.517558,-0.185319,-0.412352,0.063841,0.075937,-0.577682,0.526803,-1.465557,-0.819682,0.549010,-0.681191
4,22,Pirates of the Caribbean: The Curse of the Bla...,0.517348,-1.456763,0.369161,0.073903,-0.098332,-0.594722,0.828888,0.033626,-0.707414,0.717877,1.394326,-0.786986,1.429191,0.855746,-2.292566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9806,The Incredibles (2004),0.159967,-0.051123,0.311183,1.841867,-1.952736,-0.836041,0.639252,-0.413082,-0.310868,0.448225,-0.846044,-0.158192,-0.649164,0.045580,1.017861
96,10020,Beauty and the Beast (1991),1.286288,0.004957,-0.157640,2.579640,-0.042687,-0.256582,-0.007971,0.458163,0.091394,0.323339,-0.862833,-0.508185,0.994535,0.868700,-0.487076
97,36657,X-Men (2000),0.811901,-0.893280,0.668953,-0.885074,-1.070468,-0.973154,-0.037449,-0.925609,-0.122112,-0.328822,-0.163103,-0.307299,0.642105,-1.355542,0.067790
98,36658,X2: X-Men United (2003),1.161006,-0.715613,0.712633,-0.688808,-0.717785,-1.007047,-0.267268,-0.802322,-0.249239,-0.540419,-0.291422,-0.240216,0.290714,-1.217250,-0.288130


In [202]:
df_movies.loc[df['Movie ID'] == 38]

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
6,38,Eternal Sunshine of the Spotless Mind (2004),-0.695539,1.771799,0.143646,0.244718,0.434222,-0.367995,-0.186841,-0.961435,0.240228,-0.474231,0.492446,-1.041901,-0.181657,0.479142,0.237458


In [203]:
#df.index[df["Factor8"]==df["Factor8"].min()].tolist()
df_movies

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
0,11,Star Wars: Episode IV - A New Hope (1977),-1.521848,-1.038507,2.027269,0.247933,-0.594548,2.513260,-1.848910,0.476710,-0.224146,-0.760681,-0.973915,0.862379,0.403861,1.129616,-0.248806
1,12,Finding Nemo (2003),-0.342185,-0.296586,-0.385962,2.443297,-1.097015,-0.619465,0.572887,0.329516,-0.712228,0.323554,-0.561948,-0.142405,-0.564415,0.506876,1.274993
2,13,Forrest Gump (1994),-2.240888,-0.438815,-2.275177,0.614548,0.904469,0.711919,-0.420876,1.302036,-0.868418,-0.160122,0.633667,0.133138,-0.330276,-2.209004,-0.419092
3,14,American Beauty (1999),-0.634531,2.186059,-0.066681,0.086197,0.517558,-0.185319,-0.412352,0.063841,0.075937,-0.577682,0.526803,-1.465557,-0.819682,0.549010,-0.681191
4,22,Pirates of the Caribbean: The Curse of the Bla...,0.517348,-1.456763,0.369161,0.073903,-0.098332,-0.594722,0.828888,0.033626,-0.707414,0.717877,1.394326,-0.786986,1.429191,0.855746,-2.292566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9806,The Incredibles (2004),0.159967,-0.051123,0.311183,1.841867,-1.952736,-0.836041,0.639252,-0.413082,-0.310868,0.448225,-0.846044,-0.158192,-0.649164,0.045580,1.017861
96,10020,Beauty and the Beast (1991),1.286288,0.004957,-0.157640,2.579640,-0.042687,-0.256582,-0.007971,0.458163,0.091394,0.323339,-0.862833,-0.508185,0.994535,0.868700,-0.487076
97,36657,X-Men (2000),0.811901,-0.893280,0.668953,-0.885074,-1.070468,-0.973154,-0.037449,-0.925609,-0.122112,-0.328822,-0.163103,-0.307299,0.642105,-1.355542,0.067790
98,36658,X2: X-Men United (2003),1.161006,-0.715613,0.712633,-0.688808,-0.717785,-1.007047,-0.267268,-0.802322,-0.249239,-0.540419,-0.291422,-0.240216,0.290714,-1.217250,-0.288130


In [204]:
df_movies.loc[30,:]

Movie ID                     272
Title       Batman Begins (2005)
Factor1                -0.920371
Factor2                -0.694112
Factor3                 0.399493
Factor4                -0.613952
Factor5                -1.443527
Factor6                -0.688206
Factor7                 0.758949
Factor8                -1.686344
Factor9                 0.179139
Factor10               -1.098354
Factor11               -1.388191
Factor12               -0.174228
Factor13                 1.79318
Factor14                0.448899
Factor15                0.484224
Name: 30, dtype: object

In [205]:
df_users = pd.read_excel('movies_latent_factors.xlsx',
                         sheet_name='Users')
df_users.head()

Unnamed: 0,User,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
0,4768,-0.204024,0.161079,-0.090447,0.138495,-0.162934,0.163894,0.051502,-0.088582,0.126829,0.065967,0.085008,0.355404,0.007108,-0.118663,-0.039125
1,156,-0.189652,-0.178979,-0.09149,-0.000823,-0.032646,0.177209,-0.098123,-0.068283,-0.011575,0.120866,-0.009931,-0.048606,0.045916,0.113671,0.179873
2,5323,-0.115308,-0.090886,-0.053129,0.018472,-0.068081,-0.004828,0.113005,0.102107,0.034758,0.000693,-0.073712,-0.01946,0.108372,0.054471,-0.109552
3,174,-0.227462,-0.272532,-0.017231,0.054324,0.214755,-0.072639,-0.033122,-0.086508,-0.131479,0.180403,0.09589,-0.082396,0.036767,-0.165438,0.050692
4,4529,-0.014616,-0.102218,-0.107935,0.155784,-0.123362,-0.118228,-0.013549,-0.050622,0.058698,-0.1596,-0.142382,-0.132836,-0.039897,0.129063,0.102669


In [187]:
user = df_users.loc[df_users['User'] == 2665].values[0][1:]
movie = df_movies.loc[df_movies['Movie ID'] == 854].values[0][2:]
user.dot(movie)

-0.34049742926373167

In [206]:
user = df_users.loc[df_users['User'] == 2067].values[0][1:]
movie = df_movies.loc[df_movies['Movie ID'] == 278].values[0][2:]
user.dot(movie)

0.7432047201754383

In [207]:
user_pref = df_users[df_users['User'] == 156].iloc[:, -15:].values
item_feat = df_movies.iloc[:, -15:].values.T
df_movies['pred'] = user_pref.dot(item_feat).T
print(df_movies.sort_values(by='pred', axis=0, ascending=False)['Movie ID'].values)

[  603  1891  1892    11   155   604    98   602   329   122   120   453
   105   121   752  1894   197    13  8587    85   180   272   568    12
   745   278  2164  2501   585   862   280   857   640  2502   424   161
   601  8358   597  5503    77   550  9806   954  9802  1422  2024   581
  1572   812   641   607  9331    63   238   629   664  9741   807   808
  1637 36955    38   274   107   243   141   788   462   114 10020    22
   146  1597  1900   134   153   786   275 36657   187   194   955  7443
    14   680 36658   809  8467   268   854  3049   557    24   671   393
   672   414   558  4327]


In [209]:
user_pref = df_users[df_users['User'] == 4469].iloc[:, -15:].values
item_feat = df_movies.iloc[:, -15:].values.T
df_movies['pred'] = user_pref.dot(item_feat).T
print(df_movies.sort_values(by='pred', axis=0, ascending=False)['Movie ID'].values)

[  604   602  1894   155   272  3049   954   597   603    11   955   280
    98  1892  1891  8467   161  2502  2501   197   607  1572   854   329
  1422   180    22  1637 36657   268   752   640  2024   238   453   550
  9802  2164   857 36955   414   557  8587 36658   278   105  8358   558
    24  1597   393   122   664   745   680   812   121   424    77   568
   120   187  5503  9331    85  9741   107   629   274   114   146    63
    13   807   641   581  9806  4327  1900    12   141   862   788 10020
    38   809    14   601   243   134   153   585   786   275   462  7443
   808   194   672   671]
