In [3]:
!pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/albanda/CE888-2023/main/lab4-recommenders/jester-data-1.csv', header=None)
df = df.drop(0, axis=1)
df = df.replace(99, np.NaN)

In [5]:
def replace(orig, percentage=0.1):
    """
    Replaces 'percentage'% of the original values in 'orig' with 99's
    :param orig: original data array
    :param percentage: percentage of values to replace (0<percentage<1)
    """
    new_data = orig.copy()
    rated = np.where(orig!=99)
    n_rated = len(rated[0])
    idx = np.random.choice(n_rated, size=int(percentage*n_rated), replace=False)
    new_data[rated[0][idx], rated[1][idx]] = 99
    return new_data, (rated[0][idx], rated[1][idx])

In [6]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,,,,,,-5.63,,,
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,,,,,9.03,9.27,9.03,9.27,,,...,,,,9.08,,,,,,
3,,8.35,,,1.8,8.16,-2.82,6.21,,1.84,...,,,,0.53,,,,,,
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [7]:
df.iloc[14388, 62]

-8.25

In [8]:
df.iloc[4209, 66]

-7.62

In [9]:
print(f"Num Users: {len(df)}, Num Jokes: {len(df.columns.tolist())}")

Num Users: 24983, Num Jokes: 100


In [10]:
worst_joke_id = df.mean(axis=0).idxmin()
print(f"Worst Rated Joke: {worst_joke_id}, Avg Rating: {df.mean(axis=0)[worst_joke_id]}")

Worst Rated Joke: 58, Avg Rating: -3.8338796373689963


In [11]:
best_joke_id = df.mean(axis=0).idxmax()
print(f"Best Rated Joke: {best_joke_id}, Avg Rating: {df.mean(axis=0)[best_joke_id]}")

Best Rated Joke: 50, Avg Rating: 3.6650848950824924


In [12]:
training_set, validation_idxes = replace(df.values)

In [13]:
n_latent_factors = 2

user_ratings = training_set.copy()
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))

def predict_rating(user_id, item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)

def train(user_id, item_id, rating, alpha=0.0001):
    #print(item_id)
    predicted_rating = predict_rating(user_id, item_id)
    err =  predicted_rating - rating
    #print(err)
    user_pref_values = latent_user_preferences[user_id]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err

def sgd(iterations):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    mse_history = []
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id, item_id]
                if not np.isnan(rating):
                    err = train(user_id, item_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        if (iteration % 10000) == 0:
            print('Iteration %d/%d:\tMSE=%.6f' % (iteration, iterations, mse))
            mse_history.append(mse)
    return mse_history

hist = sgd(100)

Iteration 0/100:	MSE=1182.775529


KeyboardInterrupt: 

In [14]:
import pandas as pd
import numpy as np

items = pd.read_excel('../../../CE888-2023/lab4-recommenders/movies_latent_factors.xlsx',  sheet_name='Items')
users = pd.read_excel('../../../CE888-2023/lab4-recommenders/movies_latent_factors.xlsx',  sheet_name='Users')

In [15]:
users

Unnamed: 0,User,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
0,4768,-0.204024,0.161079,-0.090447,0.138495,-0.162934,0.163894,0.051502,-0.088582,0.126829,0.065967,0.085008,0.355404,0.007108,-0.118663,-0.039125
1,156,-0.189652,-0.178979,-0.09149,-0.000823,-0.032646,0.177209,-0.098123,-0.068283,-0.011575,0.120866,-0.009931,-0.048606,0.045916,0.113671,0.179873
2,5323,-0.115308,-0.090886,-0.053129,0.018472,-0.068081,-0.004828,0.113005,0.102107,0.034758,0.000693,-0.073712,-0.01946,0.108372,0.054471,-0.109552
3,174,-0.227462,-0.272532,-0.017231,0.054324,0.214755,-0.072639,-0.033122,-0.086508,-0.131479,0.180403,0.09589,-0.082396,0.036767,-0.165438,0.050692
4,4529,-0.014616,-0.102218,-0.107935,0.155784,-0.123362,-0.118228,-0.013549,-0.050622,0.058698,-0.1596,-0.142382,-0.132836,-0.039897,0.129063,0.102669
5,783,-0.020301,-0.031919,-0.036955,0.03369,0.000174,-0.003178,0.054474,0.045424,-0.053308,0.003437,-0.04175,0.059725,-0.021647,0.039873,-0.061857
6,3878,-0.091462,0.215879,-0.180453,0.085408,-0.321094,0.227947,0.053767,0.038017,0.097141,-0.139872,0.029253,0.214467,0.18925,0.06599,0.102776
7,768,0.000819,-0.009229,-0.019228,0.002703,0.012869,0.006655,0.007687,0.018747,0.014705,-0.009256,-0.010116,-0.010051,-0.002108,0.043223,-0.00667
8,4469,-0.030528,-0.011537,-0.042822,-0.014378,0.031338,0.012297,-3.8e-05,-0.010264,-0.006781,-0.01995,-0.014435,0.010926,0.0244,0.005318,-0.007983
9,1882,-0.083093,-0.02916,0.013748,0.022716,-0.062732,-0.08067,-0.019723,0.013253,-0.091305,-0.090687,0.030638,-0.113364,-0.021325,0.181508,0.022357


In [None]:
items

In [None]:
df.isna().sum()

In [16]:
items[items['Movie ID'] == 105]['Factor4']

11    0.169149
Name: Factor4, dtype: float64

In [17]:
items.iloc[items['Factor2'].idxmax()]

Movie ID                        14
Title       American Beauty (1999)
Factor1                  -0.634531
Factor2                   2.186059
Factor3                  -0.066681
Factor4                   0.086197
Factor5                   0.517558
Factor6                  -0.185319
Factor7                  -0.412352
Factor8                   0.063841
Factor9                   0.075937
Factor10                 -0.577682
Factor11                  0.526803
Factor12                 -1.465557
Factor13                 -0.819682
Factor14                   0.54901
Factor15                 -0.681191
Name: 3, dtype: object

In [25]:
def pred(user_id, movie_id):
    user_arr = users[users['User'] == user_id].iloc[:, 1:].values[0]
    item_arr = items[items['Movie ID'] == movie_id].iloc[:,2:].values[0]
    return user_arr.dot(item_arr)

def recommend(user_id, n_items=2):
    user_arr = users[users['User'] == user_id].iloc[:, 1:].values[0]
    results = []
    for movie_id in items['Movie ID'].values:
        item_arr = items[items['Movie ID'] == movie_id].iloc[:,2:].values[0]
        results.append({"Movie ID": movie_id, "Rating": user_arr.dot(item_arr)})
        
    return sorted(results, key=lambda x: x["Rating"], reverse=True)[:n_items]

In [22]:
pred(4373, 1900)

-0.14030190576082296

In [23]:
pred(3878, 122)

-1.9801680405720616

In [21]:
recommend(4469)

[{'Movie ID': 278, 'Rating': 0.20767971108623426},
 {'Movie ID': 453, 'Rating': 0.18328561173816088},
 {'Movie ID': 98, 'Rating': 0.1736114839274811}]

In [26]:
recommend(4529)

[{'Movie ID': 597, 'Rating': 1.6259524690253646},
 {'Movie ID': 12, 'Rating': 0.8660518849057814}]