In [73]:
import json
import numpy as np

from src.models.feature_vectors import FeaturesVector

FeaturesVector.generate_features_vector()
feature_vectors = FeaturesVector.FEATURES_VECTOR
features_vector = np.array(list(feature_vectors.values()))

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split

ratings = pd.read_csv('user_cluster_ratings.csv')
rating_train_base = pd.DataFrame(columns=ratings.columns)
rating_test_base = pd.DataFrame(columns=ratings.columns)


# Group the data by user_id and split each group
for user_id, group in ratings.groupby('user_id'):
    train, test = train_test_split(group, test_size=0.2, random_state=42)
    rating_train_base = pd.concat([rating_train_base, train])
    rating_test_base = pd.concat([rating_test_base, test])


rating_train_base = rating_train_base.sort_values(by='user_id').reset_index(drop=True)
rating_test_base = rating_test_base.sort_values(by='user_id').reset_index(drop=True)
rating_train_base.to_csv('user_cluster_ratings_train.csv', index=False)
rating_test_base.to_csv('user_cluster_ratings_test.csv', index=False)


rating_matrix = ratings.values

# Convert to numpy arrays if needed
rating_train = rating_train_base.values
rating_test = rating_test_base.values

print("Training set size:", rating_train.shape)
print("Testing set size:", rating_test.shape)

Training set size: (9397, 3)
Testing set size: (2496, 3)


In [75]:
rating_train

array([[1, 106, 2.145708954130928],
       [1, 67, 1.7840255935133356],
       [1, 62, 1.0034473250624316],
       ...,
       [300, 124, 3.845816746228639],
       [300, 12, 4.044453104959272],
       [300, 116, 1.0729587552495312]], dtype=object)

In [76]:
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # while index in python starts from 0
    ids = np.where(y == user_id)[0] 
    item_ids = rate_matrix[ids, 1] # index starts from 0 
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

get_items_rated_by_user(rating_matrix, 253)

(array([ 83.,  45.,   7., 108.,  56., 113.,  50., 105.,   8.,   4.,  98.,
         65.,  87., 110.,  77.,  54.,  12.,  41.,  69.,  57.,  46.,  91.,
         35.,  40.,  68.,   3., 100.,  84.,  75.,  80.,   6.,   9., 116.,
         17.,   2.,  27., 111.,  16., 115., 118.,  31.,  10.,  29.,  82.,
         33.,  21., 122.,  60.]),
 array([1.06, 4.4 , 4.25, 1.85, 1.83, 2.88, 1.25, 3.64, 1.01, 2.58, 1.21,
        3.16, 1.45, 1.77, 3.08, 1.6 , 2.69, 1.42, 1.77, 4.57, 3.  , 2.27,
        1.8 , 4.07, 1.6 , 3.25, 1.06, 2.88, 3.21, 1.92, 2.27, 1.1 , 2.01,
        2.  , 3.8 , 3.22, 2.12, 3.36, 4.03, 3.57, 4.19, 3.82, 3.48, 4.59,
        3.46, 2.38, 2.35, 3.86]))

In [77]:
n_users = np.max(rating_train[:, 0])
n_users = int(n_users)

In [78]:
features_vector.shape[1]

3

In [None]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = features_vector.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rating_train, n + 1)
    ids = list(map(int, ids))
    clf = Ridge(alpha=0.01, fit_intercept = True)
    Xhat = features_vector[ids, :]

    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [81]:
Yhat = features_vector.dot(W) + b

In [82]:
Yhat

array([[2.94, 3.72, 3.34, ..., 2.66, 2.66, 2.25],
       [3.08, 1.81, 2.59, ..., 3.89, 3.44, 3.11],
       [3.3 , 3.24, 2.86, ..., 2.25, 2.51, 2.9 ],
       ...,
       [3.35, 2.98, 2.73, ..., 2.35, 2.59, 3.06],
       [3.06, 2.45, 3.01, ..., 3.25, 2.87, 2.31],
       [3.29, 2.18, 2.27, ..., 3.33, 3.41, 3.97]])

In [87]:
len(Yhat)

126

In [84]:
rating_test_base

Unnamed: 0,user_id,cluster,rating
0,1,96,4.457969
1,1,33,2.364088
2,1,99,2.005402
3,1,119,3.789556
4,1,3,4.028167
...,...,...,...
2491,300,91,2.377070
2492,300,51,1.810235
2493,300,90,4.757126
2494,300,95,4.294258


In [85]:
len(rating_test)

2496

In [90]:
n = 125
np.set_printoptions(precision=2)
ids, scores = get_items_rated_by_user(rating_test, n)
ids = ids.astype(int)

Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 89  83  73 104 122   9  44  14 117  56]
True ratings     : [2.877488783326375 1.4172240957366808 1.370292651508085 3.729834321710912
 3.23736525097096 4.622849974471514 3.221629649939767 4.224535738464054
 1.4817585349374085 4.7873752376732845]
Predicted ratings: [3.29 2.97 3.07 3.17 3.2  3.2  2.83 2.81 3.12 3.48]


In [91]:
import math

def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return math.sqrt(se/cnt)

print('RMSE for training:', evaluate(Yhat, rating_train, W, b))
print('RMSE for test    :', evaluate(Yhat, rating_test, W, b))
# print 'RMSE for training:', evaluate(Yhat, rate_train, W, b)
# print 'RMSE for test    :', evaluate(Yhat, rate_test, W, b)

IndexError: arrays used as indices must be of integer (or boolean) type