In [16]:
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import surprise
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from data_helpers import *

# Build the train/test splits consumed by the surprise-based models
# (knn_baseline_movie, knn_baseline_user, svd) from the surprise-formatted CSV.
file_path = "../data/data_surprise.csv"
trainset, testset = build_surprise_data(file_path)

# Load the (user, item) pairs whose ratings must be predicted for the submission.
# Downstream code reads ids[0] as the users and ids[1] as the items.
INPUT_PATH = "../data/sample_submission.csv"
ids = read_csv_sample(INPUT_PATH)

In [18]:
# Load the ratings and split them into train/test for the custom (ALS / SGD) models.
# `train` and `test` are later used through sparse-matrix methods (getnnz, nonzero).
# NOTE(review): p_test is presumably the fraction of ratings held out for testing — confirm in split_data.
train, test = split_data(load_data("../data/data_train.csv"), p_test = 0.1)

number of items: 10000, number of users: 1000


In [61]:
from implementations import *

In [20]:
# Accumulators for the blending step: one prediction vector per model.
# Xtest collects predictions on the held-out test ratings, Xids the predictions
# for the ratings to submit. Re-running this cell resets both lists.
Xtest = []
Xids = []

# Generate predictions with every surprise-based method.
# Each call receives the accumulators and returns them; `rmse` is overwritten by
# every call, so only the last model's value is kept.
# NOTE(review): these helpers presumably append to Xtest / Xids like
# matrix_factorization_als2 does — confirm in implementations.
rmse, Xtest, Xids, preds_test_kbm, preds_ids_kbm = knn_baseline_movie(trainset, testset, ids, Xtest, Xids)
rmse, Xtest, Xids, preds_test_kbu, preds_ids_kbu = knn_baseline_user(trainset, testset, ids, Xtest, Xids)
rmse, Xtest, Xids, preds_test_svd, preds_ids_svd = svd(trainset, testset, ids, Xtest, Xids)

kNN Baseline Movie
   Training RMSE:  0.6241475383015472
   Test RMSE:  0.9908785231084433
kNN Baseline User
   Training RMSE:  0.6641531098089988
   Test RMSE:  0.9947164198559775
SVD
   Training RMSE:  0.9543999695353427
   Test RMSE:  1.0019284327486202


In [69]:
from proj2_helpers import *
def matrix_factorization_als2(train, test, ids, Xtest, Xids,
                              num_features=20, lambda_user=0.08, lambda_item=0.1,
                              stop_criterion=1e-4, seed=988):
    """Matrix factorization trained with Alternating Least Squares (ALS).

    Alternates user-feature and item-feature updates until the training error
    changes by no more than `stop_criterion` between two consecutive sweeps,
    then predicts both the held-out test ratings and the unknown ratings.

    Arguments : train, the trainset (sparse ratings matrix)
                test, the testset (sparse ratings matrix)
                ids, unknown ratings: ids[0] holds the users, ids[1] the items
                Xtest, list of per-model test predictions, to be used for final blending
                Xids, list of per-model predictions for the unknown ratings, to be used for final blending
                num_features, number of latent features (default 20)
                lambda_user, regularization passed to the user update (default 0.08)
                lambda_item, regularization passed to the item update (default 0.1)
                stop_criterion, convergence threshold on the training error change (default 1e-4)
                seed, value given to np.random.seed before initialization (default 988)
    Return : rmse, the test error
             Xtest, the same list with this model's test predictions appended
             Xids, the same list with this model's unknown-rating predictions appended
             preds_test, this model's test predictions, clipped to [1, 5]
             preds_ids, this model's unknown-rating predictions, rounded and clipped to [1, 5]

    Side effects: reseeds NumPy's global RNG, prints progress, and appends to
    the Xtest / Xids lists in place (so calling it twice adds two columns).
    """
    # set seed
    np.random.seed(seed)

    # init ALS
    user_features, item_features = init_MF(train, num_features)

    # get the number of non-zero ratings for each user and item
    nnz_items_per_user, nnz_users_per_item = train.getnnz(axis=0), train.getnnz(axis=1)

    # group the indices by row or column index
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)

    # run ALS
    print("start the ALS algorithm...")
    previous_error = 0
    # Start from +inf so at least one sweep always runs and `error` is bound,
    # whatever stop_criterion the caller passes.
    change = np.inf
    while change > stop_criterion:
        # update user feature & item feature
        user_features = update_user_feature(train, item_features, lambda_user, nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(train, user_features, lambda_item, nnz_users_per_item, nz_item_userindices)

        error = compute_error(train, user_features, item_features, nz_train)

        change = np.fabs(error - previous_error)
        previous_error = error

    print("Training RMSE: {}.".format(error))

    # evaluate the test error
    nnz_row, nnz_col = test.nonzero()
    nnz_test = list(zip(nnz_row, nnz_col))
    rmse = compute_error(test, user_features, item_features, nnz_test)
    print("Test RMSE: {v}.".format(v=rmse))

    predictions_matrix = user_features.T @ item_features

    # Predict unknown ratings (ids are 1-based, hence the -1 offsets).
    # NOTE(review): the [item-1, user-1] lookup depends on the row/column naming
    # used by the helper modules — confirm the orientation against init_MF.
    preds_ids = [
        round(predictions_matrix[item - 1, user - 1])
        for user, item in zip(ids[0], ids[1])
    ]
    preds_ids = np.clip(preds_ids, 1, 5)
    Xids.append(preds_ids)

    # Predict test ratings (known)
    preds_test = compute_predictions(test, user_features, item_features, nnz_test)
    preds_test = np.clip(preds_test, 1, 5)
    Xtest.append(preds_test)
    return rmse, Xtest, Xids, preds_test, preds_ids

In [None]:
# Train the ALS model on the custom split and add its predictions to the blend inputs.
# matrix_factorization_als2 appends to Xtest / Xids in place, so re-running this
# cell without first resetting the lists adds a duplicate ALS column.
rmse, Xtest, Xids, preds_test_als, preds_ids_als = matrix_factorization_als2(train, test, ids, Xtest, Xids)

start the ALS algorithm...


In [None]:
# Train the SGD matrix-factorization model on the custom split.
# NOTE(review): matrix_factorization_sgd is not defined in this notebook; it
# presumably comes from `implementations` and appends to Xtest / Xids like the ALS variant — confirm.
rmse, Xtest, Xids, preds_test_sgd, preds_ids_sgd = matrix_factorization_sgd(train, test, ids, Xtest, Xids)

In [None]:
# Xtest and Xids are plain Python lists (one prediction vector per model), so
# they have no .shape attribute; convert them the same way the blending call does.
print(np.array(Xtest).shape, np.array(Xids).shape)

In [None]:
from sklearn.linear_model import LinearRegression
def blend(preds_test, preds_ids, testset):
    """
    Ridge regression that finds the blending weights of each model
    Argument : preds_test, predicted ratings for the known test set, one row per model
               preds_ids, predicted ratings for the unknown set, one row per model
               testset, the testset, an iterable of 3-tuples whose last element is the true rating
    Return : estimations, the final predictions (clipped to [1, 5], then rounded)
             weights, coefficients associated to each model
    """
    # Imported here so the function works on a fresh kernel: the notebook only
    # imports LinearRegression at the top of this cell, not Ridge.
    from sklearn.linear_model import Ridge

    print('Blending')

    #Known ratings of testset
    y_test = [rating for (_, _, rating) in testset]

    #Ridge Regression
    # NOTE(review): `normalize` was removed from Ridge in recent scikit-learn
    # releases; it is kept so the fitted weights stay the same on the version
    # this notebook was written for — revisit when upgrading.
    linreg = Ridge(alpha=0.1, normalize=True)

    #Fit between predicted and known ratings of testset
    # (inputs are transposed so that each model becomes one feature column)
    linreg.fit(preds_test.T, y_test)
    weights = linreg.coef_

    #Predict unknown ratings
    predictions = np.clip(linreg.predict(preds_ids.T), 1, 5)

    print(weights, end='\n\n')

    #RMSE of regression (computed on the unclipped test-set fit)
    print('Test RMSE: %f' % calculate_rmse(y_test, linreg.predict(preds_test.T)))

    #Rounding-off predictions
    estimations = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        estimations[j] = round(pred)

    return estimations, weights

In [55]:
# Blending: stack the per-model prediction lists into 2-D arrays (one row per
# model) and fit the blending weights against the surprise testset ratings.
# NOTE(review): the recorded output shows 9 weights, more than the models run
# above — Xtest / Xids likely held duplicate columns from re-run cells; reset them before blending.
predictions, weights = blend(np.array(Xtest), np.array(Xids), testset)

Blending
[-1.8606002   0.50975616  0.99641749 -0.63334515  1.14645031  0.74535184
 -0.12279898  0.26282306  0.00458331]

Test RMSE: 0.986415


In [36]:
# Sanity check: number of blended predictions.
# NOTE(review): this should presumably equal the number of rows in the sample submission — confirm.
len(predictions)

1176952

In [37]:
from data_helpers import create_csv_submission

# Write the blended predictions for the requested (user, item) pairs to the submission file.
OUTPUT_PATH = "../data/submission.csv"
create_csv_submission(ids, predictions, OUTPUT_PATH)
print("File submission.csv ready to be submitted !")

File submission.csv ready to be submitted !
