In [1]:
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import surprise
%load_ext autoreload

In [2]:
from data_helpers import *

# Build the train/test sets consumed by the surprise-based models.
# build_surprise_data comes from data_helpers (star import above); how it
# splits the file is not visible here -- TODO confirm split ratio / seed.
file_path = "../data/data_surprise.csv"
trainset, testset = build_surprise_data(file_path)

# Load the entries of the sample submission, i.e. the ratings to predict.
# NOTE(review): presumably a list of (user, item) identifiers -- confirm in data_helpers.
INPUT_PATH = "../data/sample_submission.csv"
ids = read_csv_sample(INPUT_PATH)

In [3]:
from implementations import *

In [None]:
# Base models to blend, in the order their predictions are stacked.
MODELS = (
    baseline_only,
    knn_baseline_movie,
    knn_baseline_user,
    svd,
    slopeone,
    co_clustering,
)

# Start from empty accumulators so that re-running this cell is idempotent.
Xtest, Xids = [], []

# RMSE returned by each model, keyed by function name (previously every call
# overwrote `rmse`, so only the last model's value was kept).
rmse_by_model = {}

# Generate predictions with every method. Each model receives the current
# accumulators and returns them updated (presumably with one extra entry of
# predictions for the test set / the submission ids -- see implementations).
for model in MODELS:
    rmse, Xtest, Xids = model(trainset, testset, ids, Xtest, Xids)
    rmse_by_model[model.__name__] = rmse

Baseline Only
   Training RMSE:  0.9892005650964464
   Test RMSE:  1.0001597472752757
kNN Baseline Movie


In [None]:
def blend(preds_test, preds_ids, testset):
    """
    Ridge regression that finds the optimal weights of each model
    Argument : preds_test, predicted ratings for the known test set, one row
                   per model (2-D array-like: list of lists, ndarray or np.matrix)
               preds_ids, predicted ratings for the unknown set, one row per
                   model (same accepted types as preds_test)
               testset, iterable of (user, item, rating) triples
    Return : estimations, the final predictions, clipped to [1, 5] and rounded
                 to the nearest integer (float array)
             weights, coefficients associated to each model
    Raises : ValueError, if an input is not 2-D or the number of test
                 predictions does not match the number of known ratings

    Note : the printed RMSE is computed on the same ratings the regression was
           fitted on, so it is an optimistic (in-sample) estimate.
    """
    print('Blending')

    #Known ratings of testset
    y_test = [rating for (_, _, rating) in testset]

    #Samples-by-models design matrices as plain ndarrays: np.matrix operands
    #are rejected by recent scikit-learn releases, and lists have no .T
    features_test = np.asarray(preds_test).T
    features_ids = np.asarray(preds_ids).T

    if features_test.ndim != 2 or features_ids.ndim != 2:
        raise ValueError('preds_test and preds_ids must be 2-D (one row per model)')
    if features_test.shape[0] != len(y_test):
        raise ValueError(
            'preds_test holds %d predictions per model but testset has %d ratings'
            % (features_test.shape[0], len(y_test))
        )

    #Ridge Regression
    linreg = Ridge(alpha=0.1, fit_intercept=True)

    #Fit between predicted and known ratings of testset
    linreg.fit(features_test, y_test)
    weights = linreg.coef_

    #Predict unknown ratings, constrained to the valid rating range
    predictions = np.clip(linreg.predict(features_ids), 1, 5)

    print(weights, end='\n\n')

    #RMSE of regression (in-sample, on the unclipped fit)
    print('Test RMSE: %f' % calculate_rmse(y_test, linreg.predict(features_test)))

    #Rounding-off predictions; np.rint rounds halves to even, exactly like
    #the built-in round() applied element by element
    estimations = np.rint(predictions).astype(np.float64)

    return estimations, weights

In [None]:
#Blending: stack the per-model predictions into plain 2-D ndarrays
#(np.matrix is discouraged by NumPy and rejected by recent scikit-learn)
predictions, weights = blend(np.asarray(Xtest), np.asarray(Xids), testset)

In [None]:
# Sanity check: display how many blended predictions were produced
len(predictions)

In [None]:
from data_helpers import create_csv_submission

# Write the blended predictions for the entries in `ids` to the submission file
OUTPUT_PATH = "../data/submission.csv"
create_csv_submission(ids, predictions, OUTPUT_PATH)
print("File submission.csv ready to be submitted !")