In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
import recsys.algorithm
recsys.algorithm.VERBOSE = True

In [3]:
from recsys.algorithm.factorize import SVD

### Воспользуемся методом PureSVD из библиотеки

In [4]:
df_pair_user_film_train = pd.read_csv('train_rating.csv')
df_pair_user_film_test = pd.read_csv('test_rating.csv')

In [5]:
df_pair_user_film_train.head()

Unnamed: 0,user_id,element_id,score
0,98304,1101771,9
1,16387,188463,6
2,16387,197307,10
3,16387,197531,9
4,16387,198284,7


In [6]:
svd = SVD()
svd.load_data(filename='train_rating.csv', sep=',', format={'col':0, 'row':1, 'value':2, 'ids': int})

Loading train_rating.csv
Error (ID is not int) while reading: [u'user_id', u'element_id', u'score']



In [23]:
k = 50
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

Creating matrix (14304 tuples)
Matrix density is: 0.0833%
Computing svd k=50, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True


In [24]:
svd.get_matrix()

SparseMatrix (4675 by 3673)
         98304      16387      65541      425990     7        ...
1101771  9.000000      ---        ---        ---        ---
188463      ---     6.000000      ---        ---        ---
197307      ---    10.000000      ---        ---        ---
197531      ---     9.000000      ---        ---        ---
198284      ---     7.000000      ---        ---        ---
190406      ---    10.000000      ---        ---        ---
192013      ---    10.000000      ---        ---        ---
198079      ---     9.000000      ---        ---        ---
194794      ---     5.000000      ---        ---        ---
195531      ---    10.000000      ---        ---        ---
195911      ---     7.000000      ---        ---        ---
218835      ---        ---     7.000000      ---        ---
220040      ---        ---     7.000000      ---        ---
1113431     ---        ---        ---     9.000000      ---
219784      ---        ---        ---     8.000000      ---
220089

In [25]:
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

MIN_RATING = 1.0
MAX_RATING = 10.0

def print_metrics(svd_model, name):
    all_user_id_test = np.unique(df_pair_user_film_test.user_id)
    list_mse = []
    list_SPR = []

    for user_id in all_user_id_test:
        y_test = []
        y_pred = []
        tmp = df_pair_user_film_test[df_pair_user_film_test['user_id'] == user_id]
        
        if (len(tmp.element_id) <= 1):
            continue
            
        for row in tmp.iterrows():
            index, series = row
            u_id, f_id, score = series
            ITEMID = f_id
            USERID = u_id
            pred = svd_model.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
            y_pred.append(pred)
            y_test.append(score)
        MSE = mean_squared_error(y_test, y_pred)
        list_mse.append(MSE)

        SPR = spearmanr(y_test, y_pred) 
        if (np.isnan(SPR[0])):
            y_test.append(1)
            y_pred.append(1)
            SPR = spearmanr(y_test, y_pred) 

        list_SPR.append(SPR[0])

    print name    
    print "MSE:", np.average(list_mse)
    print "Spearman:", np.average(list_SPR)
    print "MAP:", mean_average_precision(svd_model, df_pair_user_film_test)

In [26]:
def average_precision_score(predicted_rating, movieID_test, min_score=5):
    """
    Parameters
    ----------
    predicted_rating : отсортированный по убыванию список пар (movie_ID, pred_score)
    movieID_test : список movie_ID для теста
    min_score : минимальный score, фильм которого считется релевантым
    -------
    average precision
    """
    list_of_position = []
    list_of_scores = []
    k = 1
    for row in predicted_rating:
        movie_id, score = row
        if (movie_id in movieID_test):
            list_of_position.append(k) 
            list_of_scores.append(score)
        k +=1
        
    list_of_scores = np.array(list_of_scores, dtype='float32')
    list_of_position = np.array(list_of_position, dtype='float32')
    list_of_position = list_of_position[list_of_scores >= min_score]   
    
    
    arr = 1 / list_of_position 
    N = float(len(list_of_position)) 
    AP = 1 / N * np.sum(arr)
    
    return AP    

In [27]:
def mean_average_precision(svd_model, data_test):
    all_userIDs = np.unique(data_test.user_id)
    all_filmIDs = np.unique(data_test.element_id)
    list_of_average_precision = []
    
    for user_id in all_userIDs:
        test_user = data_test[data_test['user_id'] == user_id]
        movieID_test = test_user.element_id
        
        recs = svd_model.recommend(user_id, n=5000, is_row=False)
        
        try:
            AP = average_precision_score(recs, list(movieID_test), min_score=0)
        except ZeroDivisionError:
            continue
            
        if (np.isnan(AP)):
            print user_id
        list_of_average_precision.append(AP)
     
    return np.average(list_of_average_precision)    

In [28]:
print_metrics(svd, 'svd')

svd
MSE: 4.30734586116
Spearman: 0.34530441148
MAP: 0.000914731742297
