In [1]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle


In [43]:
# load data
 
data_cf = pd.read_csv('fc_matrix_red200.csv')

In [44]:
data_cf.shape

(40000, 5)

# Creation de folds (modèle collaboratif)

L'objectif de ce notebook est de créer plusieurs folds correspondant à différents trainset et testset.

Ce procesus est indispensable aux choix des paramètres de nos futurs algorithmes de segmentation. En effet, dans le notebook "analyse_cf_model" on utilisera ces folds pour choisir notemment le nombre optimal de clusters (via un apprentisage supervisé. 


Nous choisissons une méthode non exaustive de création de test/trainset. Dans cette méthode nous fixons un nombre de folds, celui-ci nous donne une estimation de la quantité de transactions que l'on souhaite prendre en compte dans chaqu'un des jeux d'entrainements et de test. Autrement dit, on cherche à supprimer des transactions pour simuler des données inconnues. Ces transactions vont être supprimées de manière aléatoire à partir du jeux original pour aboutir au jeux d'entrainement. 
Sur 5 folds une transaction à donc une probabilité de 1/5 d'être supprimée. La colonne "rating" contient le nombre de transaction (user>item) identiques par utilisateur. Il faudra donc pondérer cette probabilité par la valeur contenue dans la colonne "rating". 

Par suite, on s'assurera, dans le notebook (analyse_cf_model) que les utilisateurs contenus dans le trainset et dans le testset sont identiques

On notera que cette manière différe de celle utilisée pour générer des jeux d'entrainement et de test dans le modèle manuel. Cela s'explique par le fait que le modèle collaboratif est basé uniquement sur l'appréciation des produits par les utilisateurs. Les variables utilisateurs sont donc corrélées à la nature des items consommés. On ne tient pas ici compte explicitement des considérations temporelles par exemple ou transactionelles (qui nous contraignaient à sélectioner des commandes et non des transactions dans les jeux des test/entrainement) (voir notebook create_manual_model)


In [34]:
# first, let us define some important functions 


def nonzeros(m, row):
    for index in range(m.indptr[row], m.indptr[row+1]):
        yield m.indices[index], m.data[index]
        
        

def alternating_least_squares_cg(Cui, factors, regularization=0.01, iterations=15):
    users, items = Cui.shape

    # initialize factors randomly
    X = np.random.rand(users, factors) * 0.01
    Y = np.random.rand(items, factors) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()

    for iteration in range(iterations):
        least_squares_cg(Cui, X, Y, regularization)
        least_squares_cg(Ciu, Y, X, regularization)

    return (X, Y)


def least_squares_cg(Cui, X, Y, regularization, cg_steps=3):
    users, factors = X.shape
    YtY = Y.T.dot(Y) + regularization * np.eye(factors)

    for u in range(users):
        # start from previous iteration
        x = X[u]

        # calculate residual r = (YtCuPu - (YtCuY.dot(Xu), without computing YtCuY
        r = -YtY.dot(x)
        for i, confidence in nonzeros(Cui, u):
            r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]

        p = r.copy()
        rsold = r.dot(r)

        for it in range(cg_steps):
            # calculate Ap = YtCuYp - without actually calculating YtCuY
            Ap = YtY.dot(p)
            for i, confidence in nonzeros(Cui, u):
                Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

            # standard CG update
            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap
            rsnew = r.dot(r)
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

In [35]:
def plot_value_counts(col_name,df):       
    
    values_count = pd.DataFrame(df[col_name].dropna().value_counts())    
    values_count.columns = ['count']
    
    # convert the index column into a regular column.
    values_count[col_name] = [ str(i) for i in values_count.index ]
    
    # add a column with the percentage of each data point to the sum of all data points.
    values_count['percent'] = values_count['count'].div(values_count['count'].sum()).multiply(100).round(2)
    
    # change the order of the columns.
    values_count = values_count.reindex([col_name,'count','percent'],axis=1)
    values_count.reset_index(drop=True,inplace=True)
    
    return (values_count)

In [41]:
def traintest_split(df):
    """create the train and test set""" 
    
    # looking for transaction cases
    ind_rating = df.columns.get_loc('rating')
    
    df_trans = df[df.iloc[:,ind_rating]>0] 
    
    # number of different transaction    
    trans_nb = df_trans.shape[0]
    
    # define trainset and testset
    
    testset = df.copy()
    trainset = df.copy()
    
    # get the rating indice 
    ind_rating = df.columns.get_loc('rating')
    
    # sort the dataset / put non zero rating in first
    trainset = trainset.sort_values(by='rating', ascending=False)
    
    # for each non zero rating , for each transaction there is 1/5 probability to delete the transaction
    trainset['iter'] = trainset.iloc[:,ind_rating] 
    
    # get the rating indice 
    ind_iter = trainset.columns.get_loc('iter')    
               
    for i in range (int(np.amax(trainset.iloc[:,ind_rating]))):
        
        # apply the random filter max(trainset.iloc[:,ind_rating]) times 
        
        shape = trainset.iloc[:,ind_rating][trainset.iloc[:,ind_iter]>0].shape[0]
        trainset.iloc[:,ind_rating][trainset.iloc[:,ind_iter]>0] = trainset.iloc[:,ind_rating][
                            trainset.iloc[:,ind_iter]>0] - (np.random.choice(2,(shape), p=[4/5,1/5])).transpose()
        
        # remove one to index iteration columns
        
        trainset.iloc[:,ind_iter] = trainset.iloc[:,ind_iter] - 1    
     
    # different transaction bewteen testset and trains
    
    testset.loc[:,'rating'] = df.loc[:,'rating'] - trainset.loc[:,'rating'] 
    trainset = shuffle(trainset)   
    testset = shuffle(testset)
        
    return (trainset, testset)

In [45]:
def analyse_full(df,folds,reg_param,alpha_val):
    """analyse the validity of our model and give the mean of the performance evaluate by cross validation
       using our metric based on rank"""
    
    # switch datas to choose randomly user ID    
    
    df = shuffle(df)  
    
        
    for k in range (folds): 
        
        print (k)
                
        # ------------------------------------ SHARING DATASET (TRAIN/TEST) --------------------------------------
        
        # computer our train set and test set 
        print ('init traintest_split')
        train_set, test_set = traintest_split(df=df) 
        print ('final traintest_split')
        print (train_set.columns[:])
        print (np.sum(train_set.loc[:,'rating']))
        print (np.sum(test_set.loc[:,'rating']))
        
        
        # ------------------------------------I -- COMPUTING ALGORITHM ON TRAIN SET --------------------------------------
        
        # take the data we need 
        data = train_set[['userID', 'itemID', 'rating']]

        # Drop NaN columns
        data = data.dropna()
        data = data.copy()        

        # Create a numeric user_id and artist_id column
        data['userID'] = data['userID'].astype("category")
        data['itemID'] = data['itemID'].astype("category")
        data['user_id'] = data['userID'].cat.codes
        data['item_id'] = data['itemID'].cat.codes    

        # create two matricies, one for fitting the model (item-user) 
        # and one for recommendations (user-item)
        sparse_item_user = sparse.csr_matrix((data['rating'].astype(float), (data['item_id'], data['user_id'])))
        sparse_user_item = sparse.csr_matrix((data['rating'].astype(float), (data['user_id'], data['item_id'])))

        # Calculate the confidence by multiplying it by our alpha value.
        data_conf = (sparse_item_user * alpha_val).astype('double')

        # Initialize the als model and fit it using the sparse item-user matrix
        X, Y = alternating_least_squares_cg(Cui=data_conf ,factors=15, regularization=reg_param, iterations=20)

        # Get the user and item vectors from our trained model
        user_vecs_train = sparse.csr_matrix(X)
        
        
         # ------------------------------------II -- COMPUTING ALGORITHM ON TEST SET --------------------------------------
        
        # take the data we need 
        data = test_set[['userID', 'itemID', 'rating']]

        # Drop NaN columns
        data = data.dropna()
        data = data.copy()        

        # Create a numeric user_id and artist_id column
        data['userID'] = data['userID'].astype("category")
        data['itemID'] = data['itemID'].astype("category")
        data['user_id'] = data['userID'].cat.codes
        data['item_id'] = data['itemID'].cat.codes    

        # create two matricies, one for fitting the model (item-user) 
        # and one for recommendations (user-item)
        sparse_item_user = sparse.csr_matrix((data['rating'].astype(float), (data['item_id'], data['user_id'])))
        sparse_user_item = sparse.csr_matrix((data['rating'].astype(float), (data['user_id'], data['item_id'])))

        # Calculate the confidence by multiplying it by our alpha value.
        data_conf = (sparse_item_user * alpha_val).astype('double')

        # Initialize the als model and fit it using the sparse item-user matrix
        X, Y = alternating_least_squares_cg(Cui=data_conf ,factors=15, regularization=reg_param, iterations=20)

        # Get the user and item vectors from our trained model
        user_vecs_test = sparse.csr_matrix(X)
        
        # saving datas       
        user_vecs_trtot = np.zeros((user_vecs_train.shape[0],user_vecs_train.shape[1]))
        
        for i in range (user_vecs_train.shape[0]):
            for j in range (user_vecs_train.shape[1]):
                user_vecs_trtot[i,j] = user_vecs_train[i,j]
        d = {'1':user_vecs_trtot[:,0]}   
        
        for i in range (user_vecs_trtot.shape[1]-1):
            d.update({'train{}'.format(i+1):user_vecs_trtot[:,i+1]})        
        
        dff = pd.DataFrame(data=d)
        
        dff.to_csv("user_vecs_train{}.csv".format(k))
        
        user_vecs_tetot = np.zeros((user_vecs_test.shape[0],user_vecs_test.shape[1]))
        
        if (k == 0):
            for i in range (user_vecs_test.shape[0]):
                for j in range (user_vecs_test.shape[1]):
                    user_vecs_tetot[i,j] = user_vecs_test[i,j]
        
        d = {'1':user_vecs_tetot[:,0]}   
        
        for i in range (user_vecs_tetot.shape[1]-1):
            d.update({'train{}'.format(i+1):user_vecs_tetot[:,i+1]})
        
        dff = pd.DataFrame(data=d)
        dff.to_csv("user_vecs_test{}.csv".format(k))    
                
    
  

In [46]:
analyse_full(df=data_cf,folds=5,reg_param=0.1,alpha_val=50)

0
init traintest_split


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


final traintest_split
Index(['Unnamed: 0', 'try', 'itemID', 'userID', 'rating', 'iter'], dtype='object')
33613.0
8379.0
1
init traintest_split
final traintest_split
Index(['Unnamed: 0', 'try', 'itemID', 'userID', 'rating', 'iter'], dtype='object')
33534.0
8458.0
2
init traintest_split
final traintest_split
Index(['Unnamed: 0', 'try', 'itemID', 'userID', 'rating', 'iter'], dtype='object')
33657.0
8335.0
3
init traintest_split
final traintest_split
Index(['Unnamed: 0', 'try', 'itemID', 'userID', 'rating', 'iter'], dtype='object')
33582.0
8410.0
4
init traintest_split
final traintest_split
Index(['Unnamed: 0', 'try', 'itemID', 'userID', 'rating', 'iter'], dtype='object')
33673.0
8319.0
