In [6]:
import numpy as np
import pandas as pd
from random import randrange
from scipy.stats.stats import pearsonr
from sklearn.decomposition import PCA
import math

data = pd.read_csv('../Research Project/namesR.csv', low_memory=False)
users = pd.Series(data.user).unique()
items = pd.Series(data.item).unique()
items = np.sort(items)
openness = []
conscientiousness = []
extraversion = []
agreeableness = []
neuroticism = []

for user in users:
    dfuser = data.loc[data['user'] == user]
    openness.append(dfuser.iloc[0]['openness'])
    conscientiousness.append(dfuser.iloc[0]['conscientiousness'])
    extraversion.append(dfuser.iloc[0]['extraversion'])
    agreeableness.append(dfuser.iloc[0]['agreeableness'])
    neuroticism.append(dfuser.iloc[0]['neuroticism'])

df_user_personality = pd.DataFrame({ 'user' : users,
                                     'openness' : openness,
                                     'conscientiousness' : conscientiousness,
                                     'extraversion' : extraversion,
                                     'agreeableness' : agreeableness,
                                     'neuroticism' : neuroticism })


df_user_personality.head()

Unnamed: 0,agreeableness,conscientiousness,extraversion,neuroticism,openness,user
0,3,3,5,4,6,1
1,5,6,6,4,5,2
2,4,5,4,3,4,3
3,5,4,3,5,5,4
4,4,4,5,4,4,5


In [7]:
col_names =  list(data)
users = df_user_personality.index + 1
sample0 = pd.DataFrame(columns = col_names)
sample1 = pd.DataFrame(columns = col_names)
sample2 = pd.DataFrame(columns = col_names)
sample3 = pd.DataFrame(columns = col_names)
sample4 = pd.DataFrame(columns = col_names)

for user in users:
    df_user = data.loc[data['user'] == user]
    s0 = np.array_split(df_user, 5)[0]
    s1 = np.array_split(df_user, 5)[1]
    s2 = np.array_split(df_user, 5)[2]
    s3 = np.array_split(df_user, 5)[3]
    s4 = np.array_split(df_user, 5)[4]
    sample0 = pd.concat([sample0, s0])
    sample1 = pd.concat([sample1, s1])
    sample2 = pd.concat([sample2, s2])
    sample3 = pd.concat([sample3, s3])
    sample4 = pd.concat([sample4, s4])

In [8]:
def matrix_factorization(R, U, I, P, K, rating_mean, user_bias, item_bias, steps=5000, alpha=0.0002, beta=0.02):
    for step in range(steps):
        for i in range(len(R)):
            #print('User')
            #print(i)
            for j in range(len(R[i])):
                #print('Item')
                #print(j)
                if R[i][j] > 0:
                    #print('NEW ITEM')
                    ag = df_user_personality['agreeableness'][i] - 1
                    cn = 6 + df_user_personality['conscientiousness'][i]
                    ex = 13 + df_user_personality['extraversion'][i]
                    nr = 20 + df_user_personality['neuroticism'][i]
                    op = 27 + df_user_personality['openness'][i]
                    

                    p1 = P[ag, :]
                    p2 = P[cn, :]
                    p3 = P[ex, :]
                    p4 = P[nr, :]
                    p5 = P[op, :]
                    
                    sP =  p1 + p2 + p3 + p4 + p5
                    rS = U[i] + sP
                        #print('RATING')
                        #print(R[i][j])
                        #print('PREDICTION')
                        #print(rating_mean + user_bias[0][i] + item_bias[0][j] + np.dot(I[j], rS))
                    eij = R[i][j] - (rating_mean + user_bias[i] + item_bias[j] + np.dot(I[j], rS))
                        #print('ERROR')
                        #print(eij)
                    item_bias[j] = item_bias[j] + alpha * (2 * eij - beta * item_bias[j]) 
                    user_bias[i] = user_bias[i] + alpha * (2 * eij - beta * user_bias[i]) 
                    for k in range(K):
                        U[i][k] = U[i][k] + alpha * (2 * eij * I[j][k] - beta * U[i][k])
                        I[j][k] = I[j][k] + alpha * (2 * eij * (U[i][k] + sP[k]) - beta * I[j][k])
                        P[ag][k] = P[ag][k] + alpha*(2*eij*I[j][k] - beta * P[ag][k])
                        P[cn][k] = P[cn][k] + alpha*(2*eij*I[j][k] - beta * P[cn][k])
                        P[ex][k] = P[ex][k] + alpha*(2*eij*I[j][k] - beta * P[ex][k])
                        P[nr][k] = P[nr][k] + alpha*(2*eij*I[j][k] - beta * P[nr][k])
                        P[op][k] = P[op][k] + alpha*(2*eij*I[j][k] - beta * P[op][k])
    
    return U, I.T, P


In [28]:
def metrics(rated_items_test, predictions):

    precisionV = 0
    totalP = 0

    recallV = 0
    totalR = 0
    
    ndcgV = 0
    totalN = len(users)

    nrec = 10

    for user in users:

        itemsR = rated_items_test.T.nlargest(nrec, user)[0:10][user].index.tolist()
        itemsP = predictions.T.nlargest(10, user)[0:10][user].index.tolist()
    
        totalP = totalP + len(itemsR)
        
        score = []
        #print(ratingsP)
        for item in itemsP:
            meanrating = np.mean(predictions_noP[item])
            score.append(meanrating)
            #print(score)
        a = np.asfarray(score)
        b = np.sort(a)
        b = np.asfarray(b[::-1])
        #print(b)
        t =  a[0] + np.sum(a[1:] / np.log2(np.arange(2, a.size + 1)))
        t2 =  b[0] + np.sum(b[1:] / np.log2(np.arange(2, b.size + 1)))
        
        #print(t/t2)
        
        ndcgV = ndcgV + (t/t2)
        
        df = predictions.copy().T
    
        ratingsR = rated_items_test.T.nlargest(nrec, user)[0:10][user].tolist()
        ratingsP = df.nlargest(10, user)[0:10][user].tolist()
    
        for item in itemsR:
            if item in itemsP:
                precisionV = precisionV + 1
            if(ratingsR[itemsR.index(item)]>3):
                totalR = totalR + 1
                if(item in itemsP):
                    if(ratingsP [itemsP.index(item)]>3.5):
                        recallV = recallV+1 
                        
    T = np.array(rated_items_test.astype(float))
    R = np.array(predictions_noP.astype(float))                    
    error = 0
    totalE = 0
    for i in range(len(T)):
        for j in range(len(T[i])):
            if(T[i][j]>0):
                totalE = totalE + 1
                sqe = (T[i][j] - R[i][j])
                error = sqe*sqe
    
    
    return (precisionV/totalP)*100, (recallV/totalR)*100, math.sqrt(error/totalE), ndcgV/totalN

In [16]:
precision = np.empty([1,5])
recall = np.empty([1,5])
ndcg = np.empty([1,5])
error = np.empty([1,5])

precision_noP = np.empty([1,5])
recall_noP = np.empty([1,5])
ndcg_noP = np.empty([1,5])
error_noP = np.empty([1,5])

In [31]:
for iteration in range(5):
    print('ITERATION')
    print(iteration)
    if(iteration == 0):
        train = pd.concat([sample0, sample1, sample2, sample3])
        test = sample4
    elif(iteration == 1):
        train = pd.concat([sample1, sample2, sample3, sample4])
        test = sample0
    elif(iteration == 2):
        train = pd.concat([sample2, sample3, sample4, sample0])
        test = sample1        
    elif(iteration == 3):
        train = pd.concat([sample3, sample4, sample0, sample1])
        test = sample2
    else:
        train = pd.concat([sample4, sample0, sample1, sample2])
        test = sample3
    
    rating_mean = train['rating'].mean()
    rated_items = pd.DataFrame(index=users, columns=items, dtype='float')

    rated_items_test = pd.DataFrame(index=users, columns=items, dtype='float')

    for user in users:
        df_user_rat = train.loc[train['user'] == user]
        items_rat = df_user_rat['item'].tolist()
        rating = df_user_rat['rating'].tolist()
        
        df_user_rat_t = test.loc[test['user'] == user]
        items_rat_t = df_user_rat_t['item'].tolist()
        rating_t = df_user_rat_t['rating'].tolist()
        
        i = 0
        for item in items_rat:
            rated_items.loc[user][item] = rating[i]
            i = i + 1
        i = 0
        for item in items_rat_t:
            rated_items_test.loc[user][item] = rating_t[i]
            i = i + 1

    R = np.array(rated_items.astype(float))

    N = len(R)
    M = len(R[0])
    K = 10
    Npersonalities = len(df_user_personality.columns)-1
    MaxV = max(df_user_personality.loc[:, df_user_personality.columns != 'user'].max())

    U = np.random.rand(N,K)
    I = np.random.rand(M,K)
    P = np.random.rand(Npersonalities*MaxV,K)

    user_bias = np.random.rand(N,1)
    item_bias = np.random.rand(M,1)

    U, I, P = matrix_factorization(R, U, I, P, K, rating_mean, user_bias, item_bias, steps=120, alpha= 0.0001, beta= 0.02)
    for i in range(len(R)):
        for j in range(len(R[i])):
            ag = df_user_personality['agreeableness'][i] - 1
            cn = 6 + df_user_personality['conscientiousness'][i]
            ex = 13 + df_user_personality['extraversion'][i]
            nr = 20 + df_user_personality['neuroticism'][i]
            op = 27 + df_user_personality['openness'][i]
                    
            p1 = P[ag, :]
            p2 = P[cn, :]
            p3 = P[ex, :]
            p4 = P[nr, :]
            p5 = P[op, :]
            
            sP =  p1 + p2 + p3 + p4 + p5
            rS = U[i] + sP 
            
            R[i][j] = rating_mean + user_bias[i] + item_bias[j] + np.dot(I.T[j], rS)
            
    predictions = pd.DataFrame(index=users, columns=items, data = R)
    p, r, e, n = metrics(rated_items_test, predictions)
    #precision[0][iteration] = p
    #recall[0][iteration] = r
    precision[0][iteration] = p
    recall[0][iteration] = r
    ndcg[0][iteration] = ndcg
    error[0][iteration] = e
    print(p)
    print(r)
    print(e)
    print(n)

ITERATION
0
13.069908814589665
32.142857142857146
0.1428436467132617
0.9269330515299227
ITERATION
1
17.52136752136752
17.798594847775178
0.01013557510933993
0.9691225257211624
ITERATION
2
13.138686131386862
20.171673819742487
0.0781438864376938
0.9699034615788046
ITERATION
3
21.179624664879356
26.180257510729614
0.14497010980022543
0.9724272147445174
ITERATION
4
17.81609195402299
23.626373626373624
0.08492318929578435
0.9554515651947633


## Some Metrics already recovered

In [60]:
precision1 = [15.19756838905775, 16.23931623931624, 12.652068126520682, 12.332439678284182, 14.367816091954023]
recall1 = [10.714285714285714, 13.817330210772832, 12.446351931330472, 12.446351931330472, 11.538461538461538]

precision2 = [11.854103343465045, 18.58974358974359, 12.652068126520682, 14.209115281501342, 14.367816091954023]
recall2 = [42.857142857142854, 19.20374707259953, 18.4549356223176, 17.59656652360515, 19.78021978021978]

precision3 = [9.72644376899696, 21.581196581196583, 14.5985401459854, 16.0857908847185, 16.379310344827587]
recall3 = [21.428571428571427, 20.843091334894616, 20.600858369098713, 18.4549356223176, 23.076923076923077]

print('Alpha = 0.0001, Beta = 0.002, Nsteps = 100')
print(np.mean(precision))
print(np.mean(recall))
print()
print('Alpha = 0.00001, Beta = 0.002, Nsteps = 120')
print(np.mean(precision1))
print(np.mean(recall1))
print()
print('Alpha = 0.001, Beta = 0.002, Nsteps = 120')
print(np.mean(precision2))
print(np.mean(recall2))
print()
print('Alpha = 0.0001, Beta = 0.02, Nsteps = 120')
print(np.mean(precision3))
print(np.mean(recall3))

Alpha = 0.0001, Beta = 0.002, Nsteps = 100
15.369752622092417
20.985527875347053

Alpha = 0.00001, Beta = 0.002, Nsteps = 120
14.157841705026575
12.192556265236204

Alpha = 0.001, Beta = 0.002, Nsteps = 120
14.334569286636938
23.578522371176984

Alpha = 0.0001, Beta = 0.02, Nsteps = 120
15.674256345145006
20.880875966361085


In [27]:
rmse1 = np.mean([0.10561486289827865, 0.04494817358731171, 0.13356429305014042, 0.1372853910742225, 0.14515154363225244])
rmse2 = np.mean([0.10561486289827865, 0.01828975662850862, 0.05786220670586738, 0.13794586057104033, 0.07744882441543859])
rmse3 = np.mean([0.10561486289827865, 0.04494817358731171, 0.13356429305014042, 0.1372853910742225, 0.14515154363225244])
rmse4 = np.mean([0.10561486289827865, 0.04494817358731171, 0.11128379431129118, 0.13356429305014042, 0.14515154363225244])

print(rmse1)
print(rmse2)
print(rmse3)
print(rmse4)


0.11331285284844114
0.07943230224382672
0.11331285284844114
0.10811253349585488


In [11]:
def matrix_factorization_noP(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
    return P, Q.T

In [30]:

for iteration in range(5):
    print('ITERATION')
    print(iteration)
    if(iteration == 0):
        train = pd.concat([sample0, sample1, sample2, sample3])
        test = sample4
    elif(iteration == 1):
        train = pd.concat([sample1, sample2, sample3, sample4])
        test = sample0
    elif(iteration == 2):
        train = pd.concat([sample2, sample3, sample4, sample0])
        test = sample1        
    elif(iteration == 3):
        train = pd.concat([sample3, sample4, sample0, sample1])
        test = sample2
    else:
        train = pd.concat([sample4, sample0, sample1, sample2])
        test = sample3
    
    rating_mean = train['rating'].mean()
    rated_items = pd.DataFrame(index=users, columns=items, dtype='float')

    rated_items_test = pd.DataFrame(index=users, columns=items, dtype='float')

    for user in users:
        df_user_rat = train.loc[train['user'] == user]
        items_rat = df_user_rat['item'].tolist()
        rating = df_user_rat['rating'].tolist()
        
        df_user_rat_t = test.loc[test['user'] == user]
        items_rat_t = df_user_rat_t['item'].tolist()
        rating_t = df_user_rat_t['rating'].tolist()
        
        i = 0
        for item in items_rat:
            rated_items.loc[user][item] = rating[i]
            i = i + 1
        i = 0
        for item in items_rat_t:
            rated_items_test.loc[user][item] = rating_t[i]
            i = i + 1

    R = np.array(rated_items.astype(float))

    N = len(R)
    M = len(R[0])
    K = 10

    P = np.random.rand(N,K)
    Q = np.random.rand(M,K)

    nP, nQ = matrix_factorization_noP(R, P, Q, K, steps = 120, alpha= 0.0001, beta= 0.002)

    nR = np.dot(nP, nQ.T)
    predictions_noP = pd.DataFrame(index=users, columns=items, data = nR)
    p, r, e, n = metrics(rated_items_test, predictions_noP)
    #precision_noP[0][iteration] = p
    #recall_noP[0][iteration] = r
    precision_noP[0][iteration] = p
    recall_noP[0][iteration] = r
    ndcg_noP[0][iteration] = ndcg
    error_noP[0][iteration] = e
    print(p)
    print(r)
    print(e)
    print(n)

ITERATION
0
10.94224924012158
35.714285714285715
0.08933182934422997
0.9903458107733517
ITERATION
1
23.717948717948715
17.330210772833723
0.01723996036236408
0.9923798207278646
ITERATION
2
17.27493917274939
20.171673819742487
0.041158059542871495
0.9891625028916264
ITERATION
3
28.686327077747993
29.613733905579398
0.10408570601249842
0.9921012201091094
ITERATION
4
22.126436781609197
28.021978021978022
0.08492318929578435
0.9916116161760299


## Some Metrics already recovered

In [61]:
precision_noP1 = [8.51063829787234, 10.683760683760683, 12.165450121654501, 11.26005361930295, 14.367816091954023]
recall_noP1 = [14.285714285714285, 9.367681498829041, 13.733905579399142, 12.875536480686694, 11.538461538461538]


precision_noP2 = [1.82370820668693, 22.00854700854701, 11.435523114355231, 15.013404825737265, 14.367816091954023]
recall_noP2 = [21.428571428571427, 23.88758782201405, 20.171673819742487, 23.605150214592275, 19.78021978021978]

precision_noP3 = [4.25531914893617, 26.495726495726498, 11.678832116788321, 15.549597855227882, 16.379310344827587]
recall_noP3 = [17.857142857142858, 27.166276346604217, 19.313304721030043, 22.317596566523605, 23.076923076923077]

print('Alpha = 0.0001, Beta = 0.002, Nsteps = 100')
print(np.mean(precision_noP))
print(np.mean(recall_noP))
print()
print('Alpha = 0.00001, Beta = 0.002, Nsteps = 120')
print(np.mean(precision_noP1))
print(np.mean(recall_noP1))
print()
print('Alpha = 0.001, Beta = 0.002, Nsteps = 120')
print(np.mean(precision_noP2))
print(np.mean(recall_noP2))
print()
print('Alpha = 0.0001, Beta = 0.02, Nsteps = 120')
print(np.mean(precision_noP3))
print(np.mean(recall_noP3))

Alpha = 0.0001, Beta = 0.002, Nsteps = 100
15.658699240722859
23.44954665400736

Alpha = 0.00001, Beta = 0.002, Nsteps = 120
11.3975437629089
12.36025987661814

Alpha = 0.001, Beta = 0.002, Nsteps = 120
12.929799849456092
21.774640613028005

Alpha = 0.0001, Beta = 0.02, Nsteps = 120
14.871757192301292
21.94624871364476


In [23]:
rmse4 = np.mean([0.1075277224293865, 0.05504581218752367, 0.08517936131166928, 0.08972573445488866, 0.10551757712725705])
rmse3 = np.mean([0.1279836449850023, 0.01828975662850862, 0.05786220670586738, 0.13794586057104033, 0.07744882441543859])
rmse2 = np.mean([0.08122511647642298, 0.07066588725378073, 0.09152529077527113, 0.09479462577626808, 0.10948492748381565])
rmse1 = np.mean([0.10017713998416972, 0.06925895691288196, 0.11128379431129118, 0.11986357675288985, 0.14515154363225244])

print(rmse1)
print(rmse2)
print(rmse3)
print(rmse4)





0.10914700231869703
0.08953916955311172
0.08390605866117146
0.08859924150214504
