In [1]:
import os
cwd = os.getcwd()
import gc
import numpy as np
from dask import dataframe as dd
import pandas as pd

In [2]:
def rmse(y,h):
    """RMSE
    Args:
        y: real y
        h: predicted y
    Returns:
        RMSE
    """
    a = y-h

    return np.sqrt(sum(a**2)/len(a))

def dcg_k(r, k):
    """ Discounted Cumulative Gain (DGC)  
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG
    """
  
    r = np.asfarray(r)[:k]
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2))) 

def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG)
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG
    """
    dcg_max = dcg_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_k(r, k) / dcg_max

def mean_ndcg(rs):
    """Mean NDCG for all users
    Args:
        rs: Iterator / For each user: True Ratings in Predicted Rank Order
    Returns:
        Mean NDCG
    """
    return np.mean([ndcg_k(r, len(r)) for r in rs])

In [3]:
## DEFINING THE TAIL
ratings = pd.read_csv(cwd+'/../../../data/output csv files/finalratings.csv')
tailcomp = ratings.groupby(by= 'newbookid', as_index=False).agg({'rating':pd.Series.count}).sort_values(by = 'rating', ascending = False)
ratings=""
tot = sum(tailcomp['rating'])
tailcomp['popshare']= [x/tot for x in tailcomp['rating']]
tailcomp['popshare']= tailcomp['popshare'].cumsum()
tailcomp['category']= ['Head' if x<0.95 else "Tail" for x in tailcomp['popshare']]

tail = tailcomp.loc[tailcomp.popshare >= 0.95]
tailcomp=""
tail

Unnamed: 0,newbookid,rating,popshare,category
5733,5734,62,0.950026,Tail
6589,6590,62,0.950056,Tail
7094,7095,62,0.950085,Tail
6334,6335,62,0.950114,Tail
6280,6281,62,0.950144,Tail
...,...,...,...,...
7619,7620,31,0.999941,Tail
5461,5462,31,0.999956,Tail
6094,6095,31,0.999971,Tail
7757,7758,31,0.999985,Tail


In [4]:
bayes = dd.read_csv('all_predictions_of_naive_bayes.csv')
print("Length of file: %d"%len(bayes))
bayes

Length of file: 120000000


Unnamed: 0_level_0,newbookid,newuser_id,predicted_rating
npartitions=54,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,int64,float64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [5]:
test = dd.read_csv(cwd+'/../../../data/output csv files/test.csv')
useronly = test['newuser_id'].unique()
bayesrank = test.merge(bayes,on = ['newbookid', 'newuser_id'])
bayesrank = bayesrank.map_partitions(lambda x: x.sort_values(by=['newuser_id', 'predicted_rating'], ascending=[1,0]))

In [6]:
br = bayesrank.compute()
bayeslist = []

for i in range(len(useronly)):
    a = br.loc[br.newuser_id == i+1]['rating'].tolist()
    bayeslist.append(a)
    if (i+1)%1000 == 0: print("done: ", i+1)

br=""

done:  1000
done:  2000
done:  3000
done:  4000
done:  5000
done:  6000
done:  7000
done:  8000
done:  9000
done:  10000
done:  11000
done:  12000
done:  13000
done:  14000
done:  15000


In [7]:
b = np.array([ndcg_k(r, len(r)) for r in bayeslist])
d = b[b == 1]

In [8]:
#top50 = bayesfin.map_partitions(lambda x: x.sort_values(by='predicted_rat', ascending=False).groupby('newuser_id').head(50))

print('(1) Bayes Model RMSE: ', np.round(rmse(bayesrank['predicted_rating'],bayesrank['rating']), decimals=3))
print('(2) Bayes Model NDCG: ', np.round(mean_ndcg(bayeslist), decimals=3))
print("(2) Median NDCG: ", np.round(np.median(b), decimals=3))
print("(2) Share of NDCG =1 among Users: ", np.round(sum(d)/len(useronly), decimals=3))

test=""
bayesrank=""
bayeslist=""
d=""
b=""
#print('(3) Bayes Model Div50 Score: ',np.round(sum(np.in1d(top50.newbookid, tail.newbookid))/len(top50), decimals=3))

(1) Bayes Model RMSE:  1.375
(2) Bayes Model NDCG:  0.824
(2) Median NDCG:  0.821
(2) Share of NDCG =1 among Users:  0.011


In [9]:
top10 = dd.read_csv('top_10_preds_for_each_user_by_naive_bayes.csv')
print("Length of file: %d"%len(top10))
top10

Length of file: 150000


Unnamed: 0_level_0,newuser_id,newbookid
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,int64
,...,...


In [10]:
true_count=0
total_count=0
for t10 in top10.newbookid:
    total_count = total_count+1
    if t10 in tail.newbookid:
        true_count = true_count+1

print('(3) Bayes Model Div10 Score: ',np.round(true_count/total_count, decimals=3))
#print('(3) Bayes Model Div10 Score: ',np.round(sum(np.in1d(top10.newbookid, tail.newbookid))/len(top10), decimals=3))
top10=""
tail=""

(3) Bayes Model Div10 Score:  0.288


In [11]:
os.chdir(cwd+'/../../../data/output csv files')
train = dd.read_csv('train.csv')
os.chdir(cwd)

In [12]:
bayesrank_train = train.merge(bayes,on = ['newbookid', 'newuser_id'])
bayesrank_train = bayesrank_train.map_partitions(lambda x: x.sort_values(by=['newuser_id', 'predicted_rating'],ascending=[1,0]))

In [13]:
brt = bayesrank_train.compute()
bayeslist_train = []

for i in range(len(useronly)):
    a = brt.loc[brt.newuser_id == i+1]['rating'].tolist()
    bayeslist_train.append(a)
    if (i+1)%1000 == 0: print("done: ", i+1)

brt=""

done:  1000
done:  2000
done:  3000
done:  4000
done:  5000
done:  6000
done:  7000
done:  8000
done:  9000
done:  10000
done:  11000
done:  12000
done:  13000
done:  14000
done:  15000


In [14]:
print('(1) Bayes Model Train RMSE: ', np.round(rmse(bayesrank_train['predicted_rating'],bayesrank_train['rating']), decimals=3))
print('(2) Bayes Model Train NDCG: ', np.round(mean_ndcg(bayeslist_train), decimals=3))

train=""
bayesrank_train=""
bayes=""
bayeslist_train=""

(1) Bayes Model Train RMSE:  1.415
(2) Bayes Model Train NDCG:  0.779
