In [1]:
import os
cwd = os.getcwd()
import gc

import numpy as np
import pandas as pd
from dask import dataframe as dd

In [2]:
def rmse(y,h):
    """RMSE
    Args:
        y: real y
        h: predicted y
    Returns:
        RMSE
    """
    a = y-h

    return np.sqrt(sum(a**2)/len(a))

def dcg_k(r, k):
    """ Discounted Cumulative Gain (DGC)  
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG
    """
  
    r = np.asfarray(r)[:k]
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2))) 

def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG)
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG
    """
    dcg_max = dcg_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_k(r, k) / dcg_max

def mean_ndcg(rs):
    """Mean NDCG for all users
    Args:
        rs: Iterator / For each user: True Ratings in Predicted Rank Order
    Returns:
        Mean NDCG
    """
    return np.mean([ndcg_k(r, len(r)) for r in rs])

In [3]:

## MAP

def precision_k(r, k):
    """Score is precision @ k
    Args:
        r: Binary Y/N in Predicted Rank Order (1st element is top recommendation)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)


def average_precision(r):
    """Average Precision
    Args:
        r: Binary Y/N in Predicted Rank Order (1st element is top recommendation)
    Returns:
        Average Precision
    """
    r = np.asarray(r) != 0
    out = [precision_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)


def mean_average_precision(rs):
    """Mean Average Precision (MAP)
    Args:
        rs: Iterator / For each user: Binary Y/N in Predicted Rank Order
    Returns:
        MAP
    """
    return np.mean([average_precision(r) for r in rs])

In [4]:
## DEFINING THE TAIL
ratings = pd.read_csv(cwd+'/../../../data/output csv files/finalratings.csv')
tailcomp = ratings.groupby(by= 'newbookid', as_index=False).agg({'rating':pd.Series.count}).sort_values(by = 'rating', ascending = False)
tot = sum(tailcomp['rating'])
tailcomp['popshare']= [x/tot for x in tailcomp['rating']]
tailcomp['popshare']= tailcomp['popshare'].cumsum()
tailcomp['category']= ['Head' if x<0.95 else "Tail" for x in tailcomp['popshare']]

#finalbooks.loc[finalbooks.popshare >= 0.8].describe()
tail = tailcomp.loc[tailcomp.popshare >= 0.95]
tail

Unnamed: 0,newbookid,rating,popshare,category
5733,5734,62,0.950026,Tail
6589,6590,62,0.950056,Tail
7094,7095,62,0.950085,Tail
6334,6335,62,0.950114,Tail
6280,6281,62,0.950144,Tail
...,...,...,...,...
7619,7620,31,0.999941,Tail
5461,5462,31,0.999956,Tail
6094,6095,31,0.999971,Tail
7757,7758,31,0.999985,Tail


In [5]:
test = pd.read_csv(cwd+'/../../../data/output csv files/test.csv')
useronly = test['newuser_id'].unique()
popular = pd.read_csv(cwd+'/popular_books.csv')

In [6]:
# merge test dataframe and popular dataframe
poprank = test.merge(popular,on = 'newbookid')

# ratingYN column assigns 1 if rating>=4 and 0 if rating<4
poprank['ratingYN']= [1 if x>= 4 else 0 for x in poprank['rating']]

poprank = poprank.sort_values(by=['newuser_id', 'ratings_count'], ascending=False)
poprank

Unnamed: 0,rating,newbookid,newuser_id,ratings_count,average_rating,predicted_rating,ratingYN
139520,4,3,15000,3866839,3.57,4.0,1
143201,5,23,15000,1779331,4.37,4.0,1
149103,5,42,15000,1257121,4.04,4.0,1
77754,4,70,15000,813439,4.30,4.0,1
19823,4,89,15000,628637,4.25,4.0,1
...,...,...,...,...,...,...,...
145357,3,660,1,129811,3.55,4.0,0
153191,3,903,1,95620,3.62,4.0,0
334789,4,1256,1,82458,3.94,4.0,1
55873,3,1294,1,65397,3.69,4.0,0


In [7]:
poplist = []

for i in range(len(useronly)):
  a = poprank.loc[poprank.newuser_id == i+1]['rating'].tolist()
  poplist.append(a)
  if (i+1)%1000 == 0: print("done: ", i+1)

done:  1000
done:  2000
done:  3000
done:  4000
done:  5000
done:  6000
done:  7000
done:  8000
done:  9000
done:  10000
done:  11000
done:  12000
done:  13000
done:  14000
done:  15000


In [8]:
top10 = pd.read_csv('top_10_predictions_for_each_user_by_popularity.csv')
print("Length of file: %d"%len(top10))
top10

Length of file: 150000


Unnamed: 0,newuser_id,newbookid,ratings_count,average_rating
0,1,2,4602479,4.44
1,1,3,3866839,3.57
2,1,5,2683664,3.89
3,1,8,2044241,3.79
4,1,9,2001311,3.85
...,...,...,...,...
149995,15000,6,2346404,4.26
149996,15000,7,2071616,4.25
149997,15000,8,2044241,3.79
149998,15000,10,2035490,4.24


In [9]:
b = np.array([ndcg_k(r, len(r)) for r in poplist])
d = b[b == 1]

In [10]:
print('Popularity Model MAP: ', mean_average_precision(poplist))
print('(1) Popularity Model RMSE: ', np.round(rmse(poprank['rating'],poprank['average_rating']), decimals=3))
print('(2) Popularity Model NDCG: ', np.round(mean_ndcg(poplist), decimals=3))
print("(2) Median NDCG: ", np.round(np.median(b), decimals=3))
print("(2) Share of NDCG =1 among Users: ", np.round(sum(d)/len(useronly), decimals=3))

true_count=0
total_count=0
for t10 in top10.newbookid:
    total_count = total_count+1
    if t10 in tail.newbookid:
        true_count = true_count+1

print('(3) Popularity Model Div10 Score: ',np.round(true_count/total_count, decimals=3))
#print('(3) Popularity Model Div10 Score: ',np.round(sum(np.in1d(top10.newbookid, tail.newbookid))/len(top), decimals=2))
#print('(3) Popularity Model Div50 Score: ',np.round(sum(np.in1d(top.newbookid, tail.newbookid))/len(top), decimals=2))

Popularity Model MAP:  1.0
(1) Popularity Model RMSE:  0.968
(2) Popularity Model NDCG:  0.862
(2) Median NDCG:  0.866
(2) Share of NDCG =1 among Users:  0.011
(3) Popularity Model Div10 Score:  0.0


In [11]:
poplist=""
test=""
poprank=""
top10=""
tail=""
d=""
b=""

In [12]:
train = pd.read_csv(cwd+'/../../../data/output csv files/train.csv')

poprank_train = train.merge(popular,on = 'newbookid')
poprank_train['ratingYN']= [1 if x>= 4 else 0 for x in poprank_train['rating']]

poprank_train = poprank_train.sort_values(by=['newuser_id', 'ratings_count'], ascending=False)
poprank_train

Unnamed: 0,rating,newbookid,newuser_id,ratings_count,average_rating,predicted_rating,ratingYN
24924,4,1,15000,4780653,4.34,4.0,1
169240,5,2,15000,4602479,4.44,4.0,1
50409,5,4,15000,3198671,4.25,4.0,1
40344,5,7,15000,2071616,4.25,4.0,1
45394,5,10,15000,2035490,4.24,4.0,1
...,...,...,...,...,...,...,...
151342,4,2171,1,47389,3.87,4.0,1
1424395,5,3396,1,28386,4.07,4.0,1
1106900,2,3218,1,27719,3.46,3.0,0
804026,4,3186,1,27358,4.09,4.0,1


In [13]:
poplist_train = []

for i in range(len(useronly)):
  a = poprank_train.loc[poprank_train.newuser_id == i+1]['rating'].tolist()
  poplist_train.append(a)
  if (i+1)%1000 == 0: print("done: ", i+1)

done:  1000
done:  2000
done:  3000
done:  4000
done:  5000
done:  6000
done:  7000
done:  8000
done:  9000
done:  10000
done:  11000
done:  12000
done:  13000
done:  14000
done:  15000


In [14]:
print('(1) Popularity Model Train RMSE: ', np.round(rmse(poprank_train['rating'],poprank_train['average_rating']), decimals=3))
print('(2) Popularity Model Train NDCG: ', np.round(mean_ndcg(poplist_train), decimals=3))

(1) Popularity Model Train RMSE:  0.969
(2) Popularity Model Train NDCG:  0.881


In [15]:
poplist_train=""
train=""
poprank_train=""
popular=""