In [61]:
def standarize(input_str):
    r"""
    Convert the path of a ranked item into the paper ID format used by the ground truth.
    e.g:
    change "/AAMAS/AAMAS2005/p1067-sukthankar.pdf\n" to "AAMAS05-p1067-sukthankar"
    The last path component is the paper file name and the one before it is the
    conference directory; a four-digit year (19xx/20xx) in the conference directory
    is shortened to its last two digits.
    :param input_str: the conference info and paperID of the items in top-10 rank list
    :return: paper ID that match the ground truth
    """
    # Imported locally so the function does not depend on a module-level import of re.
    import re

    # Strip surrounding whitespace first so a trailing "\n" (or its absence) does not matter.
    result = input_str.strip().split('/')
    paperInfo = result[-1]
    if paperInfo.endswith('.pdf'):
        paperInfo = paperInfo[:-len('.pdf')]
    # Drop only the century prefix of a standalone four-digit year, once.
    # Chained str.replace calls turned "2019" into "20" and "2020" into "".
    conInfo = re.sub(r'(?<!\d)(?:19|20)(\d{2})(?!\d)', r'\1', result[-2], count=1)
    item_name = conInfo + '-' + paperInfo
    return item_name

def rename_rank_item(data,num_user):
    """
    Load a ranking csv and rewrite every cell into the standard item name, so that
    the ranked items can be compared against the ground truth.
    :param data: a csv file that contain columns : ranking,R1,R2,R3...R50
    :param num_user: number of researcher columns (R1..R<num_user>) to convert, in this case num_user=50
    :return ranking_df: the ranking dataframe with standard item name in each cell
    """
    raw_ranking = pd.read_csv(data)
    # one column per researcher, named R1 .. R<num_user>
    researcher_ids = ['R' + str(idx) for idx in range(1, num_user + 1)]
    renamed = {
        rid: [standarize(raw_name) for raw_name in raw_ranking[rid]]
        for rid in researcher_ids
    }
    return pd.DataFrame(renamed)

def get_rank_matrix(ranking_df, ground_truth,num_user,k):
    """
    Mark, for every user, which of the top k recommended items hit the ground truth.
    Column r of ranking_df holds the recommendations of user r+1 and row r of
    ground_truth holds that user's relevant items.
    :param ranking_df: dataframe
    :param ground_truth: dataframe
    :param num_user: number of users
    :param k: recommend top k items for each user
    :return rank_matrix: dict mapping 'R1'..'R<num_user>' to a list of k flags,
        each flag is either 1(relevant item) or 0(irrelevant item)
    """
    hits_by_user = {}
    for user_pos in range(num_user):
        recommended = ranking_df.iloc[:, user_pos].tolist()
        relevant = ground_truth.iloc[user_pos].tolist()
        # indexing by rank (rather than slicing) keeps the IndexError when fewer than k items exist
        hits_by_user['R' + str(user_pos + 1)] = [
            1 if recommended[rank] in relevant else 0 for rank in range(k)
        ]
    return hits_by_user

def get_dcg(rank_list):
    """
    Discounted cumulative gain: every non-zero gain is divided by log2(position + 1),
    positions counted from 1, and the quotients are summed.
    :param rank_list: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return dcg: the dcg value of the input rank list
    """
    # zero gains contribute nothing, so they are simply left out of the sum
    return sum(
        gain / np.log2(rank + 1)
        for rank, gain in enumerate(rank_list, start=1)
        if gain != 0
    )

def get_idcg(rank_list):
    """
    Ideal dcg: the dcg of the same gains rearranged in descending order.
    :param rank_list: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return idcg: the ideal dcg value of the input rank list
    """
    best_first = list(rank_list)
    best_first.sort(reverse=True)
    return get_dcg(best_first)

def get_ndcg(rank_list):
    """
    Normalized dcg: dcg of the list divided by its ideal dcg.
    :param rank_list: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return ndcg: the normalized dcg value of the input rank list, 0 when the dcg is 0
    """
    dcg = get_dcg(rank_list)
    # a zero dcg means a zero ideal dcg as well, so skip the division
    if dcg == 0:
        return 0
    return dcg / get_idcg(rank_list)

def get_avg_ndcg(rank_matrix):
    """
    :param rank_matrix: mapping (anything with .items()) from user ID to that user's rank list,
        such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return (avg_ndcg,ndcg_ls): the average ndcg value of the input rank matrix for all users and ndcg list for all users
    """
    per_user_ndcg = [get_ndcg(hits) for _, hits in rank_matrix.items()]
    return np.mean(per_user_ndcg), per_user_ndcg

def get_precision(rank_list):
    """
    Fraction of the items in the rank list that are relevant.
    :param rank_list: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return precision: the precision of the input rank list; 0.0 for an empty list
    """
    total = len(rank_list)
    # An empty list has no hits; return 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0
    tp = sum(1 for item in rank_list if item == 1)  # true positives
    return tp / total

def avg_precision(rank_matrix):
    """
    :param rank_matrix: mapping (anything with .items()) from user ID to that user's top-k rank list
    :return: average precision and precision list for all users
    """
    per_user_precision = [get_precision(hits) for _, hits in rank_matrix.items()]
    return np.mean(per_user_precision), per_user_precision

def reciprocal_rank(rank_list):
    """
    Reciprocal of the 1-based position of the first relevant item.
    :param rank_list: list, prediction [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
    :return rr: reciprocal rank score of the input rank list, 0.0 when no item is relevant
    """
    for position, flag in enumerate(rank_list, start=1):
        if flag == 1:
            return 1.0 / position
    return 0.0

def mean_reciprocal_rank(rank_matrix):
    """
    :param rank_matrix: mapping (anything with .items()) from user ID to that user's top-k rank list
    :return (mrr,rr_ls): MRR scores and RR scores list for all users
    """
    per_user_rr = [reciprocal_rank(hits) for _, hits in rank_matrix.items()]
    return np.mean(per_user_rr), per_user_rr

def get_ndcg_p_mrr(rank_matrix):
    """
    get final values for each metrics: NDCG, precision and MRR, averaged over all users
    :param rank_matrix: mapping from user ID to that user's rank list of 1/0 flags
        (1 denotes relevant item and 0 denotes irrelevant item)
    :return (average_ndcg, average_precision, mrr): the three metrics averaged over all users
    """
    # each helper returns (mean, per-user list); only the means are reported here
    average_ndcg, _ = get_avg_ndcg(rank_matrix)
    average_precision, _ = avg_precision(rank_matrix)
    mrr, _ = mean_reciprocal_rank(rank_matrix)
    return (average_ndcg, average_precision, mrr)

In [45]:
# get a random ranking dataframe for all researchers
def get_random_rank_result(data, num_user=50, k=10):
    """
    Build a random ranking dataframe: for every researcher, sample k rows of data
    and keep the value in the first column of each sampled row.
    :param data: dataframe of candidate items; the first column is used as the item ID
    :param num_user: number of researchers (columns R1..R<num_user>), 50 by default
    :param k: number of items sampled for each researcher, 10 by default
    :return ranking_df: dataframe with k rows and one column per researcher
    """
    columns = {}
    for i in range(1, num_user + 1):
        # .iloc selects the first column by position; row[0] on a label-indexed
        # row relied on the deprecated positional fallback of Series.__getitem__
        columns['R' + str(i)] = data.sample(n=k).iloc[:, 0].tolist()
    return pd.DataFrame(columns)

def get_mean_random_rank_result(n):
    """
    Draw n independent random ranking dataframes from the module-level `data`.
    The original loop used range(data, n), which raises TypeError because `data`
    is a dataframe, and it discarded every generated ranking. Averaging metrics
    over random rankings is done by iter_sampling.
    :param n: number of random ranking dataframes to generate
    :return: list of n ranking dataframes
    """
    return [get_random_rank_result(data) for _ in range(n)]

In [64]:
import numpy as np
import pandas as pd

# Candidate items for the random baseline: get_random_rank_result samples rows
# of this frame and uses the first column as the item ID.
data = pd.read_csv('/Users/sherry/Downloads/candidate_papers.csv',sep=',')
# Ground truth: get_rank_matrix reads row r of this frame as the relevant items
# of researcher R<r+1>.
ground_truth = pd.read_csv('/Users/sherry/git_project/scholarly_paper_recommendatation/user_profiles/ground_truth.csv',sep=',')

In [66]:
def iter_sampling(n,data,ground_truth):
    """
    Repeat the random-ranking baseline n times and average the metrics.
    :param n: number of random trials
    :param data: dataframe of candidate items to sample rankings from
    :param ground_truth: dataframe with the relevant items of each researcher
    :return: Series with the mean of (ndcg, precision, mrr) over the n trials
    """
    def _one_trial():
        # 50 researchers, top-10 list for each
        sampled = get_random_rank_result(data)
        hits = get_rank_matrix(sampled, ground_truth,50,10)
        return list(get_ndcg_p_mrr(hits))

    trials = [_one_trial() for _ in range(n)]
    return pd.DataFrame(trials).mean()

In [70]:
# Run 10000 random-baseline trials; the result is the mean of the three metrics
# returned by get_ndcg_p_mrr (ndcg, precision, mrr), in that order.
iter_sampling(10000,data,ground_truth)

0    0.0
1    0.0
2    0.0
dtype: float64