In [17]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import os
import shutil
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity

# setting working directory
os.chdir('/Users/sherry/PhD-research/paper-recommendation/feature_vector_datasets/Kazunari-Sugiyam/20131106-SchPaperRecData/ScholarlyPaperRecData')

# functions to compute cosine similarity score
def cosine(fv1_file:str, fv2_file:str):
    """
    fv1_file: featrue vector file path, e.g fv1_path = 'rlv/R49/CHI07-p1253-lampe_fv.txt'
    """
    fv1 = pd.read_csv(fv1_file, names=['token', 'tf/tfidf'], header=None, delimiter=' ')
    fv2 = pd.read_csv(fv2_file, names=['token', 'tf/tfidf'], header=None, delimiter=' ')

    #avoidind empty input file, there are some feature vector files that are empty.
    if len(fv1) == 0 or len(fv2) == 0:
        cos = None
        return cos

    # Merge the two DataFrames based on the 'token' column
    merged_df = pd.merge(fv1, fv2, on='token', how='outer')

    # replace missing value NaN with 0
    merged_df = merged_df.fillna(0)

    # Get the 'tf/tfidf' value columns as arrays
    values1 = merged_df['tf/tfidf_x'].values.reshape(1, -1)
    values2 = merged_df['tf/tfidf_y'].values.reshape(1, -1)

    # Compute the cosine similarity
    cosine_sim = cosine_similarity(values1, values2)

    return cosine_sim[0,0]

def cos_weight_fv(fv1_file:str, fv2_file:str):
    """
    fv1_file: featrue vector file path of a user's publication, 
            e.g fv1_path = 'rlv/R49/CHI07-p1253-lampe_fv.txt'
    fv2_file: featrue vector file path of a citation paper or a reference paper 
            of the publication
    return fv2: featrue vector dataframe with two columns ['token', 'tf/tfidf']
    
    """
    cos_weight = cosine(fv1_file, fv2_file)
    fv2 = pd.read_csv(fv2_file, names=['token', 'tf/tfidf'], header=None, delimiter=' ')
    fv2['tf/tfidf'] = cos_weight * fv2['tf/tfidf']
    return fv2


def cos_fv_input(fv1:pd.DataFrame, fv2:pd.DataFrame):
    """
    fv1: featrue vector dataframe with two columns ['token', 'tf/tfidf']
    """
    # avoidind empty input file, there are some feature vector files that are empty.
    # if len(fv1) == 0 or len(fv2) == 0:
    #     cos = None
    #     return cos
    
    # Merge the two DataFrames based on the 'token' column
    merged_df = pd.merge(fv1, fv2, on='token', how='outer')

    # replace missing value NaN with 0
    merged_df = merged_df.fillna(0)

    # Get the 'tf/tfidf' value columns as arrays
    values1 = merged_df['tf/tfidf_x'].values.reshape(1, -1)
    values2 = merged_df['tf/tfidf_y'].values.reshape(1, -1)

    # Compute the cosine similarity, cosine_sim:np.array
    cosine_sim = cosine_similarity(values1, values2)

    return cosine_sim[0,0]

In [18]:
# ************ using all publications to construct user profile
# for researcher R who have n publications
# *** user_fv = FVp1 + FVp2 +...FVpn

user_statistics_df = pd.read_csv('user_profiles_statistics.csv')
num_pubs_ls = user_statistics_df.iloc[:,1].tolist()

user_stats = pd.read_csv('std.csv', header=0)
user_std = user_stats['std']

################################################## weighting function #####################
# weighted parameter
def fstd_weight(n, std, a=0.1, b=1):
    weights_ls = []
    for i in range(1,n+1):
        # (1-i/n) ranges from (n-i)/n to 1/n
        # emath.logn(n, x),Take log base n of x.
        # x = std + a # 0<x<1, base = (i+b) # base > 1
        weight = -1 * np.emath.logn(i+b, std*std)
        weights_ls.append(weight)
    norm_weights_ls = []
    for weight in weights_ls:
        norm_weight = weight/sum(weights_ls)
        norm_weights_ls.append(norm_weight)
    # print(sum(norm_weights_ls))
    return norm_weights_ls
################################################## weighting function #####################

os.chdir('/Users/sherry/PhD-research/paper-recommendation/feature_vector_datasets/Kazunari-Sugiyam/20131106-SchPaperRecData/ScholarlyPaperRecData/')

############################# change results path #############################
cos_results_path = "result/fstd_weights"
############################# change results path #############################

for r in range(1,51,1):
    print(f'Starting R{r}')
    user_fv = pd.DataFrame(columns=['token','tf/tfidf'])
    fvs = []
    # get number of publications for the researcher
    n = num_pubs_ls[r-1]
    std = user_std[r-1]
    print(f"n: {n}, std: {std}")
    a = 0.1
    b = 10
    norm_weights_ls = fstd_weight(n, std, a, b)
    for i in range(1,n,1):
        fv_path = 'Researchers/R{}/FeatureVectors/R{}-{}/R{}-{}_fv.txt'.format(r,r,i,r,i)
        fv = pd.read_csv(fv_path, names=['token', 'tf/tfidf'], header=None, delimiter=' ')
        # weighted feature vector 
        fv['tf/tfidf'] = fv['tf/tfidf'] * norm_weights_ls[i-1]
        fvs.append(fv)
    # concatenate all publications fv:pd.DataFrame to construct researcher's profile 
    pub_fv = pd.DataFrame(columns=['token','tf/tfidf'])
    for fv in fvs:
        pub_fv = pd.concat([pub_fv,fv])
    user_fv = pub_fv.groupby('token').sum().reset_index()

    # iterate all files in a directory and its subdirectories to get the paperID and paperFV
    # and then compute the cosine score between (user_fv, fv_candidate)
    root = 'RecCandidatePapersFV'
    paperID_ls = []
    cos_ls = []
    result_df = pd.DataFrame(columns=['paperID', 'cosine_score'])
    for path, subdir, files in os.walk(root):
        for name in files:
            file_path = os.path.join(path, name)
            # display(file_path)
            if file_path.endswith('.txt'):
                fv_candidate = pd.read_csv(file_path, names=['token', 'tf/tfidf'], header=None, delimiter=' ')
                cosine_score = cos_fv_input(user_fv, fv_candidate)
                paperID = name.replace('_fv.txt' , '')
                paperID_ls.append(paperID)
                cos_ls.append(cosine_score)
            else: # ignore other system files such as .DSstore in MacOS
                pass
    result_df['paperID'] = paperID_ls
    result_df['cosine_score'] = cos_ls
    sorted_df = result_df.sort_values('cosine_score', ascending=False)
    sorted_df.to_csv(f'{cos_results_path}/R{r}_cosine.csv')  
    print('finished R{}'.format(r)) 


Starting R1
n: 4, std: 0.291
finished R1
Starting R2
n: 12, std: 0.146
finished R2
Starting R3
n: 7, std: 0.135
finished R3
Starting R4
n: 5, std: 0.108
finished R4
Starting R5
n: 2, std: 0.0


  return nx.log(x)/nx.log(n)
  norm_weight = weight/sum(weights_ls)


finished R5
Starting R6
n: 7, std: 0.114
finished R6
Starting R7
n: 16, std: 0.088
finished R7
Starting R8
n: 7, std: 0.226
finished R8
Starting R9
n: 13, std: 0.115
finished R9
Starting R10
n: 5, std: 0.073
finished R10
Starting R11
n: 12, std: 0.075
finished R11
Starting R12
n: 14, std: 0.108
finished R12
Starting R13
n: 9, std: 0.116
finished R13
Starting R14
n: 14, std: 0.271
finished R14
Starting R15
n: 5, std: 0.07
finished R15
Starting R16
n: 18, std: 0.104
finished R16
Starting R17
n: 12, std: 0.125
finished R17
Starting R18
n: 4, std: 0.236
finished R18
Starting R19
n: 4, std: 0.085
finished R19
Starting R20
n: 12, std: 0.096
finished R20
Starting R21
n: 8, std: 0.076
finished R21
Starting R22
n: 22, std: 0.096
finished R22
Starting R23
n: 24, std: 0.113
finished R23
Starting R24
n: 18, std: 0.122
finished R24
Starting R25
n: 19, std: 0.095
finished R25
Starting R26
n: 3, std: 0.049
finished R26
Starting R27
n: 24, std: 0.144
finished R27
Starting R28
n: 17, std: 0.088
finishe

In [19]:
import pandas as pd
import numpy as np
import os

'''for each researcher:
1. read all interested papers from txt file into a list as ground truth
2. read recommending result from csv file into a dataframe
3. check each recommending paper to see if it hits the ground truth
'''

class Metrics():
    """ 
        :param rank_ls: list, prediction [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
        :k: int
    """    
    def __init__(self, rank_ls:list, k:int) -> None:
        self.rank_ls = rank_ls
        self.k = k
    
    def get_precision(self):
        tp = 0
        fp = 0
        for i in self.rank_ls[0:self.k]:
            if i == 1:
                tp += 1
            else:
                fp += 1
        p = tp/self.k
        return round(p, 3)

    def get_recall(self, total_positive:int):
        tp = 0
        fp = 0
        for i in self.rank_ls[0:self.k]:
            if i == 1:
                tp += 1
            else:
                fp += 1
        recall = tp/self.k
        return round(recall, 3)

    def get_fscore(self, total_positive:int):
        p = self.get_precision()
        r = self.get_recall(total_positive)
        if (p+r) != 0:
            f1 = 2*p*r/(p+r)
        else:
            return 0
        return round(f1, 3)

    def get_reciprocal_rank(self):
        rr = 0.0
        for index,item in enumerate(self.rank_ls[0:self.k]):
            if item == 1:
                rr = 1.0 / (index + 1.0)
                break
        return round(rr, 3)

    def get_average_p(self):
        p_ls = []
        for index,item in enumerate(self.rank_ls[0:self.k]):
            if item == 1:
                p_ls.append(1.0 / (index + 1))
        if len(p_ls) == 0:
            return 0
        else:
            return round(np.mean(p_ls), 3)  
        
    
    def get_dcg(self):
        """
        :param rank_ls: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
        :return dcg: the dcg value of the input rank list
        """
        n = self.k
        dcg = 0
        for i in range(n):
            pos = i + 1
            # here gains is 1 or 0
            gains = self.rank_ls[i]
            discounts = np.log2(pos + 1)
            if gains == 0:
                cg = 0
            else:
                cg = (gains / discounts)
            dcg += cg
        return dcg

    def get_idcg(self):
        """
        :param rank_ls: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
        :return idcg: the ideal dcg value of the input rank list
        """
        ideal_rank_ls = sorted(self.rank_ls, reverse=True)
        n = self.k
        idcg = 0
        for i in range(n):
            pos = i + 1
            # here gains is 1 or 0
            gains = ideal_rank_ls[i]
            discounts = np.log2(pos + 1)
            if gains == 0:
                cg = 0
            else:
                cg = (gains / discounts)
            idcg += cg
        return idcg

    def get_ndcg(self):
        """
        :param rank_list: list, such as  [1,0,1,0,1,1,1], 1 denotes relevant item and 0 denotes irrelevant item
        :return ndcg: the ideal dcg value of the input rank list
        """
        if self.get_dcg() == 0:
            ndcg = 0
        else:
            ndcg = self.get_dcg()/self.get_idcg()
        return ndcg

In [21]:
for r in range(1,51,1):
    gt_df = pd.read_csv('ground_truth/rlv_list/R{}-rlv.txt'.format(r), header=None)
    gt_df.columns = ['papeID']
    gt_set = set(gt_df.papeID)
    
    #### recommendation result in csv with two column ['paperID', ['cosine_score']]
    path = f'{cos_results_path}/R{r}_cosine.csv'
    recommending_df = pd.read_csv(path, index_col=0, nrows=30)
    hit_ls = []
    for id in recommending_df.paperID:
        if id in gt_set:
            hit_ls.append(1)
        else:
            hit_ls.append(0)
    recommending_df['hit'] = hit_ls
    recommending_df.to_csv(f'{cos_results_path}/R{r}_hit.csv')

# calculate the metrics
total_positives = pd.read_csv('user_profiles_statistics.csv')['numbers of interest paper']
#for each researcher
p_ls, r_ls, f1_ls, map_ls, mrr_ls, ndcg_ls= ([],[],[],[],[],[])
r = 1
n = 50
for r in range(1,51,1):
    data = pd.read_csv(f'{cos_results_path}/R{r}_hit.csv')
    rank_ls = data.hit
    evaluation = Metrics(rank_ls=rank_ls, k=10)
    precision = evaluation.get_precision()
    p_ls.append(precision)

    recall = evaluation.get_recall(total_positive=total_positives[r-1])
    r_ls.append(recall)

    f1 = evaluation.get_fscore(total_positive=total_positives[r-1])
    f1_ls.append(f1)

    rr = evaluation.get_reciprocal_rank()
    mrr_ls.append(rr)
    
    ap = evaluation.get_average_p()
    map_ls.append(ap)

    ap = evaluation.get_ndcg()
    ndcg_ls.append(ap)

avg_p = np.mean(p_ls)
avg_r = np.mean(r_ls)
avg_f1 = np.mean(f1_ls)
mrr = np.mean(mrr_ls)
map = np.mean(map_ls)
avg_ndcg = np.mean(ndcg_ls)

print(f'{cos_results_path}\n')
print('NDCG@10: {}'.format(avg_ndcg))
print('MRR: {}'.format(mrr))
print('P@10: {}'.format(avg_p))
# print('Recall: {}'.format(avg_r))
# print('F1: {}'.format(avg_f1))
# print('MAP: {}'.format(map))


result/fstd_weights

NDCG@10: 0.29892374972112845
MRR: 0.33887999999999996
P@10: 0.096
