In [1]:
import mongoengine
from mongoengine import *
from mongoengine.queryset.visitor import Q
from dbmodels import *
from hashfunction import *
from readref import *

import numpy as np
import pandas as pd
from bson.objectid import ObjectId
import configparser
from multiprocessing import Pool
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the database credentials from the local INI file.
credentials = configparser.ConfigParser()
credentials.read('credentials.ini')

# Open the mongoengine connection with the settings stored in the [lb] section.
# All options are passed as strings except the port, which is parsed as an int.
connect(
    **{option: credentials.get('lb', option)
       for option in ('db', 'username', 'password', 'host')},
    port=credentials.getint('lb', 'port'),
)

MongoClient(host=['128.178.60.49:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

---

# Read and organize ground truth

In [3]:
# Load the first ten sheets (positions 0-9) of each ground-truth workbook.
# Passing a list as sheet_name makes read_excel return a mapping of
# sheet position -> DataFrame, so the "list_df_*" names hold dict-like objects.
file_path = './hash_ground_truth/'
first_ten_sheets = range(10)

list_df_10 = pd.read_excel(
    f'{file_path}hashing groundtruth (1-10).xlsx',
    sheet_name=list(first_ten_sheets),
)
list_df_100 = pd.read_excel(
    f'{file_path}hashing groundtruth (10-100).xlsx',
    sheet_name=list(first_ten_sheets),
)
list_df_500 = pd.read_excel(
    f'{file_path}hashing groundtruth (100-500).xlsx',
    sheet_name=list(first_ten_sheets),
)

In [10]:
def hash_evaluate(f, g):
    """Tell whether generator ``g`` produces a common hash for a reference pair.

    Parameters
    ----------
    f : object exposing ``seed_ref_id`` and ``ref_id`` attributes; the
        notebook passes DataFrame rows through ``df.apply(..., axis=1)``.
    g : hash generator exposing ``generate(citation)``.

    Returns
    -------
    int
        1 when the two hash collections share at least one element,
        0 when they share none, and -1 when the exact type of ``g`` requires
        citation fields that are missing from either citation.
    """
    def load_citation(reference_id):
        # Fetch the Reference document by id and parse it with read_ref.
        return read_ref(Reference.objects(id=reference_id).first())

    seed_citation = load_citation(f.seed_ref_id)
    ref_citation = load_citation(f.ref_id)

    # (exact generator types, citation fields both citations must contain).
    # Matching is on the exact type, so subclasses are not picked up.
    field_rules = (
        ((CitationTitleHashGenerator,),
         ('title',)),
        ((CitationAuthorTitleHashGenerator,),
         ('author', 'title')),
        ((CitationAuthorYearHashGenerator,
          CitationAuthorBlurYearHashGenerator,
          CitationAuthorYearNumHashGenerator),
         ('author', 'year')),
        ((CitationAuthorYearPageHashGenerator,
          CitationAuthorBlurYearPageHashGenerator),
         ('author', 'year', 'page')),
    )

    # Generators without a rule (e.g. the baseline) need no particular field.
    required_fields = next(
        (fields for types, fields in field_rules if type(g) in types),
        (),
    )
    for field in required_fields:
        if field not in seed_citation or field not in ref_citation:
            return -1

    shared_hashes = set(g.generate(seed_citation)) & set(g.generate(ref_citation))
    return 1 if shared_hashes else 0

In [5]:
# Baseline
g0 = CitationBaselineHashGenerator()

# Bigrams of Title
g1 = CitationTitleHashGenerator()

# Author + Year
g2 = CitationAuthorYearHashGenerator()
g3 = CitationAuthorBlurYearHashGenerator()

# Author + Year + Page
g4 = CitationAuthorYearPageHashGenerator()
g5 = CitationAuthorBlurYearPageHashGenerator()

# Author + Year + Num
g6 = CitationAuthorYearNumHashGenerator(n=3)

###############Not in the Paper#################
# Author
# g7 = CitationAuthorHashGenerator()

# Author + Title
g8 = CitationAuthorTitleHashGenerator()


In [6]:
import warnings
# Silences every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides pandas warnings raised by the evaluation cells
# that follow (e.g. SettingWithCopyWarning) - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [7]:
# Result column -> hash generator. The insertion order fixes the column order
# of the pickled frames, which the summary cells slice positionally
# (df.columns[4:]).
hash_generators = {
    'baseline': g0,
    'title': g1,
    'author_year': g2,
    'author_blur_year': g3,
    'author_title': g8,
    'author_year_num': g6,
    'author_year_page': g4,
    'author_blur_year_page': g5,
}

for i, sheet in enumerate(list_df_10.values()):
    # Copy the column subset so the assignments below write to an independent
    # frame instead of a slice of the sheet (no SettingWithCopyWarning).
    df = sheet[['seed_ref_id', 'ref_id', True, 'Hash']].copy()
    # Hash = 1 on rows that carry their own seed_ref_id in the sheet, 0 on rows
    # where that cell is blank. This must be computed before the forward fill.
    df['Hash'] = df['seed_ref_id'].notna().astype('int64')
    # Fill blank cells with the last value above them.
    df = df.ffill()
    # One result column per generator: 1 = shared hash, 0 = none, -1 = not applicable.
    for column, generator in hash_generators.items():
        df[column] = df.apply(hash_evaluate, args=(generator,), axis=1)
    print('-'*10)
    print(i)
    df.to_pickle(f'./pickle/hash_eval/0_{i}.pkl')

----------
0
----------
1
----------
2
----------
3
----------
4
----------
5
----------
6
----------
7
----------
8
----------
9


In [8]:
# Result column -> hash generator. The insertion order fixes the column order
# of the pickled frames, which the summary cells slice positionally
# (df.columns[4:]).
hash_generators = {
    'baseline': g0,
    'title': g1,
    'author_year': g2,
    'author_blur_year': g3,
    'author_title': g8,
    'author_year_num': g6,
    'author_year_page': g4,
    'author_blur_year_page': g5,
}

for i, sheet in enumerate(list_df_100.values()):
    # Copy the column subset so the assignments below write to an independent
    # frame instead of a slice of the sheet (no SettingWithCopyWarning).
    df = sheet[['seed_ref_id', 'ref_id', True, 'Hash']].copy()
    # Hash = 1 on rows that carry their own seed_ref_id in the sheet, 0 on rows
    # where that cell is blank. This must be computed before the forward fill.
    df['Hash'] = df['seed_ref_id'].notna().astype('int64')
    # Fill blank cells with the last value above them.
    df = df.ffill()
    # One result column per generator: 1 = shared hash, 0 = none, -1 = not applicable.
    for column, generator in hash_generators.items():
        df[column] = df.apply(hash_evaluate, args=(generator,), axis=1)
    print('-'*10)
    print(i)
    df.to_pickle(f'./pickle/hash_eval/1_{i}.pkl')

----------
0
----------
1
----------
2
----------
3
----------
4
----------
5
----------
6
----------
7
----------
8
----------
9


In [None]:
# Result column -> hash generator. The insertion order fixes the column order
# of the pickled frames, which the summary cells slice positionally
# (df.columns[4:]).
hash_generators = {
    'baseline': g0,
    'title': g1,
    'author_year': g2,
    'author_blur_year': g3,
    'author_title': g8,
    'author_year_num': g6,
    'author_year_page': g4,
    'author_blur_year_page': g5,
}

for i, sheet in enumerate(list_df_500.values()):
    # Copy the column subset so the assignments below write to an independent
    # frame instead of a slice of the sheet (no SettingWithCopyWarning).
    df = sheet[['seed_ref_id', 'ref_id', True, 'Hash']].copy()
    # Hash = 1 on rows that carry their own seed_ref_id in the sheet, 0 on rows
    # where that cell is blank. This must be computed before the forward fill.
    df['Hash'] = df['seed_ref_id'].notna().astype('int64')
    # Fill blank cells with the last value above them.
    df = df.ffill()
    # One result column per generator: 1 = shared hash, 0 = none, -1 = not applicable.
    for column, generator in hash_generators.items():
        df[column] = df.apply(hash_evaluate, args=(generator,), axis=1)
    print('-'*10)
    print(i)
    df.to_pickle(f'./pickle/hash_eval/2_{i}.pkl')

---

# Calculate precision and recall

In [15]:
def read_hash_eval(k):
    """Load the ten pickled evaluation frames of ground-truth group ``k``.

    Reads ``./pickle/hash_eval/{k}_0.pkl`` through ``./pickle/hash_eval/{k}_9.pkl``
    and returns the resulting objects as a list, in index order.

    Pickle files can execute code on load; only read files this notebook wrote.
    """
    return [pd.read_pickle(f'./pickle/hash_eval/{k}_{i}.pkl') for i in range(10)]

In [16]:
# Reload the evaluation frames of the three ground-truth groups
# (pickle prefixes 0, 1 and 2), ten frames per group.
df0_list, df1_list, df2_list = (read_hash_eval(k) for k in range(3))

In [378]:
def recall_precision(df, col):
    """Compute recall and precision of one hash column (or of the overall result).

    Parameters
    ----------
    df : DataFrame with a ``True`` column (1 marks a true match), a ``'Hash'``
        column (1 marks a pair that was finally returned) and, unless
        ``col == 'all'``, a column ``col`` holding 1 (hash matched),
        0 (no match) or a negative value (hash not applicable).
    col : name of the hash column to score, or ``'all'`` to score ``'Hash'``.

    Returns
    -------
    (recall, precision)
        ``(-1, -1)`` when no considered row is a true match. A ratio whose
        denominator is zero is reported as 0.0.

    Raises
    ------
    AssertionError
        If a considered row is neither positive nor negative, i.e. ``col``
        holds a non-negative value other than 0 or 1 on a returned row.

    All masks are built over the full frame, so the result does not depend on
    the index being unique.
    """
    is_true = df[True] == 1
    returned = df['Hash'] == 1

    if col == 'all':
        true_index = is_true
        false_index = ~is_true
        pos_index = returned
        neg_index = ~returned
    else:
        # Rows for which this hash could be computed at all.
        have_index = df[col] >= 0

        # Positive: the hash matched AND the pair was finally returned.
        # Negative: the hash did not match OR the pair was not returned.
        pos_index = (df[col] == 1) & returned
        neg_index = have_index & ((df[col] == 0) | ~returned)

        assert pos_index.sum() + neg_index.sum() == have_index.sum(), \
            f"column {col!r} holds non-negative values other than 0 and 1"

        # Ground truth is restricted to the rows the hash applies to.
        true_index = have_index & is_true
        false_index = have_index & ~is_true

    if true_index.sum() == 0:
        return -1, -1

    true_pos = (true_index & pos_index).sum()
    false_pos = (false_index & pos_index).sum()
    false_neg = (true_index & neg_index).sum()

    recall = true_pos / (true_pos + false_neg) if true_pos + false_neg else 0.0
    precision = true_pos / (true_pos + false_pos) if true_pos + false_pos else 0.0

    return recall, precision

In [379]:
def number_return(df, col):
    """Count the returned pairs (``Hash == 1``) of ``df``.

    With ``col == 'all'`` every returned row counts; otherwise only returned
    rows whose ``col`` value equals 1 are counted.
    """
    returned = df['Hash'] == 1
    if col != 'all':
        returned = (df[col] == 1) & returned
    return int(returned.sum())

In [380]:
# Build one summary record per (ground-truth group, sheet, hash column).
result = []
for k, df_list in enumerate((df0_list, df1_list, df2_list)):
    for i, df in enumerate(df_list):
        n_rows = len(df)
        # Columns from position 4 onwards are scored individually; 'all' adds
        # the overall score based on the 'Hash' column.
        for col in [*df.columns[4:], 'all']:
            recall, precision = recall_precision(df, col)
            result.append({
                'list': k,
                'item': i,
                'len': n_rows,
                'list-item': f"{k}-{i}-{n_rows}",
                'num': number_return(df, col),
                'hash': col,
                'recall': recall,
                'precision': precision,
            })

In [381]:
# One row per summary record; the columns follow the key order of the dicts.
df_result = pd.DataFrame(result)

In [382]:
# Fixed row order used when displaying the per-hash summary tables.
ind = [
    'baseline',
    'title',
    'author_title',
    'author_year',
    'author_blur_year',
    'author_year_page',
    'author_blur_year_page',
    'author_year_num',
    'all',
]

In [383]:
# Returned pairs per hash (rows) for every sheet of ground-truth group 0
# (columns, labelled "<list>-<item>-<len>").
(
    df_result.loc[df_result['list'] == 0]
    .pivot(index='hash', columns='list-item', values='num')
    .loc[ind]
)

list-item,0-0-2,0-1-2,0-2-3,0-3-4,0-4-4,0-5-4,0-6-6,0-7-6,0-8-7,0-9-9
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
baseline,0,0,0,0,0,0,0,0,0,0
title,1,2,2,2,4,4,5,0,7,8
author_title,1,1,0,2,2,0,4,0,0,0
author_year,1,1,0,0,0,0,4,4,0,0
author_blur_year,2,1,1,2,0,0,5,6,0,0
author_year_page,0,0,0,0,0,0,0,0,0,0
author_blur_year_page,0,0,0,0,0,0,0,0,0,0
author_year_num,0,0,0,0,0,0,1,0,0,0
all,2,2,3,4,4,4,6,6,7,8


In [384]:
# Returned pairs per hash (rows) for every sheet of ground-truth group 1
# (columns, labelled "<list>-<item>-<len>").
(
    df_result.loc[df_result['list'] == 1]
    .pivot(index='hash', columns='list-item', values='num')
    .loc[ind]
)

list-item,1-0-13,1-1-14,1-2-17,1-3-15,1-4-21,1-5-92,1-6-87,1-7-83,1-8-73,1-9-72
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
baseline,0,2,0,0,0,0,1,0,0,0
title,13,4,11,13,21,84,81,42,57,19
author_title,1,3,11,2,14,57,15,24,12,10
author_year,1,4,13,3,4,7,15,6,15,28
author_blur_year,1,13,13,4,4,14,18,44,22,49
author_year_page,0,0,0,0,0,0,0,0,0,0
author_blur_year_page,0,0,0,0,0,1,0,0,0,0
author_year_num,0,3,0,0,3,0,9,0,0,2
all,13,14,14,15,21,91,84,83,72,64


In [385]:
# Returned pairs per hash (rows) for every sheet of ground-truth group 2
# (columns, labelled "<list>-<item>-<len>").
(
    df_result.loc[df_result['list'] == 2]
    .pivot(index='hash', columns='list-item', values='num')
    .loc[ind]
)

list-item,2-0-151,2-1-151,2-2-183,2-3-185,2-4-187,2-5-493,2-6-468,2-7-439,2-8-423,2-9-428
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
baseline,0,0,6,0,0,5,1,1,0,0
title,144,144,37,136,182,411,450,422,314,415
author_title,38,38,13,3,7,410,14,49,294,4
author_year,10,10,55,26,4,436,8,26,331,1
author_blur_year,18,18,157,48,8,469,21,34,378,2
author_year_page,0,0,0,0,0,0,2,0,0,0
author_blur_year_page,0,0,0,0,0,0,2,0,0,0
author_year_num,0,0,9,0,0,181,2,8,0,0
all,151,151,182,185,186,483,464,432,421,415


In [386]:
# Total number of returned pairs per hash over all sheets, in `ind` order.
(
    df_result
    .groupby('hash')[['num']]
    .sum()
    .loc[ind]
)

Unnamed: 0_level_0,num
hash,Unnamed: 1_level_1
baseline,16
title,3035
author_title,1029
author_year,1013
author_blur_year,1352
author_year_page,2
author_blur_year_page,3
author_year_num,218
all,3587


In [387]:
# Mean recall per hash over all sheets, shown in `ind` order.
# recall_precision reports -1 when a sheet has no true match for the hash;
# those sentinels are masked to NaN so the mean skips them.
# groupby(...).mean() is used instead of np.mean on each group frame, whose
# result depends on how pandas reduces a DataFrame (changed in pandas 2.0).
# No sort is applied: .loc[ind] decides the row order.
(
    df_result
    .assign(recall=df_result['recall'].where(df_result['recall'] >= 0))
    .groupby('hash')[['recall']]
    .mean()
    .loc[ind]
)

Unnamed: 0_level_0,recall
hash,Unnamed: 1_level_1
baseline,0.064772
title,0.82435
author_title,0.738881
author_year,0.845194
author_blur_year,0.862579
author_year_page,0.125
author_blur_year_page,0.25
author_year_num,0.207484
all,0.84473


In [388]:
# Mean recall per (ground-truth group, hash), best hash first within each group.
# recall_precision reports -1 when a sheet has no true match for the hash;
# those sentinels are masked to NaN so the mean skips them.
# groupby(...).mean() is used instead of np.mean on each group frame, whose
# result depends on how pandas reduces a DataFrame (changed in pandas 2.0).
(
    df_result
    .assign(recall=df_result['recall'].where(df_result['recall'] >= 0))
    .groupby(['list', 'hash'])[['recall']]
    .mean()
    .reset_index()
    .sort_values(['list', 'recall'], ascending=[True, False])
    .set_index(['list', 'hash'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,recall
list,hash,Unnamed: 2_level_1
0,all,0.916667
0,title,0.916667
0,author_blur_year,0.625
0,author_title,0.625
0,author_year,0.625
0,author_year_num,0.0625
0,author_blur_year_page,0.0
0,author_year_page,0.0
0,baseline,0.0
1,author_blur_year,0.974026


In [389]:
# Mean precision per hash over all sheets, shown in `ind` order.
# recall_precision reports -1 when a sheet has no true match for the hash;
# those sentinels are masked to NaN so the mean skips them.
# groupby(...).mean() is used instead of np.mean on each group frame, whose
# result depends on how pandas reduces a DataFrame (changed in pandas 2.0).
# No sort is applied: .loc[ind] decides the row order.
(
    df_result
    .assign(precision=df_result['precision'].where(df_result['precision'] >= 0))
    .groupby('hash')[['precision']]
    .mean()
    .loc[ind]
)

Unnamed: 0_level_0,precision
hash,Unnamed: 1_level_1
baseline,0.277778
title,0.335933
author_title,0.547386
author_year,0.628679
author_blur_year,0.438166
author_year_page,0.125
author_blur_year_page,0.25
author_year_num,0.316536
all,0.257475


In [390]:
# Mean precision per (ground-truth group, hash), best hash first within each group.
# recall_precision reports -1 when a sheet has no true match for the hash;
# those sentinels are masked to NaN so the mean skips them.
# groupby(...).mean() is used instead of np.mean on each group frame, whose
# result depends on how pandas reduces a DataFrame (changed in pandas 2.0).
(
    df_result
    .assign(precision=df_result['precision'].where(df_result['precision'] >= 0))
    .groupby(['list', 'hash'])[['precision']]
    .mean()
    .reset_index()
    .sort_values(['list', 'precision'], ascending=[True, False])
    .set_index(['list', 'hash'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision
list,hash,Unnamed: 2_level_1
0,title,0.7625
0,author_title,0.75
0,author_year,0.75
0,all,0.604167
0,author_blur_year,0.575
0,author_year_num,0.25
0,author_blur_year_page,0.0
0,author_year_page,0.0
0,baseline,0.0
1,author_year,0.588116


### Analysis of False Negatives

In [398]:
# False negatives of every sheet: pairs labelled as true matches (True == 1)
# that were not returned (Hash == 0).
df_fn_list = [
    df[(df[True] == 1) & (df['Hash'] == 0)]
    for df_list in (df0_list, df1_list, df2_list)
    for df in df_list
]

In [400]:
# Stack all false negatives and attach the raw reference string of both
# references of each pair (one database lookup per row and column).
df_fn = pd.concat(df_fn_list)
for text_col, id_col in (('seed_ref', 'seed_ref_id'), ('ref', 'ref_id')):
    df_fn[text_col] = df_fn[id_col].apply(
        lambda ref_id: Reference.objects(id=ref_id).first().reference_string
    )

# (number of false-negative pairs, number of distinct seed references)
(len(df_fn), len(set(df_fn['seed_ref_id'].values)))

(54, 13)

In [397]:
# False negatives for which the author_year or the author_title hash was
# not applicable (-1).
not_applicable = (df_fn['author_year'] == -1) | (df_fn['author_title'] == -1)
int(not_applicable.sum())

54