In [1]:
import pickle as pkl
import json
import numpy as np
from collections import Counter
from sklearn.metrics import adjusted_rand_score, v_measure_score, homogeneity_score, normalized_mutual_info_score, silhouette_score
import string
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')
import warnings
warnings.filterwarnings('ignore')

##### Streaming: don't distinguish 'author name' and 'authorID' and mix all of papers together.

In [2]:
def getClusteringMetrics(true_label, predicted_label):
    ri, nmi, v, h = (adjusted_rand_score(true_label, predicted_label),
                        normalized_mutual_info_score(true_label, predicted_label), 
                             v_measure_score(true_label, predicted_label),
                                 homogeneity_score(true_label, predicted_label))
    return ri, nmi, v, h

In [3]:
def reconstruct_corpus(items):
    user_trans_texts = []
    for user_item in items:
        doc = user_item[4]    # dream market
        user_trans_texts.append(text_preprocess(doc))
    return user_trans_texts

In [4]:
def get_dict_bow(user_trans_texts):
    dct = Dictionary(user_trans_texts)
    corpus = []
    for text in user_trans_texts:
        corpus.append(dct.doc2bow(text))
    return dct, corpus

In [5]:
def text_preprocess(text):
    otherChars = '!"#$%&\'*,-./:;<=>?@[\\]`{|}~()'
    printable = set(string.printable)
    text = text.translate(str.maketrans('', '', otherChars))
    token_list = [token.text.strip() for token in nlp(text.lower()) 
                  if token.text!='' and token.text!=' ' and len(token.text.strip())!=0]
    return token_list

### 1. Online data -> author-name oriented; Evalution data -> author-name oriented.

##### 1.1 DMHP

In [6]:
with open('./dblp/smc_input/groundTruthClusteringLabelByAuthors.pkl', 'rb') as fp:
    groundTruthClusteringLabelByAuthors = pkl.load(fp)

In [None]:
_ground_truth_label_dict = {}
_predicted_cluster_label_dict = {}
coll_stats = []
print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format('Name','ARS','NMI', 'V-score', 'H-score', "StreamLength", "GroundTruth", "Prediction"))
print(" ")
for _author_name, _ground_truth_label in groundTruthClusteringLabelByAuthors.items():
    
    with open('./dblp/results/%s_Aminer_paper_authorID.pkl'%_author_name, 'rb') as r:
        particles = pkl.load(r)
    _ground_truth_label = np.squeeze(_ground_truth_label).tolist()
    _predicted_cluster_label = np.array(particles[0].docs2cluster_ID).tolist()
    _ground_truth_label_dict[_author_name] = _ground_truth_label
    _predicted_cluster_label_dict[_author_name] = _predicted_cluster_label
    ri, nmi, v, h = getClusteringMetrics(_ground_truth_label, _predicted_cluster_label)
    coll_stats.append([ri, nmi, v, h])
    print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format(_author_name, round(ri, 4), round(nmi, 4), round(v, 4), round(h, 4), 
                 len(_ground_truth_label), len(set(_ground_truth_label)), len(set(_predicted_cluster_label))))

stats = np.mean(np.array(coll_stats), axis=0)
print(" ")
print("{:<15} {:<10} {:<10} {:<10} {:<10}".format("Average", round(stats[0], 4), round(stats[1], 4), round(stats[2], 4), round(stats[3], 4)))

##### 1.2 DHP

In [8]:
_ground_truth_label_dict = {}
_predicted_cluster_label_dict = {}
coll_stats = []
print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format('Name','ARS','NMI', 'V-score', 'H-score', "StreamLength", "GroundTruth", "Prediction"))
print(" ")
for _author_name, _ground_truth_label in groundTruthClusteringLabelByAuthors.items():
    with open('./dblp/results/baselines/DHP/resultsGroupedByAuthors/extension/%s_Aminer_paper_authorID.pkl'%_author_name, 'rb') as r:
        particles = pkl.load(r)
    _ground_truth_label = np.squeeze(_ground_truth_label).tolist()
    _predicted_cluster_label = np.array(particles[0].docs2cluster_ID).tolist()
    _ground_truth_label_dict[_author_name] = _ground_truth_label
    _predicted_cluster_label_dict[_author_name] = _predicted_cluster_label
    ri, nmi, v, h = getClusteringMetrics(_ground_truth_label, _predicted_cluster_label)
    coll_stats.append([ri, nmi, v, h])
    print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format(_author_name, round(ri, 4), round(nmi, 4), round(v, 4), round(h, 4), 
                 len(_ground_truth_label), len(set(_ground_truth_label)), len(set(_predicted_cluster_label))))

stats = np.mean(np.array(coll_stats), axis=0)
print(" ")
print("{:<15} {:<10} {:<10} {:<10} {:<10}".format("Average", round(stats[0], 4), round(stats[1], 4), round(stats[2], 4), round(stats[3], 4)))

Name            ARS        NMI        V-score    H-score    StreamLength    GroundTruth     Prediction     
 
lei_shi         0.2746     0.4304     0.4302     0.4424     512             45              46             
jing_yu         0.0544     0.3479     0.3445     0.3024     166             32              27             
c_h_chen        0.0538     0.6483     0.6474     0.6143     183             66              60             
li_he           0.1733     0.3246     0.3243     0.3382     192             23              28             
min_chen        0.0569     0.2513     0.2358     0.3605     589             24              53             
xia_zhang       0.1729     0.443      0.4341     0.3622     278             56              31             
xi_zhang        0.0314     0.3995     0.3903     0.3219     308             67              55             
lei_zhu         0.0449     0.4453     0.4282     0.3359     211             53              37             
rui_zhang       0.2424    

##### 2.3 HDP

In [9]:
from gensim.models import HdpModel
from gensim.corpora import Dictionary
from sklearn.cluster import DBSCAN
import spacy
import string
nlp = spacy.load('en_core_web_sm')

In [10]:
with open('./dblp/smc_input_extension/paperStreamingGroupedByAuthors.pkl', 'rb') as fp:
    paperStreamingGroupedByAuthors = pkl.load(fp)
with open('./dblp/smc_input_extension/groundTruthClusteringLabelByAuthors.pkl', 'rb') as fp:
    groundTruthClusteringLabelByAuthors = pkl.load(fp)
with open("./dblp/smc_input_extension/wordVocab.p", 'rb') as fp:
    user_trans_vocab = pkl.load(fp)
with open("./dblp/smc_input_extension/markVocab.p", 'rb') as fp:
    user_trans_vendor_vocab = pkl.load(fp)
word2id, id2word = user_trans_vocab
vendor2id, id2vendor = user_trans_vendor_vocab

In [11]:
print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format('Name','ARS','NMI', 'V-score', 'H-score', "StreamLength", "GroundTruth", "Prediction"))
print(" ")

coll_stats_loop = []
for _i in np.arange(10):
    
    coll_stats = []
    for _author_name, _paper_streaming in paperStreamingGroupedByAuthors.items():

        pred_cluster = []
        user_trans_texts = reconstruct_corpus(_paper_streaming)
        dct, corpus = get_dict_bow(user_trans_texts)

        # HDP
        hdp = HdpModel(corpus, dct)
        hdp_list =hdp[corpus]
        doc_topics = np.zeros((len(corpus), len(hdp.print_topics(num_topics=len(hdp_list)+3))))
    #     hdp_list =hdp[corpus]
        try:
            for idx, doc in enumerate(hdp_list):
                for tid, prob in doc:
                    doc_topics[idx][tid] = prob
        except: 
            print(idx, ": ", tid, len(hdp_list))

        # DBSCAN FOR CLUSTERING
        clustering = DBSCAN(eps=0.5, min_samples=5).fit(doc_topics)
        pred_cluster = clustering.labels_
        true_label = np.squeeze(groundTruthClusteringLabelByAuthors[_author_name]).tolist()

        # Metrics
        ri, nmi, v, h = (adjusted_rand_score(true_label, pred_cluster), 
        normalized_mutual_info_score(true_label, pred_cluster), v_measure_score(true_label, pred_cluster),
        homogeneity_score(true_label, pred_cluster))
        coll_stats.append([ri, nmi, v, h])
        print("{:<15} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15} {:<15}".format(_author_name, round(ri, 4), round(nmi, 4), round(v, 4), round(h, 4), 
                     len(true_label), len(set(true_label)), len(set(pred_cluster))))

    stats = np.mean(np.array(coll_stats), axis=0)
    coll_stats_loop.append(stats)
    print("***%s"%_i)
    print("{:<15} {:<10} {:<10} {:<10} {:<10}".format("Average", round(stats[0], 4), round(stats[1], 4), round(stats[2], 4), round(stats[3], 4)))

Name            ARS        NMI        V-score    H-score    StreamLength    GroundTruth     Prediction     
 
lei_shi         0.0156     0.1164     0.1156     0.1034     512             45              13             
jing_yu         0.0145     0.2114     0.1831     0.1221     166             32              6              
c_h_chen        -0.0023    0.2433     0.1711     0.1        183             66              6              
li_he           0.0075     0.1421     0.1393     0.1162     192             23              8              
min_chen        0.0231     0.081      0.0796     0.0976     589             24              13             
xia_zhang       0.0084     0.1568     0.1415     0.0988     278             56              6              
xi_zhang        0.0031     0.2819     0.2668     0.2017     308             67              15             
lei_zhu         0.0038     0.1959     0.1304     0.0747     211             53              5              
rui_zhang       -0.0002   

yi_chen         -0.0396    0.2184     0.2143     0.1796     366             59              14             
jie_zhu         0.0142     0.1262     0.1139     0.0795     198             25              7              
zhe_zhang       0.001      0.2207     0.1939     0.1312     235             61              8              
jing_jin        -0.0003    0.1961     0.1641     0.106      174             38              6              
qing_zhang      -0.0089    0.1632     0.1607     0.1367     349             46              13             
1 :  76 52
m_yang          0.018      0.2609     0.1275     0.0681     52              16              2              
2 :  78 70
l_zhao          0.0194     0.2235     0.1104     0.059      70              22              2              
yi_jiang        -0.0121    0.1879     0.1632     0.1091     196             41              6              
31 :  148 145
jing_tian       0.0198     0.2066     0.1495     0.0884     145             37              2       

jing_yu         0.0415     0.1846     0.1694     0.1211     166             32              5              
c_h_chen        -0.0004    0.1974     0.13       0.0742     183             66              5              
li_he           -0.0618    0.084      0.0795     0.06       192             23              4              
min_chen        -0.0003    0.1075     0.1002     0.157      589             24              18             
xia_zhang       0.0097     0.2156     0.2052     0.1571     278             56              10             
xi_zhang        0.0008     0.1815     0.1535     0.1001     308             67              8              
lei_zhu         0.0106     0.2188     0.1622     0.097      211             53              6              
rui_zhang       -0.0015    0.0663     0.0649     0.0538     834             33              13             
kai_chen        0.0454     0.2154     0.2151     0.2037     234             32              12             
jie_yang        0.0116     0

zhe_zhang       -0.0252    0.2442     0.2157     0.1468     235             61              8              
jing_jin        0.0061     0.1895     0.1485     0.0916     174             38              5              
qing_zhang      -0.0327    0.1952     0.1943     0.1777     349             46              17             
1 :  76 52
m_yang          0.0089     0.1992     0.0763     0.0397     52              16              2              
6 :  128 70
l_zhao          0.043      0.3167     0.1929     0.1076     70              22              2              
yi_jiang        -0.0012    0.1707     0.14       0.0891     196             41              6              
77 :  148 145
jing_tian       0.0125     0.2135     0.1695     0.1054     145             37              2              
2 :  131 106
m_li            0.0025     0.0999     0.033      0.017      106             44              2              
yan_gao         -0.0099    0.1527     0.1475     0.117      262             35        