In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import json
import random
import time
import pickle as pkl
import datetime

### Aminer dataset link: https://www.aminer.cn/billboard/whoiswhofbclid=IwAR37MGR9Gt5pMpsxvtIu1Y_LeJyUkeNvbD0QCen5ny4nQKbEgNHpzGmicpI

### 1. Dataset exploration: validation dataset II (continuous paper streaming)

In [2]:
with open('./raw_data/cna_valid_unass_competition.json') as f:
    valid_stream = json.load(f)
with open('./raw_data/cna_valid_pub.json') as f:
    valid_pub = json.load(f)
with open('./raw_data/cna_valid_author_ground_truth.json') as f:
    valid_author = json.load(f)
with open('./raw_data/whole_author_profile.json') as f:
    whole_author_profile = json.load(f)

In [3]:
authorID2authorName = {}
authorName2authorID = {}
for _authorID, _authorInfo in whole_author_profile.items():
    authorID2authorName[_authorID] = _authorInfo['name']
    authorName2authorID[_authorInfo['name']] = [_authorID]

In [4]:
def findAuthorID(author_name, paper_id, author_dict):
    """
    Given a general 'author_name' and a paper id,  pinpoint the exact 'author_id' 
    which is the ground truth of the lattent cluters released by our model.
    """
    for _author_id, _paper_id_list in author_dict[author_name].items():
        if paper_id in _paper_id_list:
            return _author_id

##### 1.1 Along the publication time, generally collect a global paper streaming without any grouping action, e.g. 'author_name specific', in which for  each  publication activity,  we attach the corresponding associated information such as title, abstract, publication date etc. 

In [5]:
streaming = []
labels = []
authorName2StreamingIndex = defaultdict(list)

streaming_id = 0
count_abstract = 0
count_name = 0
for _paperId_autOrd in valid_stream:
    _paper_id, _author_order = _paperId_autOrd.split("-")
    try:
        # some paper does not have the 'abstract' attribute.
        _title = valid_pub[_paper_id]["title"]
        _abstract = valid_pub[_paper_id]["abstract"]
        _mark = valid_pub[_paper_id]["venue"]
        _year = valid_pub[_paper_id]["year"]
        _month = random.randrange(1,13)
        _day = random.randrange(1,29)
        _time_str = str(_day) + "/" + str(_month) + "/" + str(_year)
        _time_str = datetime.datetime.strptime(_time_str, "%d/%m/%Y")
        _time = time.mktime(_time_str.timetuple())
        
        _author_name = valid_pub[_paper_id]["authors"][int(_author_order)]["name"].lower()
        if "." in _author_name:
            _author_first_name = _author_name.split(" ")[0].replace(".-", "_").replace(".", "_")
            _author_family_name = _author_name.split(" ")[1]
            _author_name = _author_first_name + _author_family_name
        else:
            _author_name = _author_name.replace(" ", "_")
        stream_flag = True
    except:
        stream_flag = False
        count_abstract += 1
        
    try:
        # Tedious to map some real authors' names to the 'author_name' attribute in dictionary valid_author.
        _author_id = findAuthorID(_author_name, _paper_id, valid_author)
        label_flag = True
    except:
        label_flag = False
        count_name += 1
          
    if  stream_flag and  label_flag:
#         streaming.append(_title + " " + str(_abstract) + "&&&" + 
        streaming.append(_title + "&&&" + 
                         _mark + "&&&" + 
                         str(_time) + "&&&" + 
                         str(_time_str))
        labels.append(_author_id)
        authorName2StreamingIndex[_author_name].append(streaming_id)
        streaming_id += 1

In [7]:
print("Streaming length: ", len(valid_stream))

Streaming length:  9170


In [8]:
print("{:<15} {:<15} {:<15} {:<10}".format('streamOriLength','afterFilter','abstractIssue', 'authorNameIssue'))
print("{:<15} {:<15} {:<15} {:<10}".format(len(valid_stream),len(streaming),count_abstract, count_name))

streamOriLength afterFilter     abstractIssue   authorNameIssue
9170            7935            1063            191       


In [9]:
7935 + 1063 + 191

9189

##### Sort the paper streaming by publication time.

In [6]:
paperStreaming = []
groundTruth = []
for _paper, _ground_truth in zip(streaming, labels):
    paperStreaming.append([_paper.split("&&&")[0], _paper.split("&&&")[1], _paper.split("&&&")[2], _paper.split("&&&")[3]])
    groundTruth.append(_ground_truth)
paperStreaming, groundTruth = np.array(paperStreaming), np.array(groundTruth)
paperStreaming = pd.DataFrame({'Date':paperStreaming[:,3], 'Time':paperStreaming[:,2], 'Mark':paperStreaming[:,1], 'Text':paperStreaming[:,0]})
groundTruth = pd.DataFrame(groundTruth, columns=['Author_ID'])

In [11]:
groundTruth

Unnamed: 0,Author_ID
0,ud9fCkao
1,q1oO72Fq
2,fdeLfdqT
3,CV7UrQnq
4,xfF1tC2u
...,...
7930,mZiipIYD
7931,Lx4uRpGm
7932,gX9qDXoK
7933,75Rx49Hf


In [12]:
paperStreaming

Unnamed: 0,Date,Time,Mark,Text
0,2011-05-26 00:00:00,1306386000.0,Chemical Communications,Enantioselective Pd-catalyzed hydrogenation of...
1,2014-12-12 00:00:00,1418364000.0,Advanced Materials,Multifunctional Fe5C2 nanoparticles: A targete...
2,2013-06-27 00:00:00,1372309200.0,Physical Review C - Nuclear Physics,Transverse-momentum dependence of the J/ψ nucl...
3,2015-06-25 00:00:00,1435208400.0,International Journal of Multimedia and Ubiqui...,Formal description of the virtual machines sys...
4,2014-09-02 00:00:00,1409634000.0,Small,Controllable fabrication and photoelectrochemi...
...,...,...,...,...
7930,2011-12-24 00:00:00,1324706400.0,IEICE Transactions on Information and Systems,Energy-Aware task scheduling for real-time sys...
7931,2012-10-20 00:00:00,1350709200.0,Chinese Journal of Emergency Medicine,The effects of calmodulin kinase II inhibitor ...
7932,2011-12-18 00:00:00,1324188000.0,Zhejiang Daxue Xuebao (Gongxue Ban)/Journal of...,Binocular bundle adjustment based localization...
7933,2012-02-20 00:00:00,1329717600.0,Physica B: Condensed Matter,First-principle calculations of dilute nitride...


In [7]:
paperStreaming['Date'] = pd.to_datetime(paperStreaming.Date)

In [8]:
paperStreaming

Unnamed: 0,Date,Time,Mark,Text
0,2011-02-02,1296626400.0,Chemical Communications,Enantioselective Pd-catalyzed hydrogenation of...
1,2014-02-24,1393221600.0,Advanced Materials,Multifunctional Fe5C2 nanoparticles: A targete...
2,2013-07-16,1373950800.0,Physical Review C - Nuclear Physics,Transverse-momentum dependence of the J/ψ nucl...
3,2015-09-19,1442638800.0,International Journal of Multimedia and Ubiqui...,Formal description of the virtual machines sys...
4,2014-03-02,1393740000.0,Small,Controllable fabrication and photoelectrochemi...
...,...,...,...,...
7930,2011-06-19,1308459600.0,IEICE Transactions on Information and Systems,Energy-Aware task scheduling for real-time sys...
7931,2012-06-16,1339822800.0,Chinese Journal of Emergency Medicine,The effects of calmodulin kinase II inhibitor ...
7932,2011-06-07,1307422800.0,Zhejiang Daxue Xuebao (Gongxue Ban)/Journal of...,Binocular bundle adjustment based localization...
7933,2012-02-11,1328940000.0,Physica B: Condensed Matter,First-principle calculations of dilute nitride...


In [9]:
paperStreamingSortByTime = paperStreaming.sort_values(by = 'Date')

In [10]:
paperStreamingSortByTime

Unnamed: 0,Date,Time,Mark,Text
1818,1995-01-15,790149600.0,Biochemistry,Cryo atomic force microscopy: a new approach f...
1519,1995-02-16,792914400.0,"Micron (Oxford, England : 1993)",Recent advances in biological atomic force mic...
90,1996-06-02,833691600.0,Thin Solid Films,Nanostructure of supported phospholipid monola...
2233,1996-09-17,842936400.0,Advances in Physics,Biological atomic force microscopy: what is ac...
3318,1997-03-03,857368800.0,Jinshu Xuebao/Acta Metallurgica Sinica,Simulation of room-temperature strength of ZA2...
...,...,...,...,...
6306,2019-12-14,1576303200.0,Journal of Materials Chemistry,High Ion Mobility and Capacity of Monolayer Ga...
4441,2019-12-16,1576476000.0,Angewandte Chemie,Amorphous Nanocages of Cu‐Ni‐Fe Hydr(oxy)oxide...
3329,2019-12-18,1576648800.0,IEEE Access,Multi-Spectral Image Change Detection Based on...
3065,2019-12-28,1577512800.0,Chemical Engineering Journal,Nitrogen enriched porous carbons from d-glucos...


In [11]:
groundTruthSortByTime = groundTruth.reindex(paperStreamingSortByTime.index)

In [18]:
groundTruthSortByTime

Unnamed: 0,Author_ID
1519,v8FV7xlb
1818,v8FV7xlb
2233,v8FV7xlb
90,v8FV7xlb
3318,q0C4eefB
...,...
7926,USpdGtDl
6591,PkezW32h
3329,ERs8KYfr
314,ERs8KYfr


### 2. Adapt the raw paper streaming into the Dirichlet  Darknet model  

In [12]:
def toVocabulary(texts):
    cv = CountVectorizer()
    cv_fit=cv.fit_transform(texts)
    return dict(zip(cv.get_feature_names(), np.arange(len(cv.get_feature_names())))), dict(zip(np.arange(len(cv.get_feature_names())), cv.get_feature_names()))

In [13]:
def getMarkVocab(marks):
    mark2id = {}
    id2mark = {}
    for _id, _mark in enumerate(set(marks)):
        mark2id[_mark] = _id
        id2mark[_id] = _mark
    return mark2id, id2mark

In [14]:
word2id, id2word = toVocabulary(np.squeeze(paperStreamingSortByTime[['Text']].to_numpy()))
mark2id, id2mark = getMarkVocab(np.squeeze(paperStreamingSortByTime[['Mark']].to_numpy()))
wordVocab = (word2id, id2word)
markVocab = (mark2id, id2mark)

In [15]:
with open('./smc_input/wordVocab.p', 'wb') as fp:
    pkl.dump(wordVocab, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open('./smc_input/markVocab.p', 'wb') as fp:
    pkl.dump(markVocab, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [16]:
paperStreamingSortByTime['Mark'] = paperStreamingSortByTime['Mark'].apply(lambda x: mark2id[x])

In [17]:
paperStreamingSortByTime.loc[0].to_numpy()

array([Timestamp('2011-02-02 00:00:00'), '1296626400.0', 538,
       'Enantioselective Pd-catalyzed hydrogenation of enesulfonamides'],
      dtype=object)

In [18]:
paperStreamingSortByTime.rename(columns = {'Text':'TextContent'}, inplace = True)

In [19]:
paperStreamingSortByTime

Unnamed: 0,Date,Time,Mark,TextContent
1818,1995-01-15,790149600.0,1917,Cryo atomic force microscopy: a new approach f...
1519,1995-02-16,792914400.0,877,Recent advances in biological atomic force mic...
90,1996-06-02,833691600.0,1920,Nanostructure of supported phospholipid monola...
2233,1996-09-17,842936400.0,610,Biological atomic force microscopy: what is ac...
3318,1997-03-03,857368800.0,1272,Simulation of room-temperature strength of ZA2...
...,...,...,...,...
6306,2019-12-14,1576303200.0,1504,High Ion Mobility and Capacity of Monolayer Ga...
4441,2019-12-16,1576476000.0,1376,Amorphous Nanocages of Cu‐Ni‐Fe Hydr(oxy)oxide...
3329,2019-12-18,1576648800.0,2052,Multi-Spectral Image Change Detection Based on...
3065,2019-12-28,1577512800.0,2081,Nitrogen enriched porous carbons from d-glucos...


In [20]:
paperStreamingSortByTime['TextContent'] = paperStreamingSortByTime['TextContent'].str.lower()

In [21]:
paperStreamingSortByTime

Unnamed: 0,Date,Time,Mark,TextContent
1818,1995-01-15,790149600.0,1917,cryo atomic force microscopy: a new approach f...
1519,1995-02-16,792914400.0,877,recent advances in biological atomic force mic...
90,1996-06-02,833691600.0,1920,nanostructure of supported phospholipid monola...
2233,1996-09-17,842936400.0,610,biological atomic force microscopy: what is ac...
3318,1997-03-03,857368800.0,1272,simulation of room-temperature strength of za2...
...,...,...,...,...
6306,2019-12-14,1576303200.0,1504,high ion mobility and capacity of monolayer ga...
4441,2019-12-16,1576476000.0,1376,amorphous nanocages of cu‐ni‐fe hydr(oxy)oxide...
3329,2019-12-18,1576648800.0,2052,multi-spectral image change detection based on...
3065,2019-12-28,1577512800.0,2081,nitrogen enriched porous carbons from d-glucos...


In [22]:
def toTextFreqDistr(_texts, _word2id):
    cv = CountVectorizer()
    cv_fit=cv.fit_transform(_texts)
    return (tuple([_word2id[_word] for _word in cv.get_feature_names()]), tuple(cv_fit.toarray().sum(axis=0).tolist()))

In [23]:
TextBagofWords = []
TextWordCount = []
for line in paperStreamingSortByTime['TextContent']:
    try:
        TextBagofWords.append(toTextFreqDistr([line], word2id))
        TextWordCount.append(np.sum(toTextFreqDistr([line], word2id)[1]))
    except: 
        print(line)

 
a g-c
l


In [24]:
paperStreamingSortByTime.loc[paperStreamingSortByTime['TextContent'] == "a g-c"]

Unnamed: 0,Date,Time,Mark,TextContent
613,2015-08-02,1438491600.0,538,a g-c


In [25]:
groundTruthSortByTime.loc[[613]]

Unnamed: 0,Author_ID
613,xfF1tC2u


In [26]:
paperStreamingSortByTime.loc[paperStreamingSortByTime['TextContent'] == "l"]

Unnamed: 0,Date,Time,Mark,TextContent
757,2015-09-21,1442811600.0,1310,l


In [27]:
groundTruthSortByTime.loc[[757]]

Unnamed: 0,Author_ID
757,8YFOzgbv


In [28]:
paperStreamingSortByTime.loc[paperStreamingSortByTime['TextContent'] == " "]

Unnamed: 0,Date,Time,Mark,TextContent
3822,2014-07-28,1406523600.0,1268,


In [29]:
groundTruthSortByTime.loc[[3822]]

Unnamed: 0,Author_ID
3822,B37Ah8xY


In [30]:
paperStreamingSortByTime.loc[paperStreamingSortByTime['TextContent'] == "preparation of anhydrate phosphogypsum binders"]

Unnamed: 0,Date,Time,Mark,TextContent
156,2010-07-27,1280206800.0,1705,preparation of anhydrate phosphogypsum binders


In [31]:
paperStreamingSortByTime = paperStreamingSortByTime.drop([613,757, 3822])
groundTruthSortByTime = groundTruthSortByTime.drop([613, 757, 3822])

In [32]:
for _author_name, _paper_streaming in authorName2StreamingIndex.items():
    if 613 in _paper_streaming:
        _paper_streaming.remove(613)
        authorName2StreamingIndex[_author_name] = _paper_streaming
    if 757 in _paper_streaming:
        _paper_streaming.remove(757)
        authorName2StreamingIndex[_author_name] = _paper_streaming
    if 3822 in _paper_streaming:
        _paper_streaming.remove(3822)
        authorName2StreamingIndex[_author_name] = _paper_streaming

In [33]:
TextBagofWords = []
TextWordCount = []
for line in paperStreamingSortByTime['TextContent']:
    try:
        TextBagofWords.append(toTextFreqDistr([line], word2id))
        TextWordCount.append(np.sum(toTextFreqDistr([line], word2id)[1]))
    except: 
        print(line)

In [34]:
paperStreamingSortByTime['TextBagofWords'] = TextBagofWords
paperStreamingSortByTime['TextWordCount'] = TextWordCount

In [35]:
paperStreamingSortByTime

Unnamed: 0,Date,Time,Mark,TextContent,TextBagofWords,TextWordCount
1818,1995-01-15,790149600.0,1917,cryo atomic force microscopy: a new approach f...,"((1241, 1428, 1440, 1869, 3470, 5557, 5561, 64...",12
1519,1995-02-16,792914400.0,877,recent advances in biological atomic force mic...,"((746, 1440, 1869, 5561, 6909, 8578, 11620), (...",7
90,1996-06-02,833691600.0,1920,nanostructure of supported phospholipid monola...,"((1067, 1802, 2189, 8578, 8812, 9205, 9696, 10...",11
2233,1996-09-17,842936400.0,610,biological atomic force microscopy: what is ac...,"((608, 1067, 1440, 1869, 5561, 7347, 8578, 928...",11
3318,1997-03-03,857368800.0,1272,simulation of room-temperature strength of za2...,"((840, 3083, 9696, 12070, 12702, 13283, 13817,...",9
...,...,...,...,...,...,...
6306,2019-12-14,1576303200.0,1504,high ion mobility and capacity of monolayer ga...,"((1067, 1125, 1360, 1671, 2306, 5786, 6424, 73...",13
4441,2019-12-16,1576476000.0,1376,amorphous nanocages of cu‐ni‐fe hydr(oxy)oxide...,"((1002, 2189, 3503, 4574, 5052, 5268, 5557, 64...",17
3329,2019-12-18,1576648800.0,2052,multi-spectral image change detection based on...,"((1067, 1618, 1651, 2602, 3933, 6804, 7408, 89...",14
3065,2019-12-28,1577512800.0,2081,nitrogen enriched porous carbons from d-glucos...,"((2322, 2348, 2902, 4828, 5066, 5656, 5958, 94...",11


In [36]:
paperStreamingSortByTime.insert(0, "StreamingIDSorted", (np.arange(paperStreamingSortByTime.shape[0])).tolist())

In [37]:
paperStreamingSortByTime

Unnamed: 0,StreamingIDSorted,Date,Time,Mark,TextContent,TextBagofWords,TextWordCount
1818,0,1995-01-15,790149600.0,1917,cryo atomic force microscopy: a new approach f...,"((1241, 1428, 1440, 1869, 3470, 5557, 5561, 64...",12
1519,1,1995-02-16,792914400.0,877,recent advances in biological atomic force mic...,"((746, 1440, 1869, 5561, 6909, 8578, 11620), (...",7
90,2,1996-06-02,833691600.0,1920,nanostructure of supported phospholipid monola...,"((1067, 1802, 2189, 8578, 8812, 9205, 9696, 10...",11
2233,3,1996-09-17,842936400.0,610,biological atomic force microscopy: what is ac...,"((608, 1067, 1440, 1869, 5561, 7347, 8578, 928...",11
3318,4,1997-03-03,857368800.0,1272,simulation of room-temperature strength of za2...,"((840, 3083, 9696, 12070, 12702, 13283, 13817,...",9
...,...,...,...,...,...,...,...
6306,7927,2019-12-14,1576303200.0,1504,high ion mobility and capacity of monolayer ga...,"((1067, 1125, 1360, 1671, 2306, 5786, 6424, 73...",13
4441,7928,2019-12-16,1576476000.0,1376,amorphous nanocages of cu‐ni‐fe hydr(oxy)oxide...,"((1002, 2189, 3503, 4574, 5052, 5268, 5557, 64...",17
3329,7929,2019-12-18,1576648800.0,2052,multi-spectral image change detection based on...,"((1067, 1618, 1651, 2602, 3933, 6804, 7408, 89...",14
3065,7930,2019-12-28,1577512800.0,2081,nitrogen enriched porous carbons from d-glucos...,"((2322, 2348, 2902, 4828, 5066, 5656, 5958, 94...",11


In [38]:
paperStreamingSortByTime.to_pickle("./smc_input/paperStreamingSMC.pkl")
groundTruthSortByTime.to_pickle("./smc_input/groundTruthSMC.pkl")

In [39]:
with open('./smc_input/authorName2StreamingIndexSMC.pkl', 'wb') as fp:
    pkl.dump(authorName2StreamingIndex, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [40]:
StreamingIndex2StreamingID = dict(zip(paperStreamingSortByTime.index.to_numpy().tolist(), 
                                      paperStreamingSortByTime.StreamingIDSorted.to_numpy().tolist()))

In [41]:
with open('./smc_input/StreamingIndex2StreamingIDSMC.pkl', 'wb') as fp:
    pkl.dump(StreamingIndex2StreamingID, fp, protocol=pkl.HIGHEST_PROTOCOL)

### Paper streaming grouped by author names.

In [42]:
paperStreamingSortByTime.drop(columns = ['StreamingIDSorted'])

Unnamed: 0,Date,Time,Mark,TextContent,TextBagofWords,TextWordCount
1818,1995-01-15,790149600.0,1917,cryo atomic force microscopy: a new approach f...,"((1241, 1428, 1440, 1869, 3470, 5557, 5561, 64...",12
1519,1995-02-16,792914400.0,877,recent advances in biological atomic force mic...,"((746, 1440, 1869, 5561, 6909, 8578, 11620), (...",7
90,1996-06-02,833691600.0,1920,nanostructure of supported phospholipid monola...,"((1067, 1802, 2189, 8578, 8812, 9205, 9696, 10...",11
2233,1996-09-17,842936400.0,610,biological atomic force microscopy: what is ac...,"((608, 1067, 1440, 1869, 5561, 7347, 8578, 928...",11
3318,1997-03-03,857368800.0,1272,simulation of room-temperature strength of za2...,"((840, 3083, 9696, 12070, 12702, 13283, 13817,...",9
...,...,...,...,...,...,...
6306,2019-12-14,1576303200.0,1504,high ion mobility and capacity of monolayer ga...,"((1067, 1125, 1360, 1671, 2306, 5786, 6424, 73...",13
4441,2019-12-16,1576476000.0,1376,amorphous nanocages of cu‐ni‐fe hydr(oxy)oxide...,"((1002, 2189, 3503, 4574, 5052, 5268, 5557, 64...",17
3329,2019-12-18,1576648800.0,2052,multi-spectral image change detection based on...,"((1067, 1618, 1651, 2602, 3933, 6804, 7408, 89...",14
3065,2019-12-28,1577512800.0,2081,nitrogen enriched porous carbons from d-glucos...,"((2322, 2348, 2902, 4828, 5066, 5656, 5958, 94...",11


In [43]:
paperStreamingGroupedByAuthors = {}
groundTruthClusteringLabelByAuthors ={}
paperStreamingSortByTime2 = paperStreamingSortByTime.drop(columns = ['StreamingIDSorted'])
for _author_name, _streaming_index in authorName2StreamingIndex.items():
    _paper_streaming = [paperStreamingSortByTime2.loc[_paper_index].to_numpy().tolist()
                                              for _paper_index in _streaming_index]
    _paper_streaming = pd.DataFrame(_paper_streaming, columns=['Date', 'Time', 'Mark', 'TextContent', 'TextBagofWords', 'TextWordCount'])
    
    _ground_truth_streaming = [groundTruthSortByTime.loc[_paper_index].to_numpy()[0] for _paper_index in _streaming_index]
    _ground_truth_streaming = pd.DataFrame(_ground_truth_streaming, columns=['Author_ID'])
    
    # sorted by Date
    _paper_streaming = _paper_streaming.sort_values(by='Date')
    _ground_truth_streaming = _ground_truth_streaming.reindex(_paper_streaming.index)
    
    # add one ID column
    _paper_streaming.insert(loc=0, column='StreamingIDSorted', value=(np.arange(_paper_streaming.shape[0])).tolist())

    
    paperStreamingGroupedByAuthors[_author_name] = _paper_streaming.to_numpy().tolist()
    groundTruthClusteringLabelByAuthors[_author_name] = _ground_truth_streaming.to_numpy().tolist()

In [44]:
_paper_streaming

Unnamed: 0,StreamingIDSorted,Date,Time,Mark,TextContent,TextBagofWords,TextWordCount
0,0,2015-02-28,1425103200.0,1727,a novel urinary long non-coding rna transcript...,"((584, 1894, 2279, 2937, 3933, 3995, 5557, 689...",23
1,1,2015-03-27,1427432400.0,395,clinical utility of a novel urine-based gene f...,"((1651, 1894, 2849, 5718, 5838, 6909, 9570, 96...",15


In [45]:
_ground_truth_streaming

Unnamed: 0,Author_ID
0,oQW0lWYM
1,oQW0lWYM


In [46]:
with open('./smc_input/paperStreamingGroupedByAuthors.pkl', 'wb') as fp:
    pkl.dump(paperStreamingGroupedByAuthors, fp, protocol=pkl.HIGHEST_PROTOCOL)
with open('./smc_input/groundTruthClusteringLabelByAuthors.pkl', 'wb') as fp:
    pkl.dump(groundTruthClusteringLabelByAuthors, fp, protocol=pkl.HIGHEST_PROTOCOL)

In [47]:
for key, value in paperStreamingGroupedByAuthors.items():
    print(key, ": ")
    for _paper in value: 
        print(_paper)
    break

lei_shi : 
[0, Timestamp('2007-03-11 00:00:00'), '1173592800.0', 775, 'ultralight conductive carbon-nanotube-polymer composite', ((2341, 3082, 3135, 9213, 10836, 14541), (1, 1, 1, 1, 1, 1)), 6]
[1, Timestamp('2010-06-16 00:00:00'), '1276664400.0', 1723, 'a study on climatic adaptability of dalbergia sissoo', ((670, 2848, 3670, 9696, 9748, 12743, 13335), (1, 1, 1, 1, 1, 1, 1)), 7]
[2, Timestamp('2011-01-03 00:00:00'), '1294034400.0', 467, 'credit assessment with random forests', ((1401, 3422, 5574, 11527, 15123), (1, 1, 1, 1, 1)), 5]
[3, Timestamp('2011-01-12 00:00:00'), '1294812000.0', 2562, 'synthesis and antiproliferative activities against hep-g2 of salicylanide derivatives: potent inhibitors of the epidermal growth factor receptor (egfr) tyrosine kinase', ((649, 786, 1067, 1189, 3897, 4584, 4885, 5217, 5730, 6113, 6322, 7054, 7526, 9696, 10948, 11622, 12201, 13658, 13932, 14502), (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1)), 21]
[4, Timestamp('2011-01-14 00:00:00')