# Import libs

In [4]:
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import inflect

# Load dataset

In [5]:
def load_data(dts):
    path = 'MuP_dataset/'+dts+'_complete.jsonl'
    with open(path, 'r') as json_file:
        json_list = list(json_file)
        col_name = ["paper_id", "paper","summary"]
    dataset = pd.DataFrame(columns=col_name)
    for json_str in json_list[:]:
        result = json.loads(json_str)
        df = pd.DataFrame([[result["paper_id"], result['paper'], result["summary"]]], columns=col_name)
        dataset = pd.concat([dataset,df])
    return dataset

In [6]:
df_train = load_data("training")
print(len(df_train))
df_train.reset_index(inplace=True, drop=True)
df_train.head()

18934


Unnamed: 0,paper_id,paper,summary
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...,This paper investigates kernel ridge-less regr...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...,This paper presents a novel way of making full...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...,This paper proposes a new framework that compu...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...,This work proposes a deep reinforcement learni...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...,This paper proposes 3 deep generative models b...


## Drop summary column

In [7]:
df_train.drop(['summary'], axis=1, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
18929,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
18930,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18931,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18932,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,{'abstractText': 'Thanks to graph neural netwo...


## Drop duplicated values

In [8]:
df_train.drop_duplicates(subset=['paper_id'],inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
18924,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
18926,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
18928,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
18930,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


In [22]:
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
8374,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
8375,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
8376,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
8377,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


## Example sections

In [9]:
for i in range(2):
    for sec in ((df_train['paper'][i])['sections']):
        print((sec)['heading'])
        # print(">"*10, sec['text'][:40])
    print("="*100)


1 INTRODUCTION
2 STATISTICAL LEARNING AND EMPIRICAL RISK MINIMIZATION
2.1 KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION
3 ERROR BOUNDS VIA STABILITY
4.1 KEY LEMMAS
4.2 PROOF OF LEMMA 11
5 REMARK AND RELATED WORK
6 CONCLUSIONS
A EXCESS RISK, GENERALIZATION, AND STABILITY
1 INTRODUCTION
2 A NEW PERSPECTIVE OF REDUCING DIVERSITY OF GRADIENTS
2.1 THE SOURCE OF GRADIENT DIVERSITY
2.2 CONNECTING DEEP METRIC LEARNING TO CONTINUAL LEARNING
3 DISCRIMINATIVE REPRESENTATION LOSS
4 ONLINE MEMORY UPDATE AND BALANCED EXPERIENCE REPLAY
5 EXPERIMENTS
6 CONCLUSION
A PROOF OF THEOREMS
B ALGORITHMS OF ONLINE MEMORY UPDATE
C DEFINITION OF PERFORMANCE MEASURES
D RELATED METHODS FROM DML
E ABLATION STUDY ON DRL
F COMPARING DIFFERENT MEMORY SIZES
G COMPARING DIFFERENT REPLAY STRATEGY
H COMPARING TRAINING TIME
I HYPER-PARAMETERS IN EXPERIMENTS


## Sections extraction

In [30]:
def sec_ext(sec):
    text = sec['text']
    try:
        # sec['heading']=sec['heading'].upper()
        sec_split = ((sec)['heading'].upper()).split()
        if len(sec_split)==1:
            head_no = None
            head_title = sec_split[0]
        else:
            head_no = sec_split[0]
            head_title = " ".join(sec_split[1:])
    except:
        head_no = None
        head_title = None
    return head_no, head_title, text


In [31]:
df_paper = pd.DataFrame(columns=['paper_id', 'head_no', 'head_title', 'text'])
# df_in = df_train
full_len = len(df_train)
for ind, row in (df_train).iterrows():
    for sec in ((row['paper'])['sections']):
        head_no, head_title, text = sec_ext(sec)
        new_row = pd.DataFrame([[row['paper_id'], head_no, head_title, text]], columns=['paper_id', 'head_no', 'head_title', 'text'])
        df_paper = pd.concat([df_paper, new_row])
    print(str(ind)+'/'+str(full_len), end='\r')
    # break
df_paper

8378/8379

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


## Save CSV

In [32]:
df_paper.to_csv("visualization_data/paper_sections.csv")

# Load Extracted CSV

In [33]:
df_paper = pd.read_csv("visualization_data/paper_sections.csv", index_col=0)
df_paper

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
0,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


## Prepossing heading title

In [34]:
no = list(str(i) for i in range(10))
df_paper_main = df_paper[df_paper["head_no"].isin(no)]
df_paper_main.reset_index(drop=True, inplace=True)
df_paper_main[:10]

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSIONS,"In summary, minimizing a bound on cross valida..."
5,SP:b80bc890180934092cde037b49d94d6e4e06fad9,1,INTRODUCTION,"In the real world, we are often faced with sit..."
6,SP:b80bc890180934092cde037b49d94d6e4e06fad9,2,A NEW PERSPECTIVE OF REDUCING DIVERSITY OF GRA...,"According to Eq. (1), negative cosine similari..."
7,SP:b80bc890180934092cde037b49d94d6e4e06fad9,3,DISCRIMINATIVE REPRESENTATION LOSS,"Based on our findings in the above section, we..."
8,SP:b80bc890180934092cde037b49d94d6e4e06fad9,4,ONLINE MEMORY UPDATE AND BALANCED EXPERIENCE R...,We follow the online setting of continual lear...
9,SP:b80bc890180934092cde037b49d94d6e4e06fad9,5,EXPERIMENTS,In this section we evaluate our methods on mul...


In [35]:
p = inflect.engine()
prepro_list = [p.singular_noun(word) if p.singular_noun(word)!=False else word for word in df_paper_main['head_title']]
df_paper_main['head_title'] = prepro_list
df_paper_main[:10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper_main['head_title'] = prepro_list


Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSION,"In summary, minimizing a bound on cross valida..."
5,SP:b80bc890180934092cde037b49d94d6e4e06fad9,1,INTRODUCTION,"In the real world, we are often faced with sit..."
6,SP:b80bc890180934092cde037b49d94d6e4e06fad9,2,A NEW PERSPECTIVE OF REDUCING DIVERSITY OF GRA...,"According to Eq. (1), negative cosine similari..."
7,SP:b80bc890180934092cde037b49d94d6e4e06fad9,3,DISCRIMINATIVE REPRESENTATION LOS,"Based on our findings in the above section, we..."
8,SP:b80bc890180934092cde037b49d94d6e4e06fad9,4,ONLINE MEMORY UPDATE AND BALANCED EXPERIENCE R...,We follow the online setting of continual lear...
9,SP:b80bc890180934092cde037b49d94d6e4e06fad9,5,EXPERIMENT,In this section we evaluate our methods on mul...


In [36]:
df_heading = pd.DataFrame(df_paper_main.groupby("head_title")["head_title"].count())
df_heading.columns = ["count"]
df_heading.reset_index(inplace=True)
df_heading.sort_values('count', ascending=False, inplace=True)
df_heading.reset_index(drop=True, inplace=True)
df_heading

Unnamed: 0,head_title,count
0,INTRODUCTION,7875
1,CONCLUSION,5372
2,RELATED WORK,4935
3,EXPERIMENT,4503
4,DISCUSSION,1043
...,...,...
13005,EXPERIMENT DETAIL,1
13006,EXPERIMENT FOR CROSS-LINGUAL TRANSFER,1
13007,EXPERIMENT FOR DIRECTED HYPERLINK PREDICTION,1
13008,EXPERIMENT FOR EVALUATING KD+,1


In [82]:
df_heading[df_heading['head_title'].str.contains(r'SETUP*')]

Unnamed: 0,head_title,count
26,EXPERIMENTAL SETUP,138
34,PROBLEM SETUP,79
63,SETUP,27
142,PROBLEM SETUP AND PRELIMINARY,8
181,EXPERIMENT SETUP,6
...,...,...
12882,EXPERIMENTAL SETUP AND METRIC,1
12883,EXPERIMENTAL SETUP FOR GRAPH CLASSIFICATION TASKS,1
12884,EXPERIMENTAL SETUP FOR IMAGE RESTORATION,1
12885,EXPERIMENTAL SETUP FOR IMAGE SEGMENTATION,1


In [95]:
introduction_key = ["INTRODUCTION", "PROBLEM", "MOTIVATION", "PRELIMINARY", "OVERVIEW"]
literature_key = ["RELATED WORK", "BACKGROUND", "THEORY", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "REVIEW", "BASELINE"]
method_key = ["APPROACH", "MODEL", "ALGORITHM", "METHOD", "DATASET", "SETUP", "SETTING", "FRAMEWORK", "ARCHITECTURE", "IMPLEMENTATION", "CONTRIBUTION"]
result_key =["EXPERIMENT", "RESULT", "EVALUATION", "ABLATION STUDY"]
discuss_key = ["DISCUSSION", "LIMITATION", "ANALYSI"]
conclusion_key = ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"]
additional_key = ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"]

In [96]:

# other = ["REPRODUCIBILITY", "SIMULATION", "EXAMPLE", "FORMULATION", "LEARNING", "TASK"]
# pattern = r'CONCLU*|INTRODUCTION*|FUTURE*|WORK*|DISCUSSION*|ACKNOWLEDGMENT*|ACKNOWLEDGEMENT*|APPROACH*|MODEL*|ANALYSI*|ALGORITHM*|RELATED*|EXPERIMENT*|METHOD*|\
#     |PRELIMINARY*|BACKGROUND*|RESULT*|EVALUATION*|APPENDIX*|ABLATION STUDY*|LIMITATION*|PROBLEM*|STATEMENT*|\
#     |APPLICATION*|THEORY*|MOTIVATION*|IMPACT*|METHOD*|REPRODUCIBILITY*|SUMMARY*|DATASET*|STUDY*|SETUP*|\
#     |SIMULATION*|FRAMEWORK*|IMPLEMENTATION*|NOTATION*|SETTING*|ARCHITECTURE*|PREVIOUS WORK*|PRIOR WORK*|REVIEW*|LITERATURE*|\
#     |EXTENSION*|EXAMPLE*|FORMULATION*|OVERVIEW*|REMARK*|LEARNING*|ENVIRONMENT*|BASELINE*|CONTRIBUTION*|TASK*|MATERIAL*'
# mask = df_heading['head_title'].str.contains(pattern)
# n = 30*0
# df_heading[~mask][n:n+30]

In [156]:
def pattern(keys):
    pattern_str = r''
    for key in keys:
        pattern_str += key + "*|"
    return pattern_str[:-1]
pattern(introduction_key)

def count_sec(keys, col_name):
    mask = df_paper_main['head_title'].str.contains(pattern(keys))
    df_count = pd.DataFrame(df_paper_main[mask].groupby("paper_id")["paper_id"].count())
    df_count.columns = [col_name]
    # intro_count.reset_index(inplace=True)
    return df_count


In [166]:
df_section_group = pd.DataFrame(index=df_train["paper_id"])
keys_list = [introduction_key, literature_key, method_key, result_key, discuss_key, conclusion_key]
keys_name = ["intro", "liture", "method", "result", "discuss", "conclude"]
for idx, keys in enumerate(keys_list):
    # print(keys)
    df_count = count_sec(keys, keys_name[idx])
    df_section_group = df_section_group.merge(df_count, on="paper_id", how='outer', suffixes=('_1', '_2'))
# df_section_group.fillna(0, inplace=True)
df_section_group


Unnamed: 0_level_0,intro,liture,method,result,discuss,conclude
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1.0,1.0,,,,1.0
SP:b80bc890180934092cde037b49d94d6e4e06fad9,1.0,,,1.0,,1.0
SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,1.0,1.0,,1.0,,1.0
SP:a1e2218e6943bf138aeb359e23628676b396ed66,2.0,,,1.0,1.0,1.0
SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,1.0,1.0,1.0,1.0,,1.0
...,...,...,...,...,...,...
SP:77d59e1e726172184249bdfdd81011617dc9c208,2.0,,,,,
SP:e58dc2d21175a62499405b7f4c3a03b135530838,2.0,,1.0,,1.0,1.0
SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,1.0,1.0,1.0,1.0,1.0,1.0
SP:4706017e6f8b958c7d0825fed98b285ea2994b59,1.0,1.0,1.0,1.0,1.0,1.0


In [168]:
df_section_group.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8379 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   intro     7989 non-null   float64
 1   liture    6208 non-null   float64
 2   method    4508 non-null   float64
 3   result    6987 non-null   float64
 4   discuss   2675 non-null   float64
 5   conclude  6959 non-null   float64
dtypes: float64(6)
memory usage: 458.2+ KB


In [151]:
# df_section_group_new = df_section_group.reset_index(drop=True)
# df_section_group_new = pd.concat([df_section_group, intro_count], axis=1, join='outer')
df_section_group_new = df_section_group.merge(intro_count, on="paper_id", how='outer', suffixes=('_1', '_2'))
df_section_group_new[df_section_group_new["intro"]==None]

Unnamed: 0_level_0,intro
paper_id,Unnamed: 1_level_1


In [153]:
df_section_group_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8379 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   intro   7989 non-null   float64
dtypes: float64(1)
memory usage: 130.9+ KB
