# Import libs

In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import inflect

In [2]:
text = "Hello world \r1)Introduction"
print(text)
print(text.split("\r"))

1)Introduction
['Hello world ', '1)Introduction']


# Load dataset

In [3]:
RAWDATAFILES = {
    "train": "training_complete.jsonl",
    "val": "validation_complete.jsonl",
    "test": "testing_with_paper_release.jsonl"
}
def print_progress(curr, full, desc='', bar_size=30):    
    bar = int((curr+1)/full*bar_size)
    sys.stdout.write(f"\r{desc}[{'='*bar}{' '*(bar_size-bar)}] {curr+1}/{full}")
    # sys.stdout.flush()
    if curr+1==full: print()
def load_data(data_split):
    main_path = str((Path().absolute()).parents[0])    
    filepath = f"{main_path}/dataset_MuP/{RAWDATAFILES[data_split]}"
    with open(filepath, 'r') as json_file:
        json_list = list(json_file)
        col_name = ["paper_id", "paper","summary"]
    dataset = []
    data_len = len(json_list)
    for i, json_str in enumerate(json_list[:]):
        result = json.loads(json_str)
        dataset.append({
            "paper_id": result["paper_id"],
            "paper": result["paper"],
            "summary": result["summary"],
        })
        print_progress(i, data_len, bar_size=50)
    return pd.DataFrame(dataset)

In [4]:
df_train = load_data("train")
# print(len(df_train))
df_train.reset_index(inplace=True, drop=True)
df_train.head()



Unnamed: 0,paper_id,paper,summary
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...,This paper investigates kernel ridge-less regr...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...,This paper presents a novel way of making full...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...,This paper proposes a new framework that compu...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...,This work proposes a deep reinforcement learni...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...,This paper proposes 3 deep generative models b...


## Drop summary column

In [5]:
df_train.drop(['summary'], axis=1, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
18929,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
18930,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18931,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18932,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,{'abstractText': 'Thanks to graph neural netwo...


## Drop duplicated values

In [6]:
df_train.drop_duplicates(subset=['paper_id'],inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
18924,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
18926,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
18928,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
18930,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


In [7]:
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
8374,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
8375,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
8376,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
8377,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


## Example sections

In [8]:
for i in range(2):
    for sec in ((df_train['paper'][i])['sections']):
        print((sec)['heading'])
        # print(">"*10, sec['text'][:40])
    print("="*100)


1 INTRODUCTION
2 STATISTICAL LEARNING AND EMPIRICAL RISK MINIMIZATION
2.1 KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION
3 ERROR BOUNDS VIA STABILITY
4.1 KEY LEMMAS
4.2 PROOF OF LEMMA 11
5 REMARK AND RELATED WORK
6 CONCLUSIONS
A EXCESS RISK, GENERALIZATION, AND STABILITY
1 INTRODUCTION
2 A NEW PERSPECTIVE OF REDUCING DIVERSITY OF GRADIENTS
2.1 THE SOURCE OF GRADIENT DIVERSITY
2.2 CONNECTING DEEP METRIC LEARNING TO CONTINUAL LEARNING
3 DISCRIMINATIVE REPRESENTATION LOSS
4 ONLINE MEMORY UPDATE AND BALANCED EXPERIENCE REPLAY
5 EXPERIMENTS
6 CONCLUSION
A PROOF OF THEOREMS
B ALGORITHMS OF ONLINE MEMORY UPDATE
C DEFINITION OF PERFORMANCE MEASURES
D RELATED METHODS FROM DML
E ABLATION STUDY ON DRL
F COMPARING DIFFERENT MEMORY SIZES
G COMPARING DIFFERENT REPLAY STRATEGY
H COMPARING TRAINING TIME
I HYPER-PARAMETERS IN EXPERIMENTS


## Sections extraction

In [9]:
def section_extraction(sec):
    text = sec['text']
    try:
        # sec['heading']=sec['heading'].upper()
        sec_split = ((sec)['heading'].upper()).split()
        if len(sec_split)==1:
            head_no = None
            head_title = sec_split[0]
        else:
            head_no = sec_split[0]
            head_title = " ".join(sec_split[1:])
    except:
        head_no = None
        head_title = None
    return head_no, head_title, text

def get_ext_section(df):
    full_len = len(df)
    sections_data = [] 
    one_section_papers = []
    for ind, row in (df).iterrows():
        sections = ((row['paper'])['sections'])
        if len(sections)<=1: 
            one_section_papers.append(row['paper_id'])
            continue
        for sec in sections:
            head_no, head_title, text = section_extraction(sec)
            if head_no is not None:
                while head_no[-1] in ['.','/',',',')']:
                    head_no = head_no[:-1]
            sections_data.append({
                'paper_id': row['paper_id'],
                'head_no': head_no,
                'head_title': head_title,
                'text': text,
            })   
        print_progress(ind, full_len, bar_size=50)
    return pd.DataFrame(sections_data)

In [10]:
df_paper = get_ext_section(df_train)
df_paper



Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
148542,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
148543,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
148544,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
148545,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


## Save CSV

In [11]:
# file_dir = 'visualization_data
# if not(Path(log_dir).exists()): os.system(f"mkdir -p {log_dir}")
# df_paper.to_csv(f"{file_dir}/paper_sections.csv")

# Load Extracted CSV

In [12]:
# df_paper = pd.read_csv("visualization_data/paper_sections.csv", index_col=0)
# df_paper

## Prepossing heading title

In [13]:
no = list(str(i) for i in range(10))
df_paper_main = df_paper[df_paper["head_no"].isin(no)]
df_paper_main.reset_index(drop=True, inplace=True)
df_paper_main

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSIONS,"In summary, minimizing a bound on cross valida..."
...,...,...,...,...
48456,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,2,RELATED WORK,Epistemic Uncertainty in Bayesian Deep Learnin...
48457,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,3,PROPOSED APPROACH,Now we define the problem of uncertainty-aware...
48458,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,4,EXPERIMENTS,"In this section, we describe our experimental ..."
48459,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,5,UNCERTAINTY EXPERIMENT AND ANALYSIS,"In Section 4, we showed that our S-BGNN-T impr..."


### Try to set df_paper_main with all sections

In [132]:
df_paper_main = df_paper[df_paper['head_title'].notna()]
df_paper_main.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144299 entries, 0 to 148546
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   paper_id    144299 non-null  object
 1   head_no     144299 non-null  object
 2   head_title  144299 non-null  object
 3   text        144299 non-null  object
dtypes: object(4)
memory usage: 9.5+ MB


In [15]:
df_paper_main

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
148542,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
148543,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
148544,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
148545,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


### Continue

In [16]:
p = inflect.engine()
prepro_list = [p.singular_noun(word) if p.singular_noun(word)!=False else word for word in df_paper_main['head_title']]
df_paper_main['head_title'] = prepro_list
df_paper_main[:10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper_main['head_title'] = prepro_list


Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMA,In order to prove Theorem 7 we make use of the...
5,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.2,PROOF OF LEMMA 11,We can write any interpolating solution to the...
6,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
7,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSION,"In summary, minimizing a bound on cross valida..."
8,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,A,"EXCESS RISK, GENERALIZATION, AND STABILITY",We use the same notation as introduced in Sect...
9,SP:b80bc890180934092cde037b49d94d6e4e06fad9,1,INTRODUCTION,"In the real world, we are often faced with sit..."


In [17]:
df_heading = pd.DataFrame(df_paper_main.groupby("head_title")["head_title"].count())
df_heading.columns = ["count"]
df_heading.reset_index(inplace=True)
df_heading.sort_values('count', ascending=False, inplace=True)
df_heading.reset_index(drop=True, inplace=True)
df_heading

Unnamed: 0,head_title,count
0,INTRODUCTION,7923
1,RELATED WORK,5533
2,CONCLUSION,5453
3,EXPERIMENT,4733
4,APPENDIX,1559
...,...,...
79476,EXACT ANALYSI WITH TOY MODEL,1
79477,EX-POST DENSITY ESTIMATION,1
79478,EWMA COMPARISON,1
79479,EWC,1


In [18]:
df_heading[df_heading['head_title'].str.contains(r'SETUP*')]

Unnamed: 0,head_title,count
13,EXPERIMENTAL SETUP,801
33,SETUP,228
47,EXPERIMENT SETUP,160
56,PROBLEM SETUP,137
273,TRAINING SETUP,19
...,...,...
78361,EXPERIEMNTAL SETUP,1
78391,EXPERIMEMT SETUP,1
78634,EXPERIMENT FOR OTHER SETUPS,1
78996,EVALUATION SETUP AND METRIC,1


Set section's keys adapted from *paper:*

In [19]:
section_keys = {
    "intro":    ["INTRODUCTION", "PROBLEM", "MOTIVATION", "PRELIMINARY", "OVERVIEW"],
    "liture":   ["RELATED WORK", "BACKGROUND", "THEORY", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "BASELINE"],
    "method":   ["APPROACH", "TECHNIQUE", "MODEL", "ALGORITHM", "METHOD", "DATASET", "SETUP", "SETTING", "FRAMEWORK", "ARCHITECTURE", "IMPLEMENTATION", "CONTRIBUTION"],
    "result":   ["EXPERIMENT", "RESULT", "EVALUATION", "ABLATION STUDY"],
    "discuss":  ["DISCUSSION", "LIMITATION", "ANALYS"],
    "conclude": ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"],
    "addition": ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"],
}
introduction_key = ["INTRODUCTION", "PROBLEM", "MOTIVATION", "PRELIMINARY", "OVERVIEW"]
literature_key = ["RELATED WORK", "BACKGROUND", "THEORY", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "BASELINE"]
method_key = ["APPROACH", "TECHNIQUE", "MODEL", "ALGORITHM", "METHOD", "DATASET", "SETUP", "SETTING", "FRAMEWORK", "ARCHITECTURE", "IMPLEMENTATION", "CONTRIBUTION"]
result_key =["EXPERIMENT", "RESULT", "EVALUATION", "ABLATION STUDY"]
discuss_key = ["DISCUSSION", "LIMITATION", "ANALYS"]
conclusion_key = ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"]
additional_key = ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"]

In [20]:

# other = ["REPRODUCIBILITY", "SIMULATION", "EXAMPLE", "FORMULATION", "LEARNING", "TASK"]
# pattern = r'CONCLU*|INTRODUCTION*|FUTURE*|WORK*|DISCUSSION*|ACKNOWLEDGMENT*|ACKNOWLEDGEMENT*|APPROACH*|MODEL*|ANALYSI*|ALGORITHM*|RELATED*|EXPERIMENT*|METHOD*|\
#     |PRELIMINARY*|BACKGROUND*|RESULT*|EVALUATION*|APPENDIX*|ABLATION STUDY*|LIMITATION*|PROBLEM*|STATEMENT*|\
#     |APPLICATION*|THEORY*|MOTIVATION*|IMPACT*|METHOD*|REPRODUCIBILITY*|SUMMARY*|DATASET*|STUDY*|SETUP*|\
#     |SIMULATION*|FRAMEWORK*|IMPLEMENTATION*|NOTATION*|SETTING*|ARCHITECTURE*|PREVIOUS WORK*|PRIOR WORK*|REVIEW*|LITERATURE*|\
#     |EXTENSION*|EXAMPLE*|FORMULATION*|OVERVIEW*|REMARK*|LEARNING*|ENVIRONMENT*|BASELINE*|CONTRIBUTION*|TASK*|MATERIAL*'
# mask = df_heading['head_title'].str.contains(pattern)
# n = 30*0
# df_heading[~mask][n:n+30]

In [21]:
def pattern(keys):
    pattern_str = r''
    for key in keys:
        pattern_str += key + "*|"
    return pattern_str[:-1]

def count_sec(keys, col_name):
    mask = df_paper_main['head_title'].str.contains(pattern(keys))
    df_count = pd.DataFrame(df_paper_main[mask].groupby("paper_id")["paper_id"].count())
    df_count.columns = [col_name]
    # intro_count.reset_index(inplace=True)
    return df_count


In [22]:
paper_id_main = list(df_paper_main['paper_id'].drop_duplicates())

### Extract section

In [23]:
# for k, v in section_keys.items():
#     mask = df_paper_main['head_title'].str.contains(pattern(v))
# df_tag = df_paper_main.copy()
keys = section_keys["intro"]
mask = df_paper_main['head_title'].str.contains(pattern(keys))    
df_intro = df_paper_main[mask]
df_intro = df_intro[df_intro['text']!=''].reset_index(drop=True)
paper_intro = list(set(df_intro['paper_id']))
intro_idx = []
i = 0
for paper_id in paper_intro:
    paper_df = df_intro[df_intro['paper_id']==paper_id]
    intro_idx.append(paper_df.index[0])
    # i += 1
    # if i>3: break
    # if len(paper_df)>1:
    #     min_head_no = min(list(paper_df['head_no']))
    #     intro_idx
    #     print(paper_df.index[0])
    #     # print(min_head_no)
    #     break
    # else:
    #     intro_idx.append(0)
    #     break
len(intro_idx)

8038

In [28]:
df_intro_final = df_intro.iloc[intro_idx]
head_title_intro_count = {}
for head_title in df_intro_final['head_title']:
    if head_title not in head_title_intro_count.keys():
        head_title_intro_count[head_title] = 1
    else:
        head_title_intro_count[head_title] += 1
print(head_title_intro_count)
df_intro[df_intro['head_title']=='MAIN RESULT AND OVERVIEW']
# df_paper_main[df_paper_main['paper_id']=='SP:c00f6a4198816665d335df1c8210dc612fa6443f']

{'INTRODUCTION': 7889, 'INTRODUCTION AND MOTIVATION': 14, 'MOTIVATION': 12, 'PROBLEM SETUP': 2, 'THEORETICAL MOTIVATION: WHY TWO-TIME-SCALE MODELS?': 1, 'INTRODUCTION AND CONTRIBUTION': 1, 'INTRODUCTION:': 1, 'PRELIMINARY': 21, 'INTRODUCTION AND SETTING': 1, 'PROBLEM FORMULATION': 6, 'INTRODUCTION AND BACKGROUND': 6, 'INTRODUCTION AND LITERATURE REVIEW': 1, 'ARCHITECTURAL OVERVIEW': 1, 'LINEAR PROBLEM': 1, 'INTRODUCTION AND RELATED WORK': 25, 'INTRODUCTION: STATE OF THE ART': 1, 'BACKGROUND AND MOTIVATION': 4, 'BACKGROUND AND PROBLEM STATEMENT': 1, 'INTRODUCTION TO BANDIT MULTIPLE HYPOTHESIS TESTING': 1, 'PROBLEM STATEMENT': 4, 'BACKGROUND AND PRELIMINARY': 1, 'PRELIMINARIES AND PROBLEM DEFINITION': 1, 'GEOMOL HIGH-LEVEL OVERVIEW': 1, 'PRELIMINARIES: PLANNING IN PARTIALLY-REVEALED ENVIRONMENTS': 1, 'INTRODUCTION & MOTIVATION': 2, 'OVERVIEW OVER LOSS FUNCTIONS': 1, 'PROBLEM DEFINITION': 1, 'MOTIVATION AND CONTRIBUTION': 3, 'PRELIMINARY ON KNOWLEDGE DISTILLATION': 1, 'INTRODUCTION AND MA

Unnamed: 0,paper_id,head_no,head_title,text
10219,SP:ed67cfd40308aa1fcadb6289553946a45da16473,1,MAIN RESULT AND OVERVIEW,Consider functions computed by a single ReLU l...


In [142]:
df_intro_final

Unnamed: 0,paper_id,head_no,head_title,text
3553,SP:0856c81319cd9dbbefc91f8357a04c664bd084b5,1,INTRODUCTION,Neural architecture search (NAS) describes the...
11391,SP:94d8eb827399e58de2a0aed1e5d3a1d629d7fcf7,1,INTRODUCTION,Advances in Deep Reinforcement Learning (RL) h...
5237,SP:ee5f7316d51bcb8e09c19ee565498cc13f6349f3,1,INTRODUCTION,"In classification tasks, for input x, we appro..."
2881,SP:d713225fa41061a2ad23a072786c21f066b2a777,1,INTRODUCTION,Geometric deep learning refers to the developm...
6410,SP:6efb95d8994e07d9c4482ca601df9019a0df93a6,1,INTRODUCTION,Reinforcement learning (RL) for continuous act...
...,...,...,...,...
6533,SP:6e0affde9a52c2dbc458ef3f877f5ffa2790f77b,1,INTRODUCTION,Since the beginning of the recent wave of deep...
1284,SP:f485de73661d59efd25025ddf9778652edb306c1,1,INTRODUCTION,For robotic applications deployed in the real ...
1487,SP:9bc80503d9771b780501b2dacac2cc37e4f5cd95,1,INTRODUCTION,A molecule’s three-dimensional (3D) shape is c...
251,SP:09bbd1a342033a65e751a8878c23e3fa6facc636,1,INTRODUCTION,One of the most important distinctions between...


In [37]:
def len_paragraph(text):
    return len(text.split("\n"))
len_paragraph(df_intro_final.reset_index().iloc[0]["text"])

5

In [41]:
df_intro_final["paragraphs"] = df_intro_final["text"].apply(len_paragraph)
df_intro_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_intro_final["paragraphs"] = df_intro_final["text"].apply(len_paragraph)


Unnamed: 0,paper_id,head_no,head_title,text,paragraphs
9717,SP:cbb2ef0a292693fffd77e7a95183e4725bf21dcd,1,INTRODUCTION,Learning new concepts and skills from a small ...,5
6237,SP:607bb2cfb36e19ba87c6d973da0d03b23eb1a445,1,INTRODUCTION,Batch Normalization (BN) is an indispensable c...,4
6647,SP:119ec5a7b1bc981afd4d248e4643a0f0b3d49c3c,1,INTRODUCTION,Consider the castle made out of toy blocks in ...,5
2091,SP:6bfdc3596045227aaed04a50cd934e5d4bc1e9ad,1,INTRODUCTION,Recent research has shown that deep neural net...,7
5575,SP:685bfac3b09438a4669b0d581a8eafdf73a81cc5,1,INTRODUCTION,Learning a good representation for high-dimens...,10
...,...,...,...,...,...
6836,SP:452600e23747ac98ed7513304b5a008d8ee278bf,1,INTRODUCTION,Neural networks are one of the most powerful m...,8
8460,SP:68815f20a6f2092c3ac14949d4295b9849fec002,1,INTRODUCTION,Canonical correlation analysis (CCA) is a clas...,14
9271,SP:db15839ac53280a4ef57ee472565c137d975f52e,1,INTRODUCTION,Animals play an important role in our everyday...,6
2540,SP:82c52fe144129e913a29248317952ac0fb520ffe,2,PRELIMINARIES AND IRM,"Notations. We use lowercase (e.g., x), upperca...",7


### Count section catagories

In [42]:
df_section_group = pd.DataFrame(index=paper_id_main)
df_section_group.index.name='paper_id'

for k, v in section_keys.items():
    df_count = count_sec(v, k)
    df_section_group = df_section_group.merge(df_count, on="paper_id", how='outer', suffixes=('_1', '_2'))

df_section_group

Unnamed: 0_level_0,intro,liture,method,result,discuss,conclude,addition
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1.0,1.0,,,,1.0,1.0
SP:b80bc890180934092cde037b49d94d6e4e06fad9,1.0,1.0,2.0,3.0,,1.0,
SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,1.0,1.0,6.0,6.0,,1.0,1.0
SP:a1e2218e6943bf138aeb359e23628676b396ed66,2.0,,,1.0,2.0,1.0,1.0
SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,1.0,2.0,3.0,1.0,,1.0,1.0
...,...,...,...,...,...,...,...
SP:77d59e1e726172184249bdfdd81011617dc9c208,2.0,,3.0,,,,
SP:e58dc2d21175a62499405b7f4c3a03b135530838,2.0,,2.0,3.0,2.0,1.0,
SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,1.0,1.0,1.0,1.0,1.0,1.0,
SP:4706017e6f8b958c7d0825fed98b285ea2994b59,1.0,1.0,1.0,1.0,1.0,1.0,


In [52]:
df_paper_main

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMA,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
148542,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
148543,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATION FOR UNCERTAINTY MEASURES AND KL DIV...,This appendix provides the derivations and sho...
148544,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURE,Vacuity uncertainty of Bayesian Graph neural n...
148545,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


In [43]:
df_section_group[df_section_group['conclude'].isna()]

Unnamed: 0_level_0,intro,liture,method,result,discuss,conclude,addition
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SP:a20769de2c7acf390c7e3bece904a17df6a991bd,1.0,2.0,,1.0,1.0,,1.0
SP:95ba9ad102adafaabf9671737e6549728d104629,1.0,1.0,1.0,1.0,1.0,,
SP:797b07cd8142a35333037bb573db0dfe5dde65ac,1.0,,3.0,7.0,,,
SP:72d1283f3602edc22896934271fcec5b03f25d9e,2.0,,1.0,5.0,,,1.0
SP:c83ecc74eb885df5f29e5a7080a8c60d1ee0a3b0,1.0,,1.0,2.0,1.0,,1.0
...,...,...,...,...,...,...,...
SP:7cd001a35175d8565c046093dcf070ba7fa988d6,2.0,,4.0,3.0,1.0,,2.0
SP:67c44f33dff59e4d218f753fdbc6296da62cdf62,2.0,3.0,5.0,1.0,1.0,,2.0
SP:62a75399aa97a61432385cf1dffabb674741a18a,1.0,1.0,4.0,1.0,1.0,,1.0
SP:76a052062e3e4bb707b24a8809c220c8ac1df83a,1.0,1.0,4.0,1.0,1.0,,1.0


In [402]:
df_section_group.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8232 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   intro     8058 non-null   float64
 1   liture    7177 non-null   float64
 2   method    7160 non-null   float64
 3   result    7664 non-null   float64
 4   discuss   4391 non-null   float64
 5   conclude  7117 non-null   float64
dtypes: float64(6)
memory usage: 450.2+ KB


In [410]:
No_sec_paper = list(df_section_group[df_section_group['conclude'].isna()].index)
df_paper[df_paper['paper_id']==No_sec_paper[6]]

Unnamed: 0,paper_id,head_no,head_title,text
1210,SP:a8bb14b514e474691be63b51582544a9befa7125,1,INTRODUCTION,The majority of pruning algorithms for Deep Ne...
1211,SP:a8bb14b514e474691be63b51582544a9befa7125,2,RELATED WORK,Pruning trained models Most of the pruning wor...
1212,SP:a8bb14b514e474691be63b51582544a9befa7125,3,PROBLEM FORMULATION: PRUNING AT INITIALIZATION,"Given a dataset D = {(xi,yi)}ni=1, the trainin..."
1213,SP:a8bb14b514e474691be63b51582544a9befa7125,4,FORESIGHT CONNECTION SENSITIVITY,Since removing connections of a neural network...
1214,SP:a8bb14b514e474691be63b51582544a9befa7125,5,EXPERIMENTS,In the following we evaluate the efficacy of o...
1215,SP:a8bb14b514e474691be63b51582544a9befa7125,5.1,RESULTS ON CIFAR-10,Fig 2 compares the accuracy of the described i...
1216,SP:a8bb14b514e474691be63b51582544a9befa7125,5.2,RESULTS ON LARGER DATASETS,We now present experiments on large datasets. ...
1217,SP:a8bb14b514e474691be63b51582544a9befa7125,5.3,ANALYSIS,Saliency optimization To experimentally valida...
1218,SP:a8bb14b514e474691be63b51582544a9befa7125,6,DISCUSSION,Pruning at initialization has become an active...
1219,SP:a8bb14b514e474691be63b51582544a9befa7125,,ACKNOWLEDGMENTS,This work was supported by the Royal Academy o...


In [379]:
def no_section_info(section):
    No_sec_paper = list(df_section_group[df_section_group[section].isna()].index)
    return [list(df_paper[df_paper['paper_id']==paper]['head_title']) for paper in No_sec_paper]
no_con = no_section_info('conclude')
len(no_con)

1194

In [380]:
for i, sections in enumerate(no_con):
    print('='*50)
    print("\n".join(sections))
    if i>=20: break

INTRODUCTION
INFORMATION LATTICE: ABSTRACTIONS AND RULES OF A SIGNAL
INFORMATION LATTICE LEARNING (ILL)
PRACTICAL LATTICE CONSTRUCTION: TO START LIKE A BABY (PHASE I)
PRACTICAL LATTICE LEARNING: TO LEARN LIKE A CHILD (PHASE II)
ILL EXAMPLES
DISCUSSION: LIMITATIONS AND CHALLENGES
CONNECTION TO CONCEPT LATTICE
MORE GENERALIZED FORMALISM FOR INFORMATION LATTICE
MORE INSIGHTS ON THE SPECIAL LIFTING
EXISTING WORK ON SUBLATTICE GENERATION
MORE DETAILS ON THE CONSTRUCTION PHASE
MORE ANALYSES IN THE LEARNING PHASE
STUDIES ON ILL-BASED MUSIC APPLICATION
CONCLUSION AND BROADER IMPACTS


TypeError: sequence item 0: expected str instance, NoneType found

In [301]:
no_section_info('intro')[2]

['BACKGROUND',
 'INFERENCE IN DLVMS AFFECTED BY MNAR',
 'EXPERIMENT',
 'CONCLUSION']

In [44]:
# df_section_group_new = df_section_group.reset_index(drop=True)
# df_section_group_new = pd.concat([df_section_group, intro_count], axis=1, join='outer')
df_section_group_new = df_section_group.merge(intro_count, on="paper_id", how='outer', suffixes=('_1', '_2'))
df_section_group_new[df_section_group_new["intro"]==None]

NameError: name 'intro_count' is not defined

In [153]:
df_section_group_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8379 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   intro   7989 non-null   float64
dtypes: float64(1)
memory usage: 130.9+ KB


# Section identification for MuP dataset

## Reused function & lib

In [2]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import inflect

In [113]:
RAWDATAFILES = {
    "train": "training_complete.jsonl",
    "val": "validation_complete.jsonl",
    "test": "testing_with_paper_release.jsonl"
}
def print_progress(curr, full, desc='', bar_size=30):    
    bar = int((curr+1)/full*bar_size)
    sys.stdout.write(f"\r{desc}[{'='*bar}{' '*(bar_size-bar)}] {curr+1}/{full}")
    # sys.stdout.flush()
    if curr+1==full: print()
def load_data(data_split):
    main_path = str((Path().absolute()).parents[0])    
    filepath = f"{main_path}/dataset_MuP/{RAWDATAFILES[data_split]}"
    with open(filepath, 'r') as json_file:
        json_list = list(json_file)
        col_name = ["paper_id", "paper","summary"]
    dataset = []
    data_len = len(json_list)
    for i, json_str in enumerate(json_list[:]):
        result = json.loads(json_str)
        dataset.append({
            "paper_id": result["paper_id"],
            "paper": result["paper"],
            "summary": result["summary"],
        })
        print_progress(i, data_len, desc=f'Loading {data_split} data', bar_size=30)
    return pd.DataFrame(dataset)

In [51]:
def section_extraction(sec):
    text = sec['text']
    try:
        # sec['heading']=sec['heading'].upper()
        sec_split = ((sec)['heading'].upper()).split()
        if len(sec_split)==1:
            head_no = None
            head_title = sec_split[0]
        else:
            head_no = sec_split[0]
            head_title = " ".join(sec_split[1:])
    except:
        head_no = None
        head_title = None
    return head_no, head_title, text

def get_ext_section(df):
    full_len = len(df)
    sections_data = [] 
    one_section_papers = []
    for ind, row in (df).iterrows():
        sections = ((row['paper'])['sections'])
        if len(sections)<=1: 
            one_section_papers.append(row['paper_id'])
            continue
        for sec in sections:
            head_no, head_title, text = section_extraction(sec)
            if head_no is not None:
                while head_no[-1] in ['.','/',',',')']:
                    head_no = head_no[:-1]
            sections_data.append({
                'paper_id': row['paper_id'],
                'head_no': head_no,
                'head_title': head_title,
                'text': text,
            })   
    return pd.DataFrame(sections_data)

In [52]:
section_keys = {
    "introduction": ["INTRODUCTION", "PROBLEM", "MOTIVATION", "PRELIMINARY", "OVERVIEW"],
    "literature":   ["LITERATURE", "RELATED WORK", "BACKGROUND", "THEORY", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "BASELINE"],
    "method":       ["APPROACH", "TECHNIQUE", "MODEL", "ALGORITHM", "METHOD", "DATASET", "SETUP", "SETTING", "FRAMEWORK", "ARCHITECTURE", "IMPLEMENTATION", "CONTRIBUTION"],
    "result":       ["EXPERIMENT", "RESULT", "EVALUATION", "ABLATION STUDY"],
    "discussion":   ["DISCUSSION", "LIMITATION", "ANALYS"],
    "conclusion":   ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"],
    "addition":     ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"],
}
def pattern(keys):
    pattern_str = r''
    for key in keys:
        pattern_str += key + "*|"
    return pattern_str[:-1]

## New functions

In [82]:
def section_processes(df_section):
    # Remove Null 'head_title'
    df_section = df_section[df_section['head_title'].notna()]
    # Convert plural to singular noun
    p = inflect.engine()
    prepro_list = [p.singular_noun(word) if p.singular_noun(word)!=False else word for word in df_section['head_title']]
    # df_section['head_title'] = prepro_list
    df_section = df_section.drop(['head_title'], axis=1)
    df_section = df_section.assign(head_title = prepro_list)    
    return df_section

def filter_sec(df_section, keys):
    mask = df_section['head_title'].str.contains(pattern(keys))    
    df_filter = df_section[mask]
    df_filter = df_filter[df_filter['text']!=''].reset_index(drop=True)
    paperId_filter = list(set(df_filter['paper_id']))
    idx_filter = []
    i = 0
    for paper_id in paperId_filter:
        paper_df = df_filter[df_filter['paper_id']==paper_id]
        idx_filter.append(paper_df.index[0])
    return df_filter.iloc[idx_filter]

def export_data(data, output_dir, filename):
    print(f"Writing data to {output_dir+filename} file")
    if not(Path(output_dir).exists()): os.mkdir(output_dir)
    with open(output_dir+filename, 'w', encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print(f'Wrote {len(data)} records to {output_dir}{filename}')

In [128]:
def all_processes(data_split, sec_catagory=["introduction", 'conclusion'], export=False):
    df_main = load_data(data_split)
    df_paper = df_main.drop(['summary'], axis=1).drop_duplicates(subset=['paper_id'])
    
    print("Extracting section")
    df_section = get_ext_section(df_paper)
    print("Processing with extracted sections")
    df_section = section_processes(df_section)
    
    # Extract text from desired catagory
    df_cat_all = 0
    for catagory in sec_catagory:
        print(f"Extracting \"{catagory}\" catagory")
        df_cat = filter_sec(df_section, keys=section_keys[catagory])
        df_cat.rename(columns = {
            'text': f'{catagory}',
        }, inplace = True)
        df_cat.drop(['head_no', 'head_title'], axis=1, inplace = True)
        if isinstance(df_cat_all, int):
            df_cat_all = df_cat.copy()
        else:
            df_cat_all = df_cat_all.merge(df_cat, on="paper_id", how='outer')

    
    
    abstract = []
    for idx,row in df_paper.iterrows():
        abstract.append({
            'paper_id': row['paper_id'],
            'abstract': row['paper']['abstractText'],
        })
        # print(row['paper']['abstractText'])
    df_abs = pd.DataFrame(abstract)
    df_final = df_abs.merge(df_cat_all, on="paper_id", how='outer')
    
    # print(f">>>Remove {len(df_paper)-len(df_final)}/{len(df_paper)} incorrect papers")
    null_allCat = True
    for catagory in sec_catagory:
        null_cat = df_final[catagory].isna()
        null_allCat = null_allCat*null_cat
        print(f">>>There are {null_cat.sum()}/{len(df_final)} null value in {catagory} catagory")
    print(f">>>There are {null_allCat.sum()}/{len(df_final)} null value in all catagories")
        
        
    if export:
        dict_list = df_final.to_dict('records')
        main_path = str((Path().absolute()).parents[0])
        export_dir = f"{main_path}/dataset_MuP/"
        export_data(
            data=dict_list, 
            output_dir=export_dir, 
            filename=f'{data_split}_iden.jsonl',
        )
    


    return df_final


In [130]:
for data_split in RAWDATAFILES.keys():
    print(f"{'*'*30} Processing with {data_split} data {'*'*30}")
    df_cat_all = all_processes(
        data_split, 
        export=True
    )

****************************** Processing with train data ******************************
Extracting section
Processing with extracted sections
Extracting "introduction" catagory
Extracting "conclusion" catagory
>>>There are 341/8379 null value in introduction catagory
>>>There are 1271/8379 null value in conclusion catagory
>>>There are 219/8379 null value in all catagories
Writing data to /home/nopphawann/My_Thesis_Playground/dataset_MuP/train_iden.jsonl file
Wrote 8379 records to /home/nopphawann/My_Thesis_Playground/dataset_MuP/train_iden.jsonl
****************************** Processing with val data ******************************
Extracting section
Processing with extracted sections
Extracting "introduction" catagory
Extracting "conclusion" catagory
>>>There are 40/1060 null value in introduction catagory
>>>There are 163/1060 null value in conclusion catagory
>>>There are 21/1060 null value in all catagories
Writing data to /home/nopphawann/My_Thesis_Playground/dataset_MuP/val_iden

In [135]:
section = "abstract+introduction"
section = section.split("+")
section

['abstract', 'introduction']

In [95]:
dict_list = df_cat_all.to_dict('records')
dict_list[0]

{'paper_id': 'SP:b19df5243359791fbaad005d6f13d7e9fdb0ff63',
 'abstract': 'Role-based learning holds the promise of achieving scalable multi-agent learning by decomposing complex tasks using roles. However, it is largely unclear how to efficiently discover such a set of roles. To solve this problem, we propose to first decompose joint action spaces into restricted role action spaces by clustering actions according to their effects on the environment and other agents. Learning a role selector based on action effects makes role discovery much easier because it forms a bi-level learning hierarchy: the role selector searches in a smaller role space and at a lower temporal resolution, while role policies learn in significantly reduced primitive action-observation spaces. We further integrate information about action effects into the role policies to boost learning efficiency and policy generalization. By virtue of these advances, our method (1) outperforms the current state-of-the-art MARL a

In [146]:

main_path = str((Path().absolute()).parents[0])
filepath = f"{main_path}/dataset_MuP/{data_split}_iden.jsonl"
with open(filepath, 'r') as json_file:
    json_list = list(json_file)
dataset_list = []

for i, json_str in enumerate(json_list):
    data = json.loads(json_str)
    # print(data["conclusion"])
    if not isinstance(data["conclusion"], str): 
        print(i, "NONE")
        print(data["conclusion"])
    

11 NONE
nan
20 NONE
nan
44 NONE
nan
57 NONE
nan
61 NONE
nan
65 NONE
nan
68 NONE
nan
73 NONE
nan
76 NONE
nan
94 NONE
nan
100 NONE
nan
124 NONE
nan
141 NONE
nan
151 NONE
nan
159 NONE
nan
175 NONE
nan
176 NONE
nan
183 NONE
nan
188 NONE
nan
191 NONE
nan
193 NONE
nan
201 NONE
nan
202 NONE
nan
219 NONE
nan
223 NONE
nan
224 NONE
nan
236 NONE
nan
238 NONE
nan
249 NONE
nan
256 NONE
nan
264 NONE
nan
269 NONE
nan
278 NONE
nan
291 NONE
nan
296 NONE
nan
312 NONE
nan
321 NONE
nan
322 NONE
nan
324 NONE
nan
328 NONE
nan
332 NONE
nan
336 NONE
nan
337 NONE
nan
345 NONE
nan
373 NONE
nan
378 NONE
nan
390 NONE
nan
397 NONE
nan
403 NONE
nan
405 NONE
nan
431 NONE
nan
436 NONE
nan
441 NONE
nan
448 NONE
nan
450 NONE
nan
458 NONE
nan
471 NONE
nan
476 NONE
nan
481 NONE
nan
487 NONE
nan
496 NONE
nan
497 NONE
nan
535 NONE
nan
542 NONE
nan
554 NONE
nan
560 NONE
nan
567 NONE
nan
573 NONE
nan
576 NONE
nan
585 NONE
nan
600 NONE
nan
608 NONE
nan
614 NONE
nan
629 NONE
nan
640 NONE
nan
642 NONE
nan
658 NONE
nan
659 NONE


## Visualize data

In [32]:
import os
#comment this if you are not using puffer?
os.environ['http_proxy'] = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

from transformers import BartTokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

In [36]:
def count_paragraph(text):
    return len(text.split("\n"))
def count_tokens(text):
    tokens = tokenizer.encode(text, return_tensors='pt')
    return tokens.shape[1]

In [37]:
text = "We study the average CVloo stability of kernel ridge-less regression and derive corresponding risk bounds. We show that the interpolating solution with minimum norm minimizes a bound on CVloo stability, which in turn is controlled by the condition number of the empirical kernel matrix. The latter can be characterized in the asymptotic regime where both the dimension and cardinality of the data go to infinity. Under the assumption of random kernel matrices, the corresponding test error should be expected to follow a double descent curve."
print(len(text.split(" ")))
count_tokens(text)


85


104

In [15]:
df_cat_all

Unnamed: 0,paper_id,abstract,introduction,conclusion
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,We study the average CVloo stability of kernel...,Statistical learning theory studies the learni...,"In summary, minimizing a bound on cross valida..."
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,The use of episodic memories in continual lear...,"In the real world, we are often faced with sit...",The two fundamental problems of continual lear...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,Existing Multi-Task Learning(MTL) strategies l...,The process of Multi-Task Learning (MTL) on a ...,This work proposes a task-aware framework whic...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,This paper deals with the fuel optimization pr...,Hybrid electric vehicles powered by fuel cells...,"In this paper, we have proposed a robust concu..."
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,Our work is concerned with the generation and ...,There is an increasing interest in developing ...,In this work we propose the first graph-based ...
...,...,...,...,...
8155,SP:77d59e1e726172184249bdfdd81011617dc9c208,Quantum machine learning methods have the pote...,Data sets used for training machine learning m...,
8156,SP:e58dc2d21175a62499405b7f4c3a03b135530838,Trained generative models have shown remarkabl...,Generative deep neural networks have shown rem...,
8157,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,Discovering the underlying mathematical expres...,Understanding the mathematical relationships a...,We introduce an unconventional approach to sym...
8158,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,Some conventional transforms such as Discrete ...,Large Convolutional Neural Networks (CNNs) (Kr...,We propose the new PC layers through conventio...


In [23]:
paragraph = df_cat_all['introduction'].iloc[8].split("\n")
print(len(paragraph))
paragraph

5


['Primates perform well at generalization tasks. If presented with a single visual instance of an object, they often immediately can generalize and envision the object in different attributes, e.g., in different 3D pose (Logothetis et al., 1995). Primates can readily do so, as their previous knowledge allows them to be cognizant of attributes. Machines, by contrast, are most-commonly trained on sample features (e.g., pixels), not taking into consideration attributes that gave rise to those features.',
 'To aid machine cognition of visual object attributes, a class of algorithms focuses on learning disentangled representations (Kingma & Welling, 2014; Higgins et al., 2017; Burgess et al., 2018; Kim & Mnih, 2018; Chen et al., 2018), which map visual samples onto a latent space that separates the information belonging to different attributes. These methods show disentanglement by interpolating between attribute values (e.g., interpolate pose, etc). However, these methods usually process o

In [230]:
df_cat_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8160 entries, 0 to 8159
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   paper_id           8160 non-null   object
 1   introduction_text  8038 non-null   object
 2   conclusion_text    7108 non-null   object
 3   abstract           8160 non-null   object
dtypes: object(4)
memory usage: 318.8+ KB


In [189]:
df_cat_all['intro_text'].isna().sum()


122

In [191]:
df_cat_all['conclude_text'].isna().sum()


1052

# Section-based perspective extraction

In [77]:
p = inflect.engine()
df_paper.dropna(inplace=True)
prepro_list = [p.singular_noun(word) if p.singular_noun(word)!=False else word for word in df_paper['head_title']]
df_paper['head_title'] = prepro_list
df_paper[:10]

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMA,In order to prove Theorem 7 we make use of the...
5,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.2,PROOF OF LEMMA 11,We can write any interpolating solution to the...
6,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
7,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSION,"In summary, minimizing a bound on cross valida..."
8,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,A,"EXCESS RISK, GENERALIZATION, AND STABILITY",We use the same notation as introduced in Sect...
9,SP:b80bc890180934092cde037b49d94d6e4e06fad9,1,INTRODUCTION,"In the real world, we are often faced with sit..."


In [204]:
perspective_keys = {
    "introduction":  ["INTRODUCTION"],
    "research_gap":  ["LITERATURE", "RELATED WORK", "BACKGROUND", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "BASELINE"],
    "model":         ["APPROACH", "TECHNIQUE", "MODEL", "ALGORITHM", "METHOD", "FRAMEWORK", "ARCHITECTURE"],
    "experiment":    ["EXPERIMENT", "SETUP", "SETTING", "IMPLEMENTATION", "CONTRIBUTION"],
    "result_discuss":["RESULT", "EVALUATION", "ABLATION STUDY", "DISCUSSION", "LIMITATION", "ANALYS"],
    "conclusion":    ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"],
    "addition":      ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"],
}
def pattern(keys):
    pattern_str = r''
    for key in keys:
        pattern_str += key + "*|"
    return pattern_str[:-1]

In [205]:

df_paper_copy = df_paper.copy()
for sec, keys in perspective_keys.items():
    mask = (df_paper_copy['head_title'].str.contains(pattern(keys)))
    df_paper_copy[sec] = mask.astype(int)
sec_list = list(perspective_keys.keys())
df_paper_copy['sum_sec'] = df_paper_copy[sec_list].sum(axis=1)
drop_rows = (df_paper_copy[df_paper_copy['sum_sec']>1].index)
# df_paper_copy = df_paper.copy()
df_paper_copy.drop(drop_rows, inplace=True)
print(f"Dropped unclear sections: {len(drop_rows)}/{len(df_paper)} (Remain {len(df_paper_copy)})")
# for sec in sec_list:
#     df_ = (df_paper_copy[df_paper_copy[sec]==1])
#     break


Dropped unclear sections: 5405/144299 (Remain 138894)


## Paragraph splitting

In [299]:
import re

def split_paragraph(text):
    # pattern = r'(?<!:)\n(?!•|\s*[-*])'
    pattern = r'(?:\.|\?)\n(?!•|\s*[-*]|[0-9]\..+|[0-9].+|[a-z]\..+|[a-z].+)'
    split_text = re.split(pattern, text)
    return [paragraph+'.' for paragraph in split_text[:-1]]+[split_text[-1]]

def count_paragraph(text):
    return len(split_paragraph(text))

math_symbols = {'×', '÷', '√', '∛', 'ⁿ√', '⁄', '∝', '≥', '≤', '≠', '≈', '∧', '∨', '¬', '∩', '∪', '⊂', '⊃', '∅', '∫', '∞', '°', '⊥'}
special_symbols = {'⏒', '⋥', '⋈', '∇', '⋒', '∋', '∌', '∟', '〈', '∦', '⋭', '⊘', '⋦', '∑', '⋠', '⋛', '∠', '⊙', '⏕', '⊛', '〉', '⋯', '⋏', '⋪', '⋬', '⋔', '∵', '⊉', '⋧', '⋓', '⋢', '⊈', '⋊', '⏚', '⊇', '⋟', '⊆', '⋉', '∀', '⊞', '⎱', '∈', '⋣', '⎰', '∥', '⊗', '⋩', '⏘', '⏙', '⏗', '⋨', '⊄', '∉', '⌋', '⋌', '⋆', '∃', '∴', '⊜', '∆', '⋱', '⋋', '∄', '⍟', '⌈', '⌉', '⋫', '⋚', '⏛', '⊖', '⏔', '⊡', '⋇', '⋙', '⍼', '⌊', '⊅', '⋘', '⌶', '≡', '⊟', '⋰', '⊕', '⏖', '⋮', '⊠', '⏑', '⊚', '⋤', '⋄', '⏓', '⋗', '⊝', '⋖'}
lowercase_greek_symbols = {'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'}
uppercase_greek_symbols = {'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω'}

def detect_math_expressions(text):
    matched = set(text) & (math_symbols | special_symbols | lowercase_greek_symbols | uppercase_greek_symbols)
    return len(matched)>0

import spacy
nlp = spacy.load('en_core_web_sm')

def split_sentence(text):
    doc = nlp(text)
    sentences = [re.sub(r'\n', '', sent.text) for sent in doc.sents]
    return sentences

def remove_bullet(text):
    pattern = r'^[\(\[\{]?([a-zA-Z0-9])[\.\)\]\}\s]+|^[\(\[\{]?(?:[ivxlcdm]+)[\.\)\]\}\s]+'
    text = re.sub(pattern, '', text)
    return text

In [300]:
# df_intro = (df_paper_copy[df_paper_copy['introduction']==1]).drop(columns=sec_list+['sum_sec']).reset_index()
# df_intro['paragraph'] = df_intro['text'].apply(count_paragraph)
# df_intro

# for i, paragraph in enumerate(split_text(text)):
#     print('-'*50,i+1,'-'*50)
#     print((paragraph))
#     for sent in split_sentence(paragraph):
#         print('-', remove_bullet(sent))


In [301]:
# sentences = split_sentence(text)
# for j, sentence in enumerate(sentences):
#     print('-'*20,j+1,'-'*20)
#     print(sentence)

In [302]:
# for i, paragraph in enumerate(split_text(text)):
#     print('-'*50,i+1,'-'*50)
#     # print(paragraph)
#     sentences = split_sentence(paragraph)
#     for j, sentence in enumerate(sentences):
#         print('-'*20,j+1,'-'*20)
#         print(is_sentence(sentence))
#         print(sentence)

In [336]:
# import string library function
	
# Storing the sets of punctuation in variable result
result = list(string.punctuation)
	
# Printing the punctuation values
print(result)
print(type(result))


['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
<class 'list'>


In [352]:
import string

nlp = spacy.load('en_core_web_sm')

def is_sentence(sentence):
    doc = nlp(sentence)
    # for w in doc:
    #     print(w, type(w))
    words = [(w) for w in doc if str(w) not in list(string.punctuation)]
    print(len(words))
    print(words)
    
def prepro_text(text, num_paragraph=None):
    paragraphs = split_paragraph(text)
    if num_paragraph!=None:
        paragraphs = [paragraphs[num_paragraph]]
    sentences = []
    # print([split_sentence(paragraph) for paragraph in paragraphs])
    sentences = [sentence for sentences in [split_sentence(paragraph) for paragraph in paragraphs] for sentence in sentences]
    return sentences

text = """Paragraph number 1 is here.
Paragraph number 2 is here. There are the contents:
(1) The 1st bullet.
(2) The 2nd bullet.
(3) The 3rd bullet.
Paragraph number 3 is here. This is the content.
Paragraph number 4 is here. This is the content.
Paragraph number 5 is here. This is the content."""
# prepro_text(
#     text, 
    # num_paragraph=-1
# )

In [353]:
is_sentence("Paragraph number 3 is here.")

5
[Paragraph, number, 3, is, here]


In [325]:
# df_section = {}

    
for sec in sec_list:
    df_section = (df_paper_copy[df_paper_copy[sec]==1]).drop(columns=sec_list+['sum_sec']).reset_index(drop=True)
    # df_section['processed_text'] = df_section['text'].apply(split_paragraph).apply
    for idx, row in df_section.iterrows():
        sec_text = row['text']
    #     paragraphs = split_paragraph(sec_text)
        break
    break

# prepro_text(
#     sec_text, 
#     num_paragraph=-1
# )
print(sec_text)

Statistical learning theory studies the learning properties of machine learning algorithms, and more fundamentally, the conditions under which learning from finite data is possible. In this context, classical learning theory focuses on the size of the hypothesis space in terms of different complexity measures, such as combinatorial dimensions, covering numbers and Rademacher/Gaussian complexities (Shalev-Shwartz & Ben-David, 2014; Boucheron et al., 2005). Another more recent approach is based on defining suitable notions of stability with respect to perturbation of the data (Bousquet & Elisseeff, 2001; Kutin & Niyogi, 2002). In this view, the continuity of the process that maps data to estimators is crucial, rather than the complexity of the hypothesis space. Different notions of stability can be considered, depending on the data perturbation and metric considered (Kutin & Niyogi, 2002). Interestingly, the stability and complexity approaches to characterizing the learnability of proble

In [354]:
prepro_text(
    sec_text, 
    # num_paragraph=-1
)

['Statistical learning theory studies the learning properties of machine learning algorithms, and more fundamentally, the conditions under which learning from finite data is possible.',
 'In this context, classical learning theory focuses on the size of the hypothesis space in terms of different complexity measures, such as combinatorial dimensions, covering numbers and Rademacher/Gaussian complexities (Shalev-Shwartz & Ben-David, 2014; Boucheron et al., 2005).',
 'Another more recent approach is based on defining suitable notions of stability with respect to perturbation of the data (Bousquet & Elisseeff, 2001; Kutin & Niyogi, 2002).',
 'In this view, the continuity of the process that maps data to estimators is crucial, rather than the complexity of the hypothesis space.',
 'Different notions of stability can be considered, depending on the data perturbation and metric considered (Kutin & Niyogi, 2002).',
 'Interestingly, the stability and complexity approaches to characterizing the 