# Import libs

In [22]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import inflect

# Load dataset

In [10]:
RAWDATAFILES = {
    "train": "training_complete.jsonl",
    "val": "validation_complete.jsonl",
    "test": "testing_with_paper_release.jsonl"
}
def print_progress(curr, full, desc='', bar_size=30):    
    bar = int((curr+1)/full*bar_size)
    sys.stdout.write(f"\r{desc}[{'='*bar}{' '*(bar_size-bar)}] {curr+1}/{full}")
    # sys.stdout.flush()
    if curr+1==full: print()
def load_data(data_split):
    main_path = str((Path().absolute()).parents[0])    
    filepath = f"{main_path}/dataset_MuP/{RAWDATAFILES[data_split]}"
    with open(filepath, 'r') as json_file:
        json_list = list(json_file)
        col_name = ["paper_id", "paper","summary"]
    dataset = pd.DataFrame(columns=col_name)
    data_len = len(json_list)
    for i, json_str in enumerate(json_list[:]):
        result = json.loads(json_str)
        df = pd.DataFrame([[result["paper_id"], result['paper'], result["summary"]]], columns=col_name)
        print_progress(i, data_len, bar_size=50)
        dataset = pd.concat([dataset,df])
    return dataset

In [11]:
df_train = load_data("train")
# print(len(df_train))
df_train.reset_index(inplace=True, drop=True)
df_train.head()



Unnamed: 0,paper_id,paper,summary
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...,This paper investigates kernel ridge-less regr...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...,This paper presents a novel way of making full...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...,This paper proposes a new framework that compu...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...,This work proposes a deep reinforcement learni...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...,This paper proposes 3 deep generative models b...


## Drop summary column

In [13]:
df_train.drop(['summary'], axis=1, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
18929,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
18930,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18931,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...
18932,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,{'abstractText': 'Thanks to graph neural netwo...


## Drop duplicated values

In [17]:
df_train.drop_duplicates(subset=['paper_id'],inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
8374,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
8375,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
8376,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
8377,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


In [18]:
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,paper_id,paper
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,{'abstractText': 'We study the average CVloo s...
1,SP:b80bc890180934092cde037b49d94d6e4e06fad9,{'abstractText': 'The use of episodic memories...
2,SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,{'abstractText': 'Existing Multi-Task Learning...
3,SP:a1e2218e6943bf138aeb359e23628676b396ed66,{'abstractText': 'This paper deals with the fu...
4,SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,{'abstractText': 'Our work is concerned with t...
...,...,...
8374,SP:77d59e1e726172184249bdfdd81011617dc9c208,{'abstractText': 'Quantum machine learning met...
8375,SP:e58dc2d21175a62499405b7f4c3a03b135530838,{'abstractText': 'Trained generative models ha...
8376,SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,{'abstractText': 'Discovering the underlying m...
8377,SP:4706017e6f8b958c7d0825fed98b285ea2994b59,{'abstractText': 'Some conventional transforms...


## Example sections

In [19]:
for i in range(2):
    for sec in ((df_train['paper'][i])['sections']):
        print((sec)['heading'])
        # print(">"*10, sec['text'][:40])
    print("="*100)


1 INTRODUCTION
2 STATISTICAL LEARNING AND EMPIRICAL RISK MINIMIZATION
2.1 KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION
3 ERROR BOUNDS VIA STABILITY
4.1 KEY LEMMAS
4.2 PROOF OF LEMMA 11
5 REMARK AND RELATED WORK
6 CONCLUSIONS
A EXCESS RISK, GENERALIZATION, AND STABILITY
1 INTRODUCTION
2 A NEW PERSPECTIVE OF REDUCING DIVERSITY OF GRADIENTS
2.1 THE SOURCE OF GRADIENT DIVERSITY
2.2 CONNECTING DEEP METRIC LEARNING TO CONTINUAL LEARNING
3 DISCRIMINATIVE REPRESENTATION LOSS
4 ONLINE MEMORY UPDATE AND BALANCED EXPERIENCE REPLAY
5 EXPERIMENTS
6 CONCLUSION
A PROOF OF THEOREMS
B ALGORITHMS OF ONLINE MEMORY UPDATE
C DEFINITION OF PERFORMANCE MEASURES
D RELATED METHODS FROM DML
E ABLATION STUDY ON DRL
F COMPARING DIFFERENT MEMORY SIZES
G COMPARING DIFFERENT REPLAY STRATEGY
H COMPARING TRAINING TIME
I HYPER-PARAMETERS IN EXPERIMENTS


## Sections extraction

In [26]:
def section_extraction(sec):
    text = sec['text']
    try:
        # sec['heading']=sec['heading'].upper()
        sec_split = ((sec)['heading'].upper()).split()
        if len(sec_split)==1:
            head_no = None
            head_title = sec_split[0]
        else:
            head_no = sec_split[0]
            head_title = " ".join(sec_split[1:])
    except:
        head_no = None
        head_title = None
    return head_no, head_title, text


In [214]:
full_len = len(df_train)
sections_data = [] 
one_section_papers = []
for ind, row in (df_train).iterrows():
    sections = ((row['paper'])['sections'])
    if len(sections)<=1: 
        one_section_papers.append(row['paper_id'])
        continue
    for sec in sections:
        head_no, head_title, text = section_extraction(sec)
        if head_no is not None:
            while head_no[-1] in ['.','/',',',')']:
                head_no = head_no[:-1]
        sections_data.append({
            'paper_id': row['paper_id'],
            'head_no': head_no,
            'head_title': head_title,
            'text': text,
        })   
    print_progress(ind, full_len, bar_size=50)
    # break
df_paper = pd.DataFrame(sections_data)
df_paper



Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
148542,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
148543,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
148544,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
148545,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


## Save CSV

In [215]:
# file_dir = 'visualization_data
# if not(Path(log_dir).exists()): os.system(f"mkdir -p {log_dir}")
# df_paper.to_csv(f"{file_dir}/paper_sections.csv")

# Load Extracted CSV

In [216]:
# df_paper = pd.read_csv("visualization_data/paper_sections.csv", index_col=0)
# df_paper

## Prepossing heading title

In [388]:
no = list(str(i) for i in range(10))
df_paper_main = df_paper[df_paper["head_no"].isin(no)]
df_paper_main.reset_index(drop=True, inplace=True)
df_paper_main

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSIONS,"In summary, minimizing a bound on cross valida..."
...,...,...,...,...
48456,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,2,RELATED WORK,Epistemic Uncertainty in Bayesian Deep Learnin...
48457,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,3,PROPOSED APPROACH,Now we define the problem of uncertainty-aware...
48458,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,4,EXPERIMENTS,"In this section, we describe our experimental ..."
48459,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,5,UNCERTAINTY EXPERIMENT AND ANALYSIS,"In Section 4, we showed that our S-BGNN-T impr..."


### Try to set df_paper_main with all sections

In [390]:
df_paper_main = df_paper[df_paper['head_title'].notna()]
df_paper_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147065 entries, 0 to 148546
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   paper_id    147065 non-null  object
 1   head_no     144299 non-null  object
 2   head_title  147065 non-null  object
 3   text        147065 non-null  object
dtypes: object(4)
memory usage: 5.6+ MB


In [391]:
df_paper_main

Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMAS,In order to prove Theorem 7 we make use of the...
...,...,...,...,...
148542,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,B.4,ANALYZE FOR EPISTEMIC IN OOD DETECTION,"In OOD detection, epistemic uncertainty perfor..."
148543,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C,DERIVATIONS FOR UNCERTAINTY MEASURES AND KL DI...,This appendix provides the derivations and sho...
148544,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.1,UNCERTAINTY MEASURES,Vacuity uncertainty of Bayesian Graph neural n...
148545,SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb,C.2,JOINT PROBABILITY,"At the test stage, we infer the joint probabil..."


### Continue

In [392]:
p = inflect.engine()
prepro_list = [p.singular_noun(word) if p.singular_noun(word)!=False else word for word in df_paper_main['head_title']]
df_paper_main['head_title'] = prepro_list
df_paper_main[:10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper_main['head_title'] = prepro_list


Unnamed: 0,paper_id,head_no,head_title,text
0,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1,INTRODUCTION,Statistical learning theory studies the learni...
1,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2,STATISTICAL LEARNING AND EMPIRICAL RISK MINIMI...,We begin by recalling the basic ideas in stati...
2,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,2.1,KERNEL LEAST SQUARES AND MINIMUM NORM SOLUTION,The focus in this paper is on the kernel least...
3,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,3,ERROR BOUNDS VIA STABILITY,"In this section, we recall basic results relat..."
4,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.1,KEY LEMMA,In order to prove Theorem 7 we make use of the...
5,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,4.2,PROOF OF LEMMA 11,We can write any interpolating solution to the...
6,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,5,REMARK AND RELATED WORK,In the previous section we obtained bounds on ...
7,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,6,CONCLUSION,"In summary, minimizing a bound on cross valida..."
8,SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,A,"EXCESS RISK, GENERALIZATION, AND STABILITY",We use the same notation as introduced in Sect...
9,SP:b80bc890180934092cde037b49d94d6e4e06fad9,1,INTRODUCTION,"In the real world, we are often faced with sit..."


In [393]:
df_heading = pd.DataFrame(df_paper_main.groupby("head_title")["head_title"].count())
df_heading.columns = ["count"]
df_heading.reset_index(inplace=True)
df_heading.sort_values('count', ascending=False, inplace=True)
df_heading.reset_index(drop=True, inplace=True)
df_heading

Unnamed: 0,head_title,count
0,INTRODUCTION,7923
1,RELATED WORK,5533
2,CONCLUSION,5453
3,EXPERIMENT,4733
4,APPENDIX,1559
...,...,...
79476,EXACT ANALYSI WITH TOY MODEL,1
79477,EX-POST DENSITY ESTIMATION,1
79478,EWMA COMPARISON,1
79479,EWC,1


In [394]:
df_heading[df_heading['head_title'].str.contains(r'SETUP*')]

Unnamed: 0,head_title,count
13,EXPERIMENTAL SETUP,801
33,SETUP,228
47,EXPERIMENT SETUP,160
56,PROBLEM SETUP,137
273,TRAINING SETUP,19
...,...,...
78361,EXPERIEMNTAL SETUP,1
78391,EXPERIMEMT SETUP,1
78634,EXPERIMENT FOR OTHER SETUPS,1
78996,EVALUATION SETUP AND METRIC,1


Set section's keys adapted from *paper:*

In [395]:
introduction_key = ["INTRODUCTION", "PROBLEM", "MOTIVATION", "PRELIMINARY", "OVERVIEW"]
literature_key = ["RELATED WORK", "BACKGROUND", "THEORY", "PREVIOUS WORK", "PRIOR WORK", "REVIEW", "REVIEW", "BASELINE"]
method_key = ["APPROACH", "MODEL", "ALGORITHM", "METHOD", "DATASET", "SETUP", "SETTING", "FRAMEWORK", "ARCHITECTURE", "IMPLEMENTATION", "CONTRIBUTION"]
result_key =["EXPERIMENT", "RESULT", "EVALUATION", "ABLATION STUDY"]
discuss_key = ["DISCUSSION", "LIMITATION", "ANALYSI"]
conclusion_key = ["CONCLU", "FUTURE WORK", "APPLICATION", "SUMMARY", "IMPACT"]
additional_key = ["NOTATION", "APPENDIX", "ACKNOWLEDGMENT", "ACKNOWLEDGEMENT", "EXTENSION", "REMARK", "MATERIAL", "ENVIRONMENT"]

In [396]:

# other = ["REPRODUCIBILITY", "SIMULATION", "EXAMPLE", "FORMULATION", "LEARNING", "TASK"]
# pattern = r'CONCLU*|INTRODUCTION*|FUTURE*|WORK*|DISCUSSION*|ACKNOWLEDGMENT*|ACKNOWLEDGEMENT*|APPROACH*|MODEL*|ANALYSI*|ALGORITHM*|RELATED*|EXPERIMENT*|METHOD*|\
#     |PRELIMINARY*|BACKGROUND*|RESULT*|EVALUATION*|APPENDIX*|ABLATION STUDY*|LIMITATION*|PROBLEM*|STATEMENT*|\
#     |APPLICATION*|THEORY*|MOTIVATION*|IMPACT*|METHOD*|REPRODUCIBILITY*|SUMMARY*|DATASET*|STUDY*|SETUP*|\
#     |SIMULATION*|FRAMEWORK*|IMPLEMENTATION*|NOTATION*|SETTING*|ARCHITECTURE*|PREVIOUS WORK*|PRIOR WORK*|REVIEW*|LITERATURE*|\
#     |EXTENSION*|EXAMPLE*|FORMULATION*|OVERVIEW*|REMARK*|LEARNING*|ENVIRONMENT*|BASELINE*|CONTRIBUTION*|TASK*|MATERIAL*'
# mask = df_heading['head_title'].str.contains(pattern)
# n = 30*0
# df_heading[~mask][n:n+30]

In [397]:
def pattern(keys):
    pattern_str = r''
    for key in keys:
        pattern_str += key + "*|"
    return pattern_str[:-1]
pattern(introduction_key)

def count_sec(keys, col_name):
    mask = df_paper_main['head_title'].str.contains(pattern(keys))
    df_count = pd.DataFrame(df_paper_main[mask].groupby("paper_id")["paper_id"].count())
    df_count.columns = [col_name]
    # intro_count.reset_index(inplace=True)
    return df_count


In [398]:
paper_id_main = list(df_paper_main['paper_id'].drop_duplicates())

In [399]:
# df_section_group = pd.DataFrame(index=df_train["paper_id"])
df_section_group = pd.DataFrame(index=paper_id_main)
df_section_group.index.name='paper_id'

keys_list = [introduction_key, literature_key, method_key, result_key, discuss_key, conclusion_key]
keys_name = ["intro", "liture", "method", "result", "discuss", "conclude"]
for idx, keys in enumerate(keys_list):
    # print(keys)
    df_count = count_sec(keys, keys_name[idx])
    df_section_group = df_section_group.merge(df_count, on="paper_id", how='outer', suffixes=('_1', '_2'))
# df_section_group.fillna(0, inplace=True)
df_section_group


Unnamed: 0_level_0,intro,liture,method,result,discuss,conclude
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc,1.0,1.0,,,,1.0
SP:b80bc890180934092cde037b49d94d6e4e06fad9,1.0,1.0,2.0,3.0,,1.0
SP:09f2fe6a482bbd6f9bd2c62aa841f995171ba939,1.0,1.0,6.0,6.0,,1.0
SP:a1e2218e6943bf138aeb359e23628676b396ed66,2.0,,,1.0,2.0,1.0
SP:43e525fb3fa611df7fd44bd3bc9843e57b154c66,1.0,2.0,3.0,1.0,,1.0
...,...,...,...,...,...,...
SP:77d59e1e726172184249bdfdd81011617dc9c208,2.0,,3.0,,,
SP:e58dc2d21175a62499405b7f4c3a03b135530838,2.0,,2.0,3.0,2.0,1.0
SP:0d872fb4321f3a4a3fc61cf4d33b0c7e33f2d695,1.0,1.0,1.0,1.0,1.0,1.0
SP:4706017e6f8b958c7d0825fed98b285ea2994b59,1.0,1.0,1.0,1.0,1.0,1.0


In [401]:
df_section_group[df_section_group['conclude'].isna()]

Unnamed: 0_level_0,intro,liture,method,result,discuss,conclude
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SP:a20769de2c7acf390c7e3bece904a17df6a991bd,1.0,2.0,,1.0,1.0,
SP:95ba9ad102adafaabf9671737e6549728d104629,1.0,1.0,1.0,1.0,1.0,
SP:797b07cd8142a35333037bb573db0dfe5dde65ac,1.0,,3.0,7.0,,
SP:72d1283f3602edc22896934271fcec5b03f25d9e,2.0,,1.0,5.0,,
SP:c83ecc74eb885df5f29e5a7080a8c60d1ee0a3b0,1.0,,1.0,2.0,1.0,
...,...,...,...,...,...,...
SP:7cd001a35175d8565c046093dcf070ba7fa988d6,2.0,,4.0,3.0,1.0,
SP:67c44f33dff59e4d218f753fdbc6296da62cdf62,2.0,3.0,5.0,1.0,1.0,
SP:62a75399aa97a61432385cf1dffabb674741a18a,1.0,1.0,4.0,1.0,1.0,
SP:76a052062e3e4bb707b24a8809c220c8ac1df83a,1.0,1.0,4.0,1.0,1.0,


In [402]:
df_section_group.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8232 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   intro     8058 non-null   float64
 1   liture    7177 non-null   float64
 2   method    7160 non-null   float64
 3   result    7664 non-null   float64
 4   discuss   4391 non-null   float64
 5   conclude  7117 non-null   float64
dtypes: float64(6)
memory usage: 450.2+ KB


In [410]:
No_sec_paper = list(df_section_group[df_section_group['conclude'].isna()].index)
df_paper[df_paper['paper_id']==No_sec_paper[6]]

Unnamed: 0,paper_id,head_no,head_title,text
1210,SP:a8bb14b514e474691be63b51582544a9befa7125,1,INTRODUCTION,The majority of pruning algorithms for Deep Ne...
1211,SP:a8bb14b514e474691be63b51582544a9befa7125,2,RELATED WORK,Pruning trained models Most of the pruning wor...
1212,SP:a8bb14b514e474691be63b51582544a9befa7125,3,PROBLEM FORMULATION: PRUNING AT INITIALIZATION,"Given a dataset D = {(xi,yi)}ni=1, the trainin..."
1213,SP:a8bb14b514e474691be63b51582544a9befa7125,4,FORESIGHT CONNECTION SENSITIVITY,Since removing connections of a neural network...
1214,SP:a8bb14b514e474691be63b51582544a9befa7125,5,EXPERIMENTS,In the following we evaluate the efficacy of o...
1215,SP:a8bb14b514e474691be63b51582544a9befa7125,5.1,RESULTS ON CIFAR-10,Fig 2 compares the accuracy of the described i...
1216,SP:a8bb14b514e474691be63b51582544a9befa7125,5.2,RESULTS ON LARGER DATASETS,We now present experiments on large datasets. ...
1217,SP:a8bb14b514e474691be63b51582544a9befa7125,5.3,ANALYSIS,Saliency optimization To experimentally valida...
1218,SP:a8bb14b514e474691be63b51582544a9befa7125,6,DISCUSSION,Pruning at initialization has become an active...
1219,SP:a8bb14b514e474691be63b51582544a9befa7125,,ACKNOWLEDGMENTS,This work was supported by the Royal Academy o...


In [379]:
def no_section_info(section):
    No_sec_paper = list(df_section_group[df_section_group[section].isna()].index)
    return [list(df_paper[df_paper['paper_id']==paper]['head_title']) for paper in No_sec_paper]
no_con = no_section_info('conclude')
len(no_con)

1194

In [380]:
for i, sections in enumerate(no_con):
    print('='*50)
    print("\n".join(sections))
    if i>=20: break

INTRODUCTION
INFORMATION LATTICE: ABSTRACTIONS AND RULES OF A SIGNAL
INFORMATION LATTICE LEARNING (ILL)
PRACTICAL LATTICE CONSTRUCTION: TO START LIKE A BABY (PHASE I)
PRACTICAL LATTICE LEARNING: TO LEARN LIKE A CHILD (PHASE II)
ILL EXAMPLES
DISCUSSION: LIMITATIONS AND CHALLENGES
CONNECTION TO CONCEPT LATTICE
MORE GENERALIZED FORMALISM FOR INFORMATION LATTICE
MORE INSIGHTS ON THE SPECIAL LIFTING
EXISTING WORK ON SUBLATTICE GENERATION
MORE DETAILS ON THE CONSTRUCTION PHASE
MORE ANALYSES IN THE LEARNING PHASE
STUDIES ON ILL-BASED MUSIC APPLICATION
CONCLUSION AND BROADER IMPACTS


TypeError: sequence item 0: expected str instance, NoneType found

In [301]:
no_section_info('intro')[2]

['BACKGROUND',
 'INFERENCE IN DLVMS AFFECTED BY MNAR',
 'EXPERIMENT',
 'CONCLUSION']

In [44]:
# df_section_group_new = df_section_group.reset_index(drop=True)
# df_section_group_new = pd.concat([df_section_group, intro_count], axis=1, join='outer')
df_section_group_new = df_section_group.merge(intro_count, on="paper_id", how='outer', suffixes=('_1', '_2'))
df_section_group_new[df_section_group_new["intro"]==None]

NameError: name 'intro_count' is not defined

In [153]:
df_section_group_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8379 entries, SP:4d08cdb2de2044bcb574a425b42963b83fbebfbc to SP:63ad3be1dae7ede5c02a847304072c1cbc91b1cb
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   intro   7989 non-null   float64
dtypes: float64(1)
memory usage: 130.9+ KB
