In [1]:
from pathlib import Path
from matplotlib import pyplot as plt
from helpers import read_jsonl, write_jsonl

import numpy as np
import pandas as pd
import seaborn as sns

import json
import spacy

## Loading data
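Each document is a dict with the following keys: 'coref', 'coref_non_salient', 'doc_id', 'method_subrelations', 'n_ary_relations', 'ner', 'sections', 'sentences', 'words'. (As the key listing below shows, the first `dev` document has no 'coref_non_salient' key.)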

In [2]:
# Split names double as the JSONL file stems under DATA_PATH.
TRAIN, TEST, DEV = 'train', 'test', 'dev'
DATA_PATH = Path('../data/scirex/')

# Load each split into a dict keyed by split name (insertion order: train, test, dev).
data = {
    split: read_jsonl(DATA_PATH / f'{split}.jsonl')
    for split in (TRAIN, TEST, DEV)
}

In [3]:
# Create a blank English spaCy pipeline; it is not used by any of the cells visible in this section.
nlp = spacy.blank('en')

In [4]:
# Number of documents per split, all on one line separated by spaces.
print(' '.join(f'{name} : {len(docs)}' for name, docs in data.items()))

train : 306 test : 66 dev : 66


In [5]:
# Keys of the first document in each split, one split per line.
print('\n'.join(f'{name} : {docs[0].keys()}' for name, docs in data.items()))

train : dict_keys(['coref', 'coref_non_salient', 'doc_id', 'method_subrelations', 'n_ary_relations', 'ner', 'sections', 'sentences', 'words'])
test : dict_keys(['coref', 'coref_non_salient', 'doc_id', 'method_subrelations', 'n_ary_relations', 'ner', 'sections', 'sentences', 'words'])
dev : dict_keys(['coref', 'doc_id', 'method_subrelations', 'n_ary_relations', 'ner', 'sections', 'sentences', 'words'])


In [6]:
# Length of the 'sentences' list of the first document in each split, one split per line.
print('\n'.join(f"{name} : {len(docs[0]['sentences'])}" for name, docs in data.items()))


train : 328
test : 611
dev : 149


## ner 

In [8]:
# Flatten every 'ner' entry of every document in every split into one row.
# Each entry is indexed as [start, end, label]: start/end slice doc['words'],
# and the label is stored in the 'entity_name' column.
rows = [
    (
        split,
        doc['doc_id'],
        f'{span[0]}_{span[1]}',
        span[2],
        ' '.join(doc['words'][span[0]:span[1]]),
    )
    for split, docs in data.items()
    for doc in docs
    for span in doc['ner']
]
entity_df = pd.DataFrame(
    rows,
    columns=['corpus', 'doc_id', 'pos', 'entity_name', 'entity_value'],
)
entity_df.info()
entity_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156931 entries, 0 to 156930
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   corpus        156931 non-null  object
 1   doc_id        156931 non-null  object
 2   pos           156931 non-null  object
 3   entity_name   156931 non-null  object
 4   entity_value  156931 non-null  object
dtypes: object(5)
memory usage: 6.0+ MB


Unnamed: 0,corpus,doc_id,pos,entity_name,entity_value
0,train,000f90380d768a85e2316225854fc377c079b5c4,0_5,Method,Full - Resolution Residual Networks
1,train,000f90380d768a85e2316225854fc377c079b5c4,6_8,Task,Semantic Segmentation
2,train,000f90380d768a85e2316225854fc377c079b5c4,14_17,Task,Semantic image segmentation
3,train,000f90380d768a85e2316225854fc377c079b5c4,23_26,Task,autonomous driving systems
4,train,000f90380d768a85e2316225854fc377c079b5c4,29_35,Task,accurate understanding of the surrounding scene


In [9]:
# Summary of the entity table; every column is object dtype, so this reports count/unique/top/freq.
entity_df.describe()

Unnamed: 0,corpus,doc_id,pos,entity_name,entity_value
count,156931,156931,156931,156931,156931
unique,3,438,39206,4,51949
top,train,3cdb1364c3e66443e1c2182474d44b2fb01cd584,2_5,Method,accuracy
freq,107997,906,61,98458,1291


In [10]:
# Occurrence count of each distinct entity surface string across all splits.
entity_df['entity_value'].value_counts()

accuracy                        1291
CNN                             1232
LSTM                            1073
training                         757
learning rate                    661
                                ... 
Spanish sequences                  1
template adaptation training       1
training rates                     1
offline sampling                   1
annotation errors                  1
Name: entity_value, Length: 51949, dtype: int64

In [11]:
# Same count/unique/top/freq summary, broken down by the label stored in 'entity_name'.
entity_df.groupby('entity_name').describe()

Unnamed: 0_level_0,corpus,corpus,corpus,corpus,doc_id,doc_id,doc_id,doc_id,pos,pos,pos,pos,entity_value,entity_value,entity_value,entity_value
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
entity_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Material,10615,3,train,7454,10615,422,2aec8d465e9a74c27f956ed1136f3e8a3ba0a833,127,10615,8448,3381_3382,6,10615,2865,CIFAR - 10,281
Method,98458,3,train,67464,98458,438,231af7dc01a166cac3b5b01ca05778238f796e41,548,98458,33173,2_3,43,98458,34030,CNN,1197
Metric,15332,3,train,10744,15332,437,20926884a62778a2bf3f9f3c56f30976749ad763,176,15332,11213,3231_3233,6,15332,4405,accuracy,1291
Task,32526,3,train,22335,32526,438,2b507f659b341ed0f23106446de8e4322f4a3f7e,276,32526,18845,2_4,21,32526,12893,classification,516


## coref

In [17]:
# One row per key of each document's 'coref' dict: the key itself and the
# length of the value stored under it (presumably the cluster's mention list; confirm).
rows = [
    (split, doc['doc_id'], cluster_name, len(members))
    for split, docs in data.items()
    for doc in docs
    for cluster_name, members in doc['coref'].items()
]
coref_df = pd.DataFrame(
    rows,
    columns=['corpus', 'doc_id', 'coref_name', 'coref_length'],
)
coref_df.info()
coref_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421 entries, 0 to 3420
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   corpus        3421 non-null   object
 1   doc_id        3421 non-null   object
 2   coref_name    3421 non-null   object
 3   coref_length  3421 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 107.0+ KB


Unnamed: 0,corpus,doc_id,coref_name,coref_length
0,train,000f90380d768a85e2316225854fc377c079b5c4,Cityscapes,19
1,train,000f90380d768a85e2316225854fc377c079b5c4,FRRN,17
2,train,000f90380d768a85e2316225854fc377c079b5c4,Frame__fps_,0
3,train,000f90380d768a85e2316225854fc377c079b5c4,Mean_IoU,5
4,train,000f90380d768a85e2316225854fc377c079b5c4,Real-Time_Semantic_Segmentation,0


In [20]:
# include='all' so the object columns are summarised alongside the numeric coref_length column.
coref_df.describe(include='all')

Unnamed: 0,corpus,doc_id,coref_name,coref_length
count,3421,3421,3421,3421.0
unique,3,438,1491,
top,train,13b58f3108709dbbed5588759bc0496f82a261c4,Accuracy,
freq,2424,63,76,
mean,,,,9.722596
std,,,,14.610576
min,,,,0.0
25%,,,,0.0
50%,,,,5.0
75%,,,,13.0


In [23]:
# Number of rows (i.e. document-level coref entries) carrying each coref key name.
coref_df['coref_name'].value_counts()

Accuracy                       76
MAP                            38
CIFAR-10                       37
Image_Classification           35
Percentage_correct             32
                               ..
Balancing_Linear_Regression     1
C2-50k_Segmentation             1
TLL_MRF                         1
Transformer_Base                1
ROAD                            1
Name: coref_name, Length: 1491, dtype: int64

## coref_non_salient 