In [29]:
import functools
import json
import os
import pickle
import random
import subprocess
import sys
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin

In [3]:
# Corpus folder layout. Each stage of the preparation uses its own sub-folder:
#   source    - raw .txt files read by the normalisation step
#   fixed     - whitespace-normalised copies of the source files
#   converted - output folder handed to `spacy convert`
#   small     - sub-sampled files used to build the pickled dataset
training_path = Path('training/corpus')
source_path, fixed_path, converted_path, small_path = (
    training_path / stage for stage in ('source', 'fixed', 'converted', 'small')
)

In [11]:

# Normalise every raw corpus file and convert it with spaCy's `ner` converter.
# Create the output folders up front so the writes below cannot fail on a
# fresh checkout.
fixed_path.mkdir(parents=True, exist_ok=True)
converted_path.mkdir(parents=True, exist_ok=True)

for path in sorted(source_path.glob('*.txt')):
    # Strip surrounding whitespace from every line and from the file as a whole.
    txt = path.read_text(encoding='utf-8')
    txt = '\n'.join(line.strip() for line in txt.split('\n')).strip()
    fpath = fixed_path / path.name
    fpath.write_text(txt, encoding='utf-8')
    # Argument list instead of a shell string: paths containing spaces are
    # passed intact and nothing is interpreted by a shell. sys.executable keeps
    # the conversion in the kernel's own environment, and check=True raises
    # CalledProcessError instead of silently continuing after a failed run.
    subprocess.run(
        [
            sys.executable, '-m', 'spacy', 'convert',
            str(fpath), str(converted_path),
            '--converter', 'ner',
        ],
        check=True,
    )

In [12]:
# Load the `en_core_web_lg` pipeline; the cells below pass its vocab
# (`nlp.vocab`) to `DocBin.get_docs` when reading the .spacy files.
nlp = spacy.load('en_core_web_lg')

In [13]:
# Report how many docs each converted .spacy file contains.
for path in sorted(converted_path.glob('*.spacy')):
    doc_bin = DocBin().from_disk(path)
    # len(DocBin) is the number of stored docs, so there is no need to
    # deserialise every doc into a list just to count them.
    print(path, ' : ', len(doc_bin))

training\corpus\converted\Final_SCIREX_dev.spacy  :  17871
training\corpus\converted\Final_SCIREX_test.spacy  :  19429
training\corpus\converted\Final_SCIREX_train.spacy  :  83132


In [14]:
# Entity labels that survive the filtering; every other entity is dropped.
to_keep = ['Material', 'Metric', 'Task']

# Accumulator used by the label check that follows this cell.
labels = []
for path in converted_path.glob('*.spacy'):
    source_bin = DocBin().from_disk(path)
    # The filtered copy is serialised with the entity attributes only.
    filtered_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    for doc in source_bin.get_docs(nlp.vocab):
        # Keep only the entities whose label is listed in `to_keep`.
        doc.ents = tuple(ent for ent in doc.ents if ent.label_ in to_keep)
        filtered_bin.add(doc)

    # e.g. 'Final_SCIREX_dev.spacy' -> 'tdm_dev.spacy', written next to the stage folders.
    split_file = path.name.rsplit('_', 1)[-1]
    filtered_bin.to_disk(training_path / f"tdm_{split_file}")

In [15]:
# Sanity check: list the entity labels present in each filtered file.
# Reset here so the cell is idempotent; without it, re-running the cell keeps
# appending to whatever `labels` already held.
labels = []
for path in sorted(training_path.glob('*.spacy')):
    input_bin = DocBin().from_disk(path)
    # Collect this file's labels separately so the printed set describes this
    # file only, not the union of every file processed so far.
    file_labels = [
        ent.label_
        for doc in input_bin.get_docs(nlp.vocab)
        for ent in doc.ents
    ]
    labels += file_labels
    print(path, ' : ', set(file_labels))

training\corpus\tdm_dev.spacy  :  {'Material', 'Task', 'Metric'}
training\corpus\tdm_test.spacy  :  {'Material', 'Task', 'Metric'}
training\corpus\tdm_train.spacy  :  {'Material', 'Task', 'Metric'}


In [8]:
# Draw a fixed-size random sample of sentences from each normalised file.
random.seed(42)

DEV_SIZE = TEST_SIZE = 500
TRAIN_SIZE = 1500

small_path.mkdir(parents=True, exist_ok=True)

# glob() yields files in an unspecified order. Sorting pairs the sizes with
# *_dev, *_test, *_train (alphabetical) on every platform and makes the seeded
# shuffles consume the random stream in the same order on every run.
for path, size in zip(sorted(fixed_path.glob('*.txt')), [DEV_SIZE, TEST_SIZE, TRAIN_SIZE]):
    # Sentences are separated by a blank line.
    sentences = path.read_text(encoding='utf-8').split('\n\n')
    random.shuffle(sentences)
    (small_path / path.name).write_text('\n\n'.join(sentences[:size]), encoding='utf-8')

In [30]:
# Build the token/tag dataset from the sub-sampled files and pickle it.
#   data[split]       -> list of {'tokens': [...], 'tags': [...]} per sentence
#   data['tag_to_id'] -> tag string -> integer id
#   tags[split]       -> flat list of every tag in the split (used for the counts below)
data = {}
tags = {}

# glob() order is unspecified; sort so the files line up with the split names
# (*_dev, *_test, *_train sort alphabetically in exactly this order).
for name, path in zip(['dev', 'test', 'train'], sorted(small_path.glob('*.txt'))):
    data[name] = []
    tags[name] = []
    for sentence in path.read_text(encoding='utf-8').split('\n\n'):
        rows = [row for row in sentence.split('\n') if row.strip()]
        if not rows:
            # An empty file or stray blank lines would otherwise crash the
            # two-field unpack below.
            continue
        record = {'tokens': [], 'tags': []}
        for row in rows:
            # Each row is "<token> <tag>"; any other shape still raises ValueError.
            token, tag = row.split(' ')
            record['tokens'].append(token)
            record['tags'].append(tag)
        data[name].append(record)
        tags[name] += record['tags']

# 'O' is id 0; every entity type then gets consecutive B-/I- ids.
data['tag_to_id'] = {'O': 0}
for tag in ['Task', 'Material', 'Metric', 'Method']:
    for prefix in ('B', 'I'):
        data['tag_to_id'][f'{prefix}-{tag}'] = len(data['tag_to_id'])

# Last expression of the cell, so the number of bytes written is displayed.
(training_path / 'tdmm.pk').write_bytes(pickle.dumps(data))

585697

In [19]:
# Tag frequencies in the dev split (pandas is imported with the other imports at the top).
pd.Series(tags['dev']).value_counts()

O             8807
I-Method       554
B-Method       406
I-Task         163
B-Task         132
I-Metric        76
B-Metric        63
B-Material      39
I-Material      31
dtype: int64

In [20]:
# Tag frequencies in the test split.
pd.Series(tags['test']).value_counts()

O             9423
I-Method       550
B-Method       409
I-Task         201
B-Task         141
I-Metric        55
B-Metric        53
B-Material      42
I-Material      38
dtype: int64

In [21]:
# Tag frequencies in the train split.
pd.Series(tags['train']).value_counts()

O             26097
I-Method       1666
B-Method       1257
I-Task          586
B-Task          416
I-Metric        183
B-Metric        175
B-Material      124
I-Material      102
dtype: int64