In [2]:
import functools
import json
import os
import random
import subprocess
import sys
from pathlib import Path

import spacy
from spacy.tokens import DocBin

In [3]:
# Corpus directory layout: each working folder is a direct child of the corpus root.
training_path = Path('training') / 'corpus'
source_path = training_path.joinpath('source')        # input .txt files read by the clean-up cell
fixed_path = training_path.joinpath('fixed')          # whitespace-normalised copies
converted_path = training_path.joinpath('converted')  # output directory passed to `spacy convert`
small_path = training_path.joinpath('small')          # down-sampled copies of the fixed files

In [11]:

# Normalise each source file (strip every line and the file as a whole), save the
# cleaned copy under fixed_path, then convert it with the spaCy CLI.
# Make sure both output directories exist before anything is written into them.
fixed_path.mkdir(parents=True, exist_ok=True)
converted_path.mkdir(parents=True, exist_ok=True)

for path in source_path.glob('*.txt'):
    txt = path.read_text(encoding='utf-8')
    txt = '\n'.join(t.strip() for t in txt.split('\n')).strip()
    fpath = fixed_path / path.name
    fpath.write_text(txt, encoding='utf-8')
    # Pass the command as an argument list (no shell), so paths containing spaces or
    # shell metacharacters are handled correctly. sys.executable runs the CLI with the
    # same interpreter as this kernel, and check=True raises CalledProcessError when
    # the conversion fails instead of silently continuing.
    subprocess.run(
        [
            sys.executable, '-m', 'spacy', 'convert',
            str(fpath), str(converted_path),
            '--converter', 'ner',
        ],
        check=True,
    )

In [12]:
# Load the en_core_web_lg pipeline. The cells visible below only use its vocabulary
# (nlp.vocab) when reading Doc objects back out of DocBin files.
nlp = spacy.load('en_core_web_lg')

In [13]:
# Report how many docs each converted .spacy file contains.
for path in converted_path.glob('*.spacy'):
    doc_bin = DocBin().from_disk(path)
    # len(DocBin) is the number of stored docs, so there is no need to deserialise
    # every Doc into a list just to count them.
    print(path, ' : ', len(doc_bin))

training\corpus\converted\Final_SCIREX_dev.spacy  :  17871
training\corpus\converted\Final_SCIREX_test.spacy  :  19429
training\corpus\converted\Final_SCIREX_train.spacy  :  83132


In [14]:
# Entity labels that survive the filtering; every other entity is dropped.
to_keep = ['Material', 'Metric', 'Task']

# Initialised here, filled by the label-report cell that follows.
labels = []
for path in converted_path.glob('*.spacy'):
    input_bin = DocBin().from_disk(path)
    output_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    for doc in input_bin.get_docs(nlp.vocab):
        # Re-assign the doc's entities to only those whose label is in to_keep.
        doc.ents = tuple(filter(lambda ent: ent.label_ in to_keep, doc.ents))
        output_bin.add(doc)

    # The text after the last underscore of the file name becomes the output suffix:
    # "<anything>_<suffix>" is written as "tdm_<suffix>" directly under training_path.
    suffix = path.name.rsplit('_', 1)[-1]
    output_path = training_path / f"tdm_{suffix}"
    output_bin.to_disk(output_path)

In [15]:
# Show which entity labels occur in each .spacy file directly under training_path.
# `labels` is reset here so the cell does not depend on another cell having created
# it, and re-running the cell does not keep growing the list.
labels = []
for path in training_path.glob('*.spacy'):
    input_bin = DocBin().from_disk(path)
    file_labels = [
        ent.label_
        for doc in input_bin.get_docs(nlp.vocab)
        for ent in doc.ents
    ]
    labels += file_labels  # keep the combined list of labels across all files
    # Print this file's own label set, not the running total over the files before it.
    print(path, ' : ', set(file_labels))

training\corpus\tdm_dev.spacy  :  {'Material', 'Task', 'Metric'}
training\corpus\tdm_test.spacy  :  {'Material', 'Task', 'Metric'}
training\corpus\tdm_train.spacy  :  {'Material', 'Task', 'Metric'}


In [8]:
# Build a small corpus by writing a random sample of blank-line-separated chunks
# from each fixed file into small_path.
random.seed(42)

DEV_SIZE = TEST_SIZE = 500
TRAIN_SIZE = 1500

# write_text() does not create missing parent directories.
small_path.mkdir(parents=True, exist_ok=True)

# glob() yields files in filesystem order, which can differ between machines.
# Sorting fixes which size is paired with which file, and the order in which files
# consume the seeded random stream, so the sample is reproducible.
# NOTE(review): this assumes the sorted file names come out as dev, test, train - confirm.
for path, size in zip(sorted(fixed_path.glob('*.txt')), [DEV_SIZE, TEST_SIZE, TRAIN_SIZE]):
    sentences = path.read_text(encoding='utf-8').split('\n\n')
    random.shuffle(sentences)
    (small_path / path.name).write_text('\n\n'.join(sentences[:size]), encoding='utf-8')

In [9]:
# Print the character length of every blank-line-separated chunk in each sampled file,
# one number per line.
for path in small_path.glob('*.txt'):
    chunks = path.read_text(encoding='utf-8').split('\n\n')
    print(*map(len, chunks), sep='\n')

214
133
90
80
4
28
85
57
49
85
140
204
134
27
209
44
185
53
230
514
266
119
219
133
219
64
327
363
55
518
304
219
139
224
113
216
190
130
227
121
178
158
34
223
204
386
277
63
198
32
61
13
220
174
386
167
454
259
400
184
133
59
278
329
155
113
30
158
291
32
173
64
284
217
174
91
187
118
288
278
212
309
11
184
206
139
134
77
161
72
78
81
22
131
308
314
157
149
152
345
203
117
124
121
128
187
18
66
326
264
178
731
216
47
182
85
268
416
299
236
132
174
235
246
128
212
102
110
411
112
413
263
118
148
243
146
145
107
86
26
157
173
147
417
287
64
90
166
15
39
59
119
235
46
59
71
128
91
200
106
168
243
145
154
154
114
207
97
100
146
199
176
163
114
189
299
240
169
175
147
198
107
151
176
362
49
196
31
112
355
464
131
132
300
49
153
163
199
182
180
263
130
262
205
73
261
37
139
286
249
297
37
87
71
154
282
95
371
100
194
31
258
324
344
307
272
138
170
35
299
58
216
258
73
25
223
26
336
117
89
60
174
208
19
205
263
212
203
21
190
117
238
135
31
118
198
203
191
178
240
275
171
30
84
121
155
9
36