In [1]:
import pandas as pd

import os
import random
import re

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from collections import Counter, OrderedDict, defaultdict
from itertools import islice
from sentence_annotation import *
from utils import *

# Seed Python's built-in `random` module so that code drawing from it is repeatable.
# NOTE(review): randomness drawn from NumPy (e.g. by scikit-learn helpers) is
# presumably not governed by this call — confirm and seed those separately.
random.seed(42)

## Convert the OntoNotes dataset to sentence level

In [2]:
# Directory holding the OntoNotes *.conll files.
subdir = './datasets/Ontonotes/'
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# before it is unpacked positionally. The unpacking expects exactly three
# *.conll files (ValueError otherwise).
# NOTE(review): assumes the file names sort alphabetically as dev < test < train.
ontonotes_dev, ontonotes_test, ontonotes_train = sorted(
    subdir + file for file in os.listdir(subdir) if file.endswith('conll')
)

In [6]:
# Run the same extraction over the three OntoNotes split files, in
# train, dev, test order.
# NOTE(review): w_idx / ner_idx presumably select the word and NER-tag columns
# of each CoNLL row — confirm against extract_ner_labels.
onto_train, onto_dev, onto_test = (
    extract_ner_labels(conll_path, dataset='ontonotes', w_idx=3, ner_idx=10)
    for conll_path in (ontonotes_train, ontonotes_dev, ontonotes_test)
)

In [93]:
# Produce the sentence-level version of each OntoNotes split (train, dev, test).
onto_train_sent_level, onto_dev_sent_level, onto_test_sent_level = (
    annotate_sentences(split) for split in (onto_train, onto_dev, onto_test)
)

5000 sentences processed...
10000 sentences processed...
Sentence annotation finished!


In [80]:
# Create the output directory; exist_ok=True keeps this cell re-runnable
# (os.mkdir raises FileExistsError once the directory is there).
os.makedirs(subdir + 'sentence_level_annotation', exist_ok=True)

In [94]:
# Write every split as a tab-separated file without header row or index column.
for sent_level_frame, rel_path in (
    (onto_train_sent_level, 'sentence_level_annotation/onto_sent_train.tsv'),
    (onto_dev_sent_level, 'sentence_level_annotation/onto_sent_dev.tsv'),
    (onto_test_sent_level, 'sentence_level_annotation/onto_sent_test.tsv'),
):
    sent_level_frame.to_csv(subdir + rel_path, sep='\t', index=False, header=False)

## Convert CoNLL-2003 to sentence level

In [2]:
# Directory holding the CoNLL-2003 split files.
subdir = './datasets/CONLL2003/'
# Collect the data files as a sorted list of paths:
#  * os.listdir() returns entries in arbitrary order, and the list is later
#    unpacked positionally, so a deterministic order is required;
#  * sub-directories are skipped, otherwise the `sentence_level_annotation`
#    output folder created further down would show up here on a re-run.
files = sorted(
    path
    for path in (os.path.join(subdir, file) for file in os.listdir(subdir))
    if os.path.isfile(path)
)

In [3]:
# Positional unpacking: the first entry of `files` becomes the dev path, the
# second the test path and the third the train path. Raises ValueError unless
# `files` holds exactly three entries.
# NOTE(review): this relies on `files` being ordered dev, test, train — confirm.
conll_dev, conll_test, conll_train = files

In [6]:
# Replace each split's file path with the data extracted from that file
# (train, dev, test order).
# NOTE: the names are rebound from path to extracted data, so this cell cannot
# be re-run without first re-running the cell that assigns the paths.
# NOTE(review): w_idx / ner_idx presumably select the word and NER-tag columns
# of each CoNLL row — confirm against extract_ner_labels.
conll_train, conll_dev, conll_test = (
    extract_ner_labels(conll_path, dataset='conll', w_idx=0, ner_idx=3)
    for conll_path in (conll_train, conll_dev, conll_test)
)

In [7]:
# Produce the sentence-level version of each CoNLL split (train, dev, test).
conll_train_sent_level, conll_dev_sent_level, conll_test_sent_level = (
    annotate_sentences(split) for split in (conll_train, conll_dev, conll_test)
)

5000 sentences processed...
10000 sentences processed...
Sentence annotation finished!
Sentence annotation finished!
Sentence annotation finished!


In [9]:
# Create the output directory; exist_ok=True keeps this cell re-runnable
# (os.mkdir raises FileExistsError once the directory is there).
os.makedirs(subdir + 'sentence_level_annotation', exist_ok=True)

In [10]:
# Write every split as a tab-separated file without header row or index column.
for sent_level_frame, rel_path in (
    (conll_train_sent_level, 'sentence_level_annotation/conll2003_sent_train.tsv'),
    (conll_dev_sent_level, 'sentence_level_annotation/conll2003_sent_dev.tsv'),
    (conll_test_sent_level, 'sentence_level_annotation/conll2003_sent_test.tsv'),
):
    sent_level_frame.to_csv(subdir + rel_path, sep='\t', index=False, header=False)

## Convert SemEval 2010 to sentence level

In [223]:
# Base directory from which the SemEval 2010 output paths are built.
subdir = './datasets/SemEval2010/'
# Dataset identifier passed to annotate_sentences via its `subtask` argument.
subtask = 'SemEval2010'

In [225]:
# Load the three raw SemEval 2010 inputs in one go: the training data, the
# test sentences, and the test labels (labels=True).
semeval_train, semeval_test_sents, semeval_test_labels = (
    load_semeval2010_files(train=True),
    load_semeval2010_files(train=False),
    load_semeval2010_files(train=False, labels=True),
)

In [226]:
# NOTE: every name below is rebound from its own previous value, so this cell
# is not idempotent — re-running it without re-running the loading cell
# re-processes already processed data and halves the test split again.
semeval_train = extract_rel_labels(semeval_train, train=True)
semeval_test_sents = extract_rel_labels(semeval_test_sents, train=False)
semeval_test_labels = extract_rel_labels(semeval_test_labels, train=False, keys=True)

# The data loaded with train=False is cut in half: first half -> dev,
# second half -> test. Each sequence uses its own midpoint.
semeval_sents_mid = len(semeval_test_sents) // 2
semeval_labels_mid = len(semeval_test_labels) // 2

semeval_dev_sents, semeval_test_sents = (
    semeval_test_sents[:semeval_sents_mid],
    semeval_test_sents[semeval_sents_mid:],
)
semeval_dev_labels, semeval_test_labels = (
    semeval_test_labels[:semeval_labels_mid],
    semeval_test_labels[semeval_labels_mid:],
)

# Sentences and labels must have the same length within each split.
assert len(semeval_dev_labels) == len(semeval_dev_sents)
assert len(semeval_test_labels) == len(semeval_test_sents)

# Bundle each split as a (sentences, labels) pair.
semeval_dev, semeval_test = (
    (semeval_dev_sents, semeval_dev_labels),
    (semeval_test_sents, semeval_test_labels),
)

In [5]:
# Produce the sentence-level version of each SemEval 2010 split
# (train, dev, test) using the relation-extraction task setting.
semeval_train_sent_level, semeval_dev_sent_level, semeval_test_sent_level = (
    annotate_sentences(split, task='RelExtract', subtask=subtask)
    for split in (semeval_train, semeval_dev, semeval_test)
)

5000 sentences processed...
Sentence annotation finished!
Sentence annotation finished!
Sentence annotation finished!


In [6]:
# Make sure the output directory exists before the TSV files are written.
# exist_ok=True makes the call a no-op when the directory is already there,
# so the cell is safe on both a fresh checkout and a re-run.
os.makedirs(subdir + 'sentence_level_annotation', exist_ok=True)

In [7]:
# Write every split as a tab-separated file without header row or index column.
for sent_level_frame, rel_path in (
    (semeval_train_sent_level, 'sentence_level_annotation/semeval_sent_train.tsv'),
    (semeval_dev_sent_level, 'sentence_level_annotation/semeval_sent_dev.tsv'),
    (semeval_test_sent_level, 'sentence_level_annotation/semeval_sent_test.tsv'),
):
    sent_level_frame.to_csv(subdir + rel_path, sep='\t', index=False, header=False)

## Convert SemEval 2007 to sentence level

In [217]:
# Arguments handed to extract_rels_semeval2007 for the training and test data.
# NOTE(review): presumably file names or name prefixes inside the SemEval 2007
# dataset folder — confirm against extract_rels_semeval2007.
train_file = 'train'
test_file = 'key'
# Dataset identifier passed to annotate_sentences via its `subtask` argument.
subtask = 'SemEval2007'

In [218]:
# Extract a (sentences, labels) pair for the training file and for the test file.
(
    (semeval2007_train_sents, semeval2007_train_labels),
    (semeval2007_test_sents, semeval2007_test_labels),
) = (extract_rels_semeval2007(split_file) for split_file in (train_file, test_file))

In [219]:
# Cut the test data in half: first half -> dev, second half -> test.
# Each sequence uses its own midpoint.
# NOTE: the test names are rebound from themselves, so re-running this cell
# without re-running the extraction cell halves the test split again.
semeval2007_sents_mid = len(semeval2007_test_sents) // 2
semeval2007_labels_mid = len(semeval2007_test_labels) // 2

semeval2007_dev_sents, semeval2007_test_sents = (
    semeval2007_test_sents[:semeval2007_sents_mid],
    semeval2007_test_sents[semeval2007_sents_mid:],
)
semeval2007_dev_labels, semeval2007_test_labels = (
    semeval2007_test_labels[:semeval2007_labels_mid],
    semeval2007_test_labels[semeval2007_labels_mid:],
)

In [220]:
# Bundle each split as a (sentences, labels) pair, the form handed to
# annotate_sentences for the relation-extraction datasets.
semeval2007_train, semeval2007_dev, semeval2007_test = (
    (semeval2007_train_sents, semeval2007_train_labels),
    (semeval2007_dev_sents, semeval2007_dev_labels),
    (semeval2007_test_sents, semeval2007_test_labels),
)

In [7]:
# Produce the sentence-level version of each SemEval 2007 split
# (train, dev, test) using the relation-extraction task setting.
(
    semeval2007_train_sent_level,
    semeval2007_dev_sent_level,
    semeval2007_test_sent_level,
) = (
    annotate_sentences(split, task='RelExtract', subtask=subtask)
    for split in (semeval2007_train, semeval2007_dev, semeval2007_test)
)

Sentence annotation finished!
Sentence annotation finished!
Sentence annotation finished!


In [9]:
# Point `subdir` at the SemEval 2007 dataset folder. Without this assignment
# `subdir` keeps whatever value an earlier section left behind, and the files
# written in the next cell would land in that other dataset's directory.
subdir = './datasets/SemEval2007/'
# Create the output directory; exist_ok=True keeps the cell re-runnable.
os.makedirs(subdir + 'sentence_level_annotation', exist_ok=True)

In [10]:
# Write every split as a tab-separated file without header row or index column.
# `subdir` is whatever was last assigned in the kernel — make sure it points at
# the intended dataset directory before running this cell.
for sent_level_frame, rel_path in (
    (semeval2007_train_sent_level, 'sentence_level_annotation/semeval2007_sent_train.tsv'),
    (semeval2007_dev_sent_level, 'sentence_level_annotation/semeval2007_sent_dev.tsv'),
    (semeval2007_test_sent_level, 'sentence_level_annotation/semeval2007_sent_test.tsv'),
):
    sent_level_frame.to_csv(subdir + rel_path, sep='\t', index=False, header=False)

## Convert Wikipedia to sentence level

In [2]:
# Dataset identifier passed to annotate_sentences via its `subtask` argument.
subtask = 'Wiki'

In [3]:
def str_to_list_zuco(sents:np.ndarray):
    """Turn ZuCo sentence records into lists of whitespace-separated tokens.

    Args:
        sents: iterable of records, each exposing its text as a ``content``
            string attribute.

    Returns:
        A list with one entry per record: the tokens of ``content`` split on
        whitespace, followed by one trailing empty string.
    """
    tokenised = []
    for record in sents:
        tokens = record.content.split()
        tokens.append('')  # every token list ends with an empty-string entry
        tokenised.append(tokens)
    return tokenised

In [4]:
# Locate the .mat files of the two tasks.
files_task_2, files_task_3 = get_matfiles('task2'), get_matfiles('task3')

# Only the first file of each task is read, and only its 'sentenceData' entry
# is kept. struct_as_record=False makes MATLAB structs load as objects with
# attribute access; squeeze_me=True drops singleton dimensions.
data_task_2_sbj1, data_task_3_sbj1 = (
    io.loadmat(task_files[0], squeeze_me=True, struct_as_record=False)['sentenceData']
    for task_files in (files_task_2, files_task_3)
)

In [5]:
# Tokenise the sentences of both tasks, then pool them into a single list.
sents_task_2, sents_task_3 = map(str_to_list_zuco, (data_task_2_sbj1, data_task_3_sbj1))
all_sents_zuco = sents_task_2 + sents_task_3

In [6]:
# Load the Wikipedia relation data as parallel sentence / label sequences.
wiki_sents, wiki_labels = extract_wiki_rels(load_wiki_dataset())
# Shuffle both sequences together. sklearn's shuffle is not controlled by
# Python's random.seed(), so random_state is pinned here to make the
# resulting order reproducible across runs.
wiki_sents, wiki_labels = shuffle(wiki_sents, wiki_labels, random_state=42)

In [7]:
# Hold out 33% of the data; random_state fixes the partition across runs.
(
    wiki_train_sents,
    wiki_test_sents,
    wiki_train_labels,
    wiki_test_labels,
) = train_test_split(
    wiki_sents,
    wiki_labels,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Cut the held-out data in half: first half -> dev, second half -> test.
# Each sequence uses its own midpoint.
# NOTE: the test names are rebound from themselves, so re-running this cell
# without re-running the train/test split halves the test split again.
wiki_sents_mid = len(wiki_test_sents) // 2
wiki_labels_mid = len(wiki_test_labels) // 2

wiki_dev_sents, wiki_test_sents = (
    wiki_test_sents[:wiki_sents_mid],
    wiki_test_sents[wiki_sents_mid:],
)
wiki_dev_labels, wiki_test_labels = (
    wiki_test_labels[:wiki_labels_mid],
    wiki_test_labels[wiki_labels_mid:],
)

In [9]:
# Training data is bundled as-is into a (sentences, labels) pair.
wiki_train = (wiki_train_sents, wiki_train_labels)
# Only dev and test are passed through filter_sents together with the pooled
# ZuCo sentences.
# NOTE(review): presumably this handles overlap between the Wikipedia data and
# the ZuCo sentences — confirm against filter_sents.
wiki_dev, wiki_test = (
    filter_sents(split_sents, split_labels, all_sents_zuco)
    for split_sents, split_labels in (
        (wiki_dev_sents, wiki_dev_labels),
        (wiki_test_sents, wiki_test_labels),
    )
)

In [10]:
# Produce the sentence-level version of each Wikipedia split
# (train, dev, test) using the relation-extraction task setting.
wiki_train_sent_level, wiki_dev_sent_level, wiki_test_sent_level = (
    annotate_sentences(split, task='RelExtract', subtask=subtask)
    for split in (wiki_train, wiki_dev, wiki_test)
)

Sentence annotation finished!
Sentence annotation finished!
Sentence annotation finished!


In [11]:
# Point `subdir` at the Wikipedia dataset folder. Without this assignment
# `subdir` is either undefined (fresh kernel running only this section) or
# still holds another dataset's directory, so the files written in the next
# cell would fail or land in the wrong place.
subdir = './datasets/Wikipedia/'
# Create the output directory; exist_ok=True keeps the cell re-runnable.
os.makedirs(subdir + 'sentence_level_annotation', exist_ok=True)

In [12]:
# Write every split as a tab-separated file without header row or index column.
# `subdir` is whatever was last assigned in the kernel — make sure it points at
# the intended dataset directory before running this cell.
for sent_level_frame, rel_path in (
    (wiki_train_sent_level, 'sentence_level_annotation/wiki_sent_train.tsv'),
    (wiki_dev_sent_level, 'sentence_level_annotation/wiki_sent_dev.tsv'),
    (wiki_test_sent_level, 'sentence_level_annotation/wiki_sent_test.tsv'),
):
    sent_level_frame.to_csv(subdir + rel_path, sep='\t', index=False, header=False)