In [1]:
# A Python script that converts annotated data in brat standoff format into the formats expected by the Stanford NER and Relation Extractor models
# - NER format based on: http://nlp.stanford.edu/software/crf-faq.html#a
# - RE format based on: http://nlp.stanford.edu/software/relationExtractor.html#training

# Usage:
# 1) Install the pycorenlp package
# 2) Run CoreNLP server (change CORENLP_SERVER_ADDRESS if needed)
# 3) Place .ann and .txt files from brat in the location specified in DATA_DIRECTORY
# 4) Run this script

# Cross-sentence annotation is not supported

from pycorenlp import StanfordCoreNLP
import os
from os import listdir
from os.path import isfile, join

# Tag written for tokens that are not covered by any annotated entity.
DEFAULT_OTHER_ANNO = 'O'
# First character of a standoff annotation id: 'T' lines are entities,
# 'R' lines are relations.
STANDOFF_ENTITY_PREFIX = 'T'
STANDOFF_RELATION_PREFIX = 'R'

# Input/output locations and the CoreNLP server address can be overridden with
# environment variables so the notebook runs on other machines without editing
# the code. An unset or empty variable falls back to the original value.
DATA_DIRECTORY = os.environ.get('BRAT_DATA_DIRECTORY') or \
    "C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\stanford_ner\\testing"
OUTPUT_DIRECTORY = os.environ.get('BRAT_OUTPUT_DIRECTORY') or \
    'C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\stanford_ner\\testing_results'
CORENLP_SERVER_ADDRESS = os.environ.get('CORENLP_SERVER_ADDRESS') or 'http://localhost:9000'

# Output files: token/tag rows for the NER model, and the sentence/relation
# corpus for the relation extractor.
NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-testing-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-testing-data.corp')

# Start every run from a clean slate: create the output directory when it is
# missing, otherwise delete the output files left by a previous run (the files
# are opened in append mode later, so stale content would be duplicated).
if not os.path.exists(OUTPUT_DIRECTORY):
    os.makedirs(OUTPUT_DIRECTORY)
else:
    for stale_output_path in (NER_TRAINING_DATA_OUTPUT_PATH, RE_TRAINING_DATA_OUTPUT_PATH):
        if os.path.exists(stale_output_path):
            os.remove(stale_output_path)

# Running sentence index; it is never reset between documents, so the sentence
# id written to the RE output keeps increasing across the whole run.
sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# Collect the .ann files in the data directory. The extension is checked with
# splitext so that names without a dot do not raise and names with extra dots
# (e.g. "a.b.ann") are still picked up; sorting keeps the processing order
# (and therefore the output files) reproducible across platforms.
ann_data_files = sorted(
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and os.path.splitext(f)[1] == '.ann'
)

def _read_standoff_annotations(ann_path):
    """Parse a brat standoff (.ann) file into entity and relation records.

    Lines whose id starts with STANDOFF_ENTITY_PREFIX become entity dicts with
    the keys standoff_id, entity_type, offset_start, offset_end and word.
    Lines whose id starts with STANDOFF_RELATION_PREFIX become relation dicts
    with the keys standoff_id, name, standoff_entity1_id and
    standoff_entity2_id. Blank lines and all other annotation kinds are
    skipped.

    Args:
        ann_path: path of the .ann file to read (UTF-8).

    Returns:
        A tuple (entities, relations) of two lists of dicts, in file order.

    Raises:
        ValueError, IndexError: if an entity or relation line does not have
            the whitespace-separated layout read below, e.g. an entity with a
            discontinuous span such as "0 5;10 15".
    """
    entities = []
    relations = []

    with open(ann_path, 'r', encoding="utf8") as document_anno_file:
        for line in document_anno_file:
            standoff_line = line.split()
            # a blank line has no fields; indexing it would raise IndexError
            if not standoff_line:
                continue

            annotation_kind = standoff_line[0][0]
            if annotation_kind == STANDOFF_ENTITY_PREFIX:
                entities.append({
                    'standoff_id': int(standoff_line[0][1:]),
                    'entity_type': standoff_line[1].capitalize(),
                    'offset_start': int(standoff_line[2]),
                    'offset_end': int(standoff_line[3]),
                    # only the first whitespace-separated token of the entity text
                    'word': standoff_line[4],
                })
            elif annotation_kind == STANDOFF_RELATION_PREFIX:
                relations.append({
                    'standoff_id': int(standoff_line[0][1:]),
                    'name': standoff_line[1],
                    # arguments look like "Arg1:T3"; keep the numeric entity id
                    'standoff_entity1_id': int(standoff_line[2].split(':')[1][1:]),
                    'standoff_entity2_id': int(standoff_line[3].split(':')[1][1:]),
                })

    return entities, relations


for file in ann_data_files:
    # Swap only the extension; str.replace would also rewrite any other
    # '.ann' occurring earlier in the file name.
    text_file = os.path.splitext(file)[0] + '.txt'

    # process .ann file - place entities and relations into 2 separate lists of dicts
    entities, relations = _read_standoff_annotations(join(DATA_DIRECTORY, file))

    # read the .ann's matching .txt file and tokenize its text using stanford corenlp
    # NOTE(review): text mode translates '\r\n' to '\n'; if the annotation
    # offsets were computed on CRLF text they will drift - confirm the files use LF.
    with open(join(DATA_DIRECTORY, text_file), 'r', encoding="utf8") as document_text_file:
        document_text = document_text_file.read()

    output = nlp.annotate(document_text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })

    # pycorenlp hands back the raw response text when the server reply is not
    # valid JSON (e.g. a timeout message); fail with a readable error instead
    # of an opaque TypeError on output['sentences'].
    if not isinstance(output, dict):
        raise RuntimeError('CoreNLP did not return JSON for {}: {}'.format(text_file, str(output)[:200]))

    # write text and annotations into NER and RE output files
    with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a', encoding="utf-8") as ner_training_data, \
            open(RE_TRAINING_DATA_OUTPUT_PATH, 'a', encoding="utf-8") as re_training_data:
        for sentence in output['sentences']:
            # standoff entity id -> index of its row within this sentence's RE rows
            entities_in_sentence = {}
            sentence_re_rows = []

            for token in sentence['tokens']:
                offset_start = int(token['characterOffsetBegin'])
                offset_end = int(token['characterOffsetEnd'])

                entity_found = False
                ner_anno = DEFAULT_OTHER_ANNO

                # searching for token in annotated entities
                for entity in entities:
                    # NER: every token lying inside the entity span gets the entity tag
                    if offset_start >= entity['offset_start'] and offset_end <= entity['offset_end']:
                        ner_anno = entity['entity_type']

                    # multi-token entities for RE need to be handled differently than NER:
                    # the first token opens a row, following tokens are joined onto it with '/'
                    if offset_start == entity['offset_start'] and offset_end <= entity['offset_end']:
                        entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)
                        sentence_re_rows.append({
                            'entity_type': entity['entity_type'],
                            'pos_tag': token['pos'],
                            'word': token['word'],
                        })
                        entity_found = True
                        break
                    elif offset_start > entity['offset_start'] and offset_end <= entity['offset_end'] and len(sentence_re_rows) > 0:
                        # NOTE(review): this joins onto the last row whichever entity
                        # (or 'O' token) produced it - confirm that is intended when an
                        # entity span does not begin on a token boundary.
                        sentence_re_rows[-1]['pos_tag'] += '/{}'.format(token['pos'])
                        sentence_re_rows[-1]['word'] += '/{}'.format(token['word'])
                        entity_found = True
                        break

                if not entity_found:
                    sentence_re_rows.append({
                        'entity_type': DEFAULT_OTHER_ANNO,
                        'pos_tag': token['pos'],
                        'word': token['word'],
                    })

                # writing tagged tokens to NER training data
                ner_training_data.write('{}\t{}\n'.format(token['word'], ner_anno))

            # writing tagged tokens to RE training data
            for token_count, sentence_row in enumerate(sentence_re_rows):
                re_training_data.write('{}\t{}\t{}\tO\t{}\t{}\tO\tO\tO\n'.format(str(sentence_count), sentence_row['entity_type'], str(token_count), sentence_row['pos_tag'], sentence_row['word']))

            re_training_data.write('\n')

            # writing relations to RE training data; only relations whose two
            # entities both start in this sentence are emitted
            for relation in relations:
                if relation['standoff_entity1_id'] in entities_in_sentence and relation['standoff_entity2_id'] in entities_in_sentence:
                    entity1 = str(entities_in_sentence[relation['standoff_entity1_id']])
                    entity2 = str(entities_in_sentence[relation['standoff_entity2_id']])
                    relation_name = relation['name']
                    re_training_data.write('{}\t{}\t{}\n'.format(entity1, entity2, relation_name))

            re_training_data.write('\n')

            sentence_count += 1

        # blank line after each document in the NER output
        ner_training_data.write('\n')

    print('Processed file pair: {} and {}'.format(file, text_file))


Processed file pair: 899872.ann and 899872.txt
Processed file pair: 899874.ann and 899874.txt
Processed file pair: 899876.ann and 899876.txt
Processed file pair: 899878.ann and 899878.txt
Processed file pair: 899880.ann and 899880.txt
Processed file pair: 899882.ann and 899882.txt
Processed file pair: 899884.ann and 899884.txt
Processed file pair: 899888.ann and 899888.txt
Processed file pair: 899890.ann and 899890.txt
Processed file pair: 899892.ann and 899892.txt
Processed file pair: 899894.ann and 899894.txt
Processed file pair: 899896.ann and 899896.txt
Processed file pair: 899898.ann and 899898.txt
Processed file pair: 899900.ann and 899900.txt
Processed file pair: 899902.ann and 899902.txt
Processed file pair: 899904.ann and 899904.txt
Processed file pair: 899906.ann and 899906.txt
Processed file pair: 899908.ann and 899908.txt
Processed file pair: 899910.ann and 899910.txt
Processed file pair: 899912.ann and 899912.txt
Processed file pair: 899914.ann and 899914.txt
Processed fil

Processed file pair: 900274.ann and 900274.txt
Processed file pair: 900276.ann and 900276.txt
Processed file pair: 900278.ann and 900278.txt
Processed file pair: 900282.ann and 900282.txt
Processed file pair: 900284.ann and 900284.txt
Processed file pair: 900286.ann and 900286.txt
Processed file pair: 900288.ann and 900288.txt
Processed file pair: 900290.ann and 900290.txt
Processed file pair: 900292.ann and 900292.txt
Processed file pair: 900298.ann and 900298.txt
Processed file pair: 900302.ann and 900302.txt
Processed file pair: 900312.ann and 900312.txt
Processed file pair: 900320.ann and 900320.txt
Processed file pair: 900324.ann and 900324.txt
Processed file pair: 900328.ann and 900328.txt
Processed file pair: 900334.ann and 900334.txt
Processed file pair: 900342.ann and 900342.txt
Processed file pair: 900344.ann and 900344.txt
Processed file pair: 900346.ann and 900346.txt
Processed file pair: 900348.ann and 900348.txt
Processed file pair: 900350.ann and 900350.txt
Processed fil

ImportError: cannot import name 'sentencebreaks_to_newlines' from 'sentencesplit' (C:\Users\rohin\sentencesplit.py)