In [1]:
# Code to convert brat annotated format to the formats required by spaCy and CRF NRF models, and to include BIO tags

# Usage:
# 1) Install the pycorenlp package
# 2) Run CoreNLP server (change STANFORD_SERVER_ADDRESS if needed)
# 3) Place .ann and .txt files from brat in the location specified in DATA_DIRECTORY
# 4) Run this script

# Cross-sentence annotation is not supported

import json
import os
from os import listdir
from os.path import isfile, join

from pycorenlp import StanfordCoreNLP

# Tag written for tokens that are not part of any annotated entity
OTHER_ANNO = 'O'
# First character of the ID of an entity line in a brat .ann file (e.g. "T1")
ENTITY_PREFIX = 'T'
# First character of the ID of a relation line in a brat .ann file (e.g. "R1")
RELATION_PREFIX = 'R'
# Directory containing the brat .ann files and their matching .txt files
DATA_DIRECTORY = "C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\mondaq_data_project1_set2"
# Directory the generated training files are written to
OUTPUT_DIRECTORY = 'C:\\Users\\rohin\\Rohini_Mondaq_Dessertation\\mondaq_data_project1_set2\\output'
# Address of the running CoreNLP server used for tokenization and POS tagging
STANFORD_SERVER_ADDRESS = 'http://localhost:9000'

# NER training data: one "<word>\t<tag>" row per token
NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-data.tsv')
# RE training data: per-sentence token rows followed by that sentence's relation rows
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-data.corp')

# Start from a clean slate: create the output directory when it is missing,
# otherwise delete the training files left behind by a previous run
# (the conversion loop opens them in append mode).
if not os.path.exists(OUTPUT_DIRECTORY):
    os.makedirs(OUTPUT_DIRECTORY)
else:
    for stale_output_path in (NER_TRAINING_DATA_OUTPUT_PATH, RE_TRAINING_DATA_OUTPUT_PATH):
        if os.path.exists(stale_output_path):
            os.remove(stale_output_path)

# Running sentence number across all documents; written as the first column
# of every token row in the RE training data.
sentence_count = 1
# Client for the CoreNLP server that tokenizes, sentence-splits and POS-tags the text
nlp = StanfordCoreNLP(STANFORD_SERVER_ADDRESS)

# Collect the .ann files in the data directory.
# The extension is compared via splitext so that files without a dot
# (e.g. "README") no longer raise IndexError and names containing extra dots
# (e.g. "v1.2.ann") are still recognised.
ann_data_files = [
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and os.path.splitext(f)[1] == '.ann'
]

def _parse_ann_file(ann_path):
    """Parse a brat .ann file into entity and relation records.

    Only lines whose ID starts with ENTITY_PREFIX or RELATION_PREFIX are kept;
    all other annotation lines and blank lines are skipped.

    Args:
        ann_path: path of the .ann file to read (UTF-8).

    Returns:
        A tuple ``(entities, relations)`` of two lists of dicts.
        Entity dicts have the keys 'standoff_id', 'entity_type' (capitalized),
        'offset_start', 'offset_end' and 'word' (first word of the entity text).
        Relation dicts have the keys 'standoff_id', 'name',
        'standoff_entity1_id' and 'standoff_entity2_id'.

    Raises:
        ValueError: if an ID or offset is not a plain integer. This includes
            brat discontinuous spans ("10 15;20 25"), which are not supported.
    """
    entities = []
    relations = []
    with open(ann_path, 'r', encoding="utf8") as document_anno_file:
        for line in document_anno_file:
            standoff_line = line.split()
            # a blank line yields no fields; indexing it would raise IndexError
            if not standoff_line:
                continue

            record_id = standoff_line[0]
            if record_id[0] == ENTITY_PREFIX:
                entities.append({
                    'standoff_id': int(record_id[1:]),
                    'entity_type': standoff_line[1].capitalize(),
                    'offset_start': int(standoff_line[2]),
                    'offset_end': int(standoff_line[3]),
                    'word': standoff_line[4],
                })
            elif record_id[0] == RELATION_PREFIX:
                relations.append({
                    'standoff_id': int(record_id[1:]),
                    'name': standoff_line[1],
                    # arguments look like "Arg1:T3" -> keep the numeric part of "T3"
                    'standoff_entity1_id': int(standoff_line[2].split(':')[1][1:]),
                    'standoff_entity2_id': int(standoff_line[3].split(':')[1][1:]),
                })
    return entities, relations


def _match_entity(offset_start, offset_end, entities):
    """Find the first annotated entity that covers a token.

    Args:
        offset_start: character offset at which the token begins.
        offset_end: character offset at which the token ends.
        entities: entity dicts as produced by ``_parse_ann_file``.

    Returns:
        ``(prefix, entity)`` for the first entity (in list order) the token
        falls into, or ``(None, None)`` when no entity matches. The prefix is
        'B-' when the token starts where the entity starts, 'I-' when it lies
        strictly inside the entity, and 'O-' when it starts after the entity
        start and ends exactly at the entity end.
    """
    for entity in entities:
        entity_start = entity['offset_start']
        entity_end = entity['offset_end']
        if offset_start == entity_start and offset_end <= entity_end:
            return 'B-', entity
        if offset_start > entity_start and offset_end < entity_end:
            return 'I-', entity
        if offset_start > entity_start and offset_end == entity_end:
            return 'O-', entity
    return None, None


def _tokenize_document(document_text):
    """Tokenize, sentence-split and POS-tag a document with the CoreNLP server.

    Args:
        document_text: full text of the document.

    Returns:
        The decoded JSON response (a dict with a 'sentences' list).

    Raises:
        RuntimeError: if the server response is not valid JSON.
    """
    output = nlp.annotate(document_text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })
    # pycorenlp returns the raw response text when its own JSON decoding
    # fails, so decode it here rather than crashing later with an opaque
    # "string indices must be integers" error.
    if isinstance(output, str):
        try:
            output = json.loads(output)
        except ValueError as err:
            raise RuntimeError(
                'CoreNLP server did not return JSON: {}'.format(output[:200])
            ) from err
    return output


# Convert every .ann/.txt pair: tag each CoreNLP token with the entity it
# belongs to and append the rows to the NER and RE training files.
for file in ann_data_files:
    # swap only the extension; str.replace would also rewrite ".ann" elsewhere in the name
    text_file = os.path.splitext(file)[0] + '.txt'

    # process .ann file - place entities and relations into 2 separate lists of dicts
    entities, relations = _parse_ann_file(join(DATA_DIRECTORY, file))

    # read the .ann's matching .txt file and tokenize its text using stanford corenlp
    with open(join(DATA_DIRECTORY, text_file), 'r', encoding="utf8") as document_text_file:
        document_text = document_text_file.read()

    output = _tokenize_document(document_text)

    # write text and annotations into NER and RE output files
    with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a', encoding="utf-8") as ner_training_data, \
            open(RE_TRAINING_DATA_OUTPUT_PATH, 'a', encoding="utf-8") as re_training_data:
        for sentence in output['sentences']:
            # standoff entity id -> row index (within this sentence) of the entity's first token
            entities_in_sentence = {}
            sentence_re_rows = []

            for token in sentence['tokens']:
                prefix, entity = _match_entity(
                    int(token['characterOffsetBegin']),
                    int(token['characterOffsetEnd']),
                    entities,
                )
                if entity is None:
                    tag = OTHER_ANNO
                else:
                    tag = prefix + entity['entity_type']
                    if prefix == 'B-':
                        # relations refer to an entity by the row of its first token
                        entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)

                sentence_re_rows.append({
                    'entity_type': tag,
                    'pos_tag': token['pos'],
                    'word': token['word'],
                })

                # writing tagged tokens to NER training data
                ner_training_data.write('{}\t{}\n'.format(token['word'], tag))

            # writing tagged tokens to RE training data
            for sentence_row in sentence_re_rows:
                re_training_data.write('{}\t{}\t{}\t{}\n'.format(
                    sentence_count,
                    sentence_row['entity_type'],
                    sentence_row['pos_tag'],
                    sentence_row['word'],
                ))

            # writing relations to RE training data; only relations whose two
            # entities both start in this sentence are emitted
            for relation in relations:
                entity1_id = relation['standoff_entity1_id']
                entity2_id = relation['standoff_entity2_id']
                if entity1_id in entities_in_sentence and entity2_id in entities_in_sentence:
                    re_training_data.write('{}\t{}\t{}\n'.format(
                        entities_in_sentence[entity1_id],
                        entities_in_sentence[entity2_id],
                        relation['name'],
                    ))

            sentence_count += 1

    print('Processed file pair: {} and {}'.format(file, text_file))


Processed file pair: 899190.ann and 899190.txt
Processed file pair: 899192.ann and 899192.txt
Processed file pair: 899194.ann and 899194.txt
Processed file pair: 899196.ann and 899196.txt
Processed file pair: 899198.ann and 899198.txt
Processed file pair: 899200.ann and 899200.txt
Processed file pair: 899202.ann and 899202.txt
Processed file pair: 899206.ann and 899206.txt
Processed file pair: 899208.ann and 899208.txt
Processed file pair: 899210.ann and 899210.txt
Processed file pair: 899212.ann and 899212.txt
Processed file pair: 899214.ann and 899214.txt
Processed file pair: 899216.ann and 899216.txt
Processed file pair: 899218.ann and 899218.txt
Processed file pair: 899220.ann and 899220.txt
Processed file pair: 899222.ann and 899222.txt
Processed file pair: 899224.ann and 899224.txt
Processed file pair: 899226.ann and 899226.txt
Processed file pair: 899228.ann and 899228.txt
Processed file pair: 899230.ann and 899230.txt
Processed file pair: 899232.ann and 899232.txt
Processed fil

Processed file pair: 899580.ann and 899580.txt
Processed file pair: 899582.ann and 899582.txt
Processed file pair: 899584.ann and 899584.txt
Processed file pair: 899586.ann and 899586.txt
Processed file pair: 899588.ann and 899588.txt
Processed file pair: 899592.ann and 899592.txt
Processed file pair: 899594.ann and 899594.txt
Processed file pair: 899596.ann and 899596.txt
Processed file pair: 899598.ann and 899598.txt
Processed file pair: 899600.ann and 899600.txt
Processed file pair: 899602.ann and 899602.txt
Processed file pair: 899604.ann and 899604.txt
Processed file pair: 899606.ann and 899606.txt
Processed file pair: 899608.ann and 899608.txt
Processed file pair: 899610.ann and 899610.txt
Processed file pair: 899612.ann and 899612.txt
Processed file pair: 899614.ann and 899614.txt
Processed file pair: 899616.ann and 899616.txt
Processed file pair: 899620.ann and 899620.txt
Processed file pair: 899622.ann and 899622.txt
Processed file pair: 899624.ann and 899624.txt
Processed fil

Processed file pair: 899986.ann and 899986.txt
Processed file pair: 899988.ann and 899988.txt
Processed file pair: 899990.ann and 899990.txt
Processed file pair: 899992.ann and 899992.txt
Processed file pair: 899994.ann and 899994.txt
Processed file pair: 899996.ann and 899996.txt
Processed file pair: 899998.ann and 899998.txt
Processed file pair: 900002.ann and 900002.txt
Processed file pair: 900004.ann and 900004.txt
Processed file pair: 900006.ann and 900006.txt
Processed file pair: 900010.ann and 900010.txt
Processed file pair: 900012.ann and 900012.txt
Processed file pair: 900014.ann and 900014.txt
Processed file pair: 900016.ann and 900016.txt
Processed file pair: 900020.ann and 900020.txt
Processed file pair: 900024.ann and 900024.txt
Processed file pair: 900026.ann and 900026.txt
Processed file pair: 900030.ann and 900030.txt
Processed file pair: 900032.ann and 900032.txt
Processed file pair: 900034.ann and 900034.txt
Processed file pair: 900036.ann and 900036.txt
Processed fil