# Doccano data

Export the data in the doccano (http://doccano.herokuapp.com/) JSON format, one object per line: { "id": 0, "data": "text", "label": [] }

In [85]:
import json
from langdetect import detect
from nltk.tokenize import word_tokenize
import numpy as np
import os
import pandas as pd
import re
from termcolor import colored
from IPython.display import clear_output

In [5]:
def read_data_file(file_name):
    """Load a CSV file into a pandas DataFrame, decoding it as Latin-1.

    file_name: path of the CSV file; it is handed to pandas.read_csv as is.

    Returns the DataFrame produced by pandas.read_csv.
    """
    table = pd.read_csv(file_name, encoding="latin1")
    return table

In [6]:
def get_speech_id(file_name, speeches):
    """Look up the Speech_ID of the speech stored in a text file.

    The first whitespace-separated token of the file name is used as the
    date and the second one, up to its first underscore, as the speaker
    name. The identifier "<Speaker> <date>" is normalised with
    fix_speech_identifier() and matched against the "Speech_Identifier"
    column of `speeches`.

    file_name: name of the text file (without directory).
    speeches: DataFrame with "Speech_Identifier" and "Speech_ID" columns.

    Returns the Speech_ID as an int, or None when the file name cannot be
    parsed or does not match exactly one speech with a usable id.
    """
    try:
        file_name_parts = file_name.split()
        date = file_name_parts[0]
        speaker = file_name_parts[1].split("_")[0]
        # Upper-case only the first letter; str.capitalize() would also
        # lower-case the rest of the name.
        speaker = speaker[0].upper() + speaker[1:]
        speech_identifier = fix_speech_identifier(f"{speaker} {date}")
        matches = speeches[speeches["Speech_Identifier"] == speech_identifier]["Speech_ID"]
        if len(matches) != 1:
            # No match or an ambiguous match: treat the file as unknown.
            return None
        # Calling int() on a Series is deprecated in pandas, so convert
        # the single matching element explicitly.
        return int(matches.iloc[0])
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Best effort: unparsable names or unusable tables mean "unknown".
        return None

In [7]:
def get_paragraph_ids(speech_id, speech_contents):
    """Collect the paragraphs registered for one speech.

    speech_id: value to match against the "Speech_ID" column.
    speech_contents: DataFrame with "Speech_ID", "Speech_Content_ID" and
        "Speech_Content_Title" columns.

    Returns a dict mapping each Speech_Content_ID of the speech to its
    Speech_Content_Title. The dict is empty when nothing matches or when
    the table cannot be queried.
    """
    paragraph_ids = {}
    try:
        selected = speech_contents[speech_contents["Speech_ID"] == speech_id]
        for _, row in selected.iterrows():
            paragraph_ids[row["Speech_Content_ID"]] = row["Speech_Content_Title"]
    except (AttributeError, KeyError, TypeError):
        # Best effort: a missing column or unusable table yields whatever
        # was collected so far instead of aborting the whole run.
        pass
    return paragraph_ids

In [8]:
def check_paragraphs(speech_id, paragraph_ids, map_contents, file_name):
    """Mark the paragraphs of a speech that are referenced by map_contents.

    speech_id: value to match against the "Content_Speech_ID" column.
    paragraph_ids: dict mapping paragraph source ids to paragraph titles.
    map_contents: DataFrame with "Content_Speech_ID" and
        "Content_Source_ID" columns.
    file_name: only used in the warning message.

    Returns a dict with a "<speech_id> <paragraph title>" key set to True
    for every referenced paragraph that is known. For references to
    unknown paragraph ids a red warning is printed instead.
    """
    known = {}
    speech_rows = map_contents[map_contents["Content_Speech_ID"] == speech_id]
    for _, entry in speech_rows.iterrows():
        source_id = entry["Content_Source_ID"]
        if source_id in paragraph_ids:
            known[f'{speech_id} {paragraph_ids[source_id]}'] = True
            continue
        print(colored(f'warning: unknown paragraph id {source_id} for document {speech_id}; file name: {file_name}', "red"))
    return known

In [9]:
def read_paragraphs(file_name):
    """Read a Latin-1 text file and return its lines.

    file_name: path of the file to read.

    Returns a list with one entry per line of the file, each stripped of
    leading and trailing whitespace (empty lines become empty strings).
    """
    # The context manager closes the file even when reading fails.
    with open(file_name, "r", encoding="latin1") as data_file:
        return [line.strip() for line in data_file]

In [27]:
def select_paragraphs(paragraph_list, paragraph_values, speech_id):
    """Extract the labelled paragraphs from the lines of a speech file.

    A line whose first token consists of digits, a dash, digits and any
    number of trailing colons starts a labelled paragraph. The text is the
    rest of that line (after an optional separate ":" token) or, when the
    rest is empty, the next non-empty line.

    paragraph_list: the lines of the file.
    paragraph_values: dict keyed by "<speech_id> <label>"; labels found
        here that are not yet present are added with the value False.
    speech_id: prefix used to build the keys.

    Returns a dict mapping "<speech_id> <label>" to the paragraph text
    with its tokens joined by single spaces.
    """
    selected = {}
    awaiting_text = False
    for line in paragraph_list:
        words = line.split()
        if words and re.search(r'^\d+-\d+:*$', words[0]):
            label = words[0].replace(":", "")
            key = f"{speech_id} {label}"
            awaiting_text = True
            # Drop the label and, if present, a colon written as its own token.
            words = words[2:] if words[1:2] == [":"] else words[1:]
        if not (words and awaiting_text):
            continue
        selected[key] = " ".join(words)
        paragraph_values.setdefault(key, False)
        awaiting_text = False
    return selected

In [11]:
def guess_language(paragraph_texts):
    """Guess the language of a speech from its selected paragraphs.

    paragraph_texts: dict whose values are the paragraph texts; they are
        joined with spaces and passed to langdetect's detect().

    Returns the value returned by detect(), or "unk" when detection
    fails.
    """
    text = " ".join(paragraph_texts.values())
    try:
        return detect(text)
    except Exception:
        # Detection is best effort (presumably it fails on empty or
        # feature-less text). Unlike a bare except, this still lets
        # KeyboardInterrupt and SystemExit through.
        return "unk"

In [17]:
def fix_speech_identifier(speech_identifier):
    """Rewrite a "<speaker> <date>" identifier built from a file name.

    A fixed list of regular-expression substitutions is applied in order;
    each one replaces a speaker name and/or date by another one. The order
    matters: for example "Schröder" is rewritten to "Schroeder" before the
    date fixes for "Schroeder" are tried.

    speech_identifier: the identifier to correct.

    Returns the identifier after all substitutions.
    """
    corrections = (
        ("Simor 2010-05-25", "Simor 2010-05-26"),
        ("^Mario ", "Draghi "),
        ("^PM ", "Cameron "),
        ("^Thorning ", "Thorning-Schmidt "),
        ("Remarks 2009-12-11", "Honohan 2009-12-11"),
        ("This 2013-02-11", "Cameron 2013-02-11"),
        ("Mervyn ", "King "),
        ("Patrick 2013-03-19", "Honohan 2013-03-19"),
        ("Statement 2014-12-18", "Kenny 2014-12-19"),
        ("Speech 2012-06-29", "Cameron 2012-06-29"),
        ("The 2012-01-30", "Cameron 2012-01-30"),
        ("Orban ", "Orbán "),
        ("The 2014-10-24", "Cameron 2014-10-24"),
        ("Speech 2013-03-07", "Kenny 2013-07-03"),
        ("David 2014-11-10", "Cameron 2014-11-10"),
        ("Statement 2012-07-04", "Kenny 2012-07-04"),
        ("Schröder", "Schroeder"),
        ("Schroeder 1998-12-14", "Schroeder 1999-12-14"),
        ("Schroeder 2001-10-26", "Schroeder 2001-10-16"),
        ("Hollande 2015-05-19", "Hollande 2015-03-19"),
        ("Fernandez 2009-11-23", "Fernández Ordóñez  2009-11-23"),
    )
    for pattern, replacement in corrections:
        speech_identifier = re.sub(pattern, replacement, speech_identifier)
    return speech_identifier

In [181]:
def find_phrase(text, phrase, start_index=0):
    """Find the first case-insensitive occurrence of a phrase in a text.

    text: the string to search in.
    phrase: the string to look for.
    start_index: position in `text` at which the search starts.

    Returns the index of the first match at or after start_index, or None
    when there is no match or when text or phrase is not a string (for
    example a NaN read from a table).
    """
    try:
        # str.find also checks the last possible position, so a phrase at
        # the very end of the text (or equal to the text) is found.
        position = text.lower().find(phrase.lower(), start_index)
    except (AttributeError, TypeError):
        return None
    return position if position >= 0 else None

In [177]:
def make_doccano_data(speech_id, paragraph_texts, paragraph_ids, map_contents, misses):
    """Build annotation records for the paragraphs of one speech.

    For every paragraph in paragraph_ids whose text is available, each row
    of map_contents with that "Content_Source_ID" is examined: the values
    of "Content_Concept_1", "Content_Concept_2" and
    "Content_Relation_Explanation" are searched for in the paragraph text
    with find_phrase(). A record is produced when the explanation is found
    and neither concept is NaN. Its "label" list holds
    [start, end, column name] spans for the phrases that were found, and a
    concept that was not found is stored under "missing concept 1" or
    "missing concept 2".

    speech_id: id of the speech, used as prefix of the paragraph text keys.
    paragraph_texts: dict mapping "<speech_id> <title>" to paragraph text.
    paragraph_ids: dict mapping paragraph source ids to paragraph titles.
    map_contents: DataFrame with the columns named above.
    misses: dict of counters updated in place, one increment per row:
        "all" when all three phrases were found, "both" when only the
        explanation is missing, otherwise a counter naming the first
        missing item.

    Returns the list of annotation dicts.
    """
    annotations = []
    for source_id, paragraph_title in paragraph_ids.items():
        text_key = f"{speech_id} {paragraph_title}"
        if text_key not in paragraph_texts:
            continue
        paragraph_text = paragraph_texts[text_key]
        relations = map_contents[map_contents["Content_Source_ID"] == source_id]
        for _, row in relations.iterrows():
            concept_1 = row["Content_Concept_1"]
            concept_2 = row["Content_Concept_2"]
            explanation = row["Content_Relation_Explanation"]
            concept_1_start = find_phrase(paragraph_text, concept_1)
            concept_2_start = find_phrase(paragraph_text, concept_2)
            explanation_start = find_phrase(paragraph_text, explanation)
            found_1 = concept_1_start is not None
            found_2 = concept_2_start is not None
            found_explanation = explanation_start is not None

            if found_explanation and not pd.isna(concept_1) and not pd.isna(concept_2):
                # Spans are listed as: concept 1, explanation, concept 2.
                spans = []
                if found_1:
                    spans.append([concept_1_start, concept_1_start + len(concept_1), "Content_Concept_1"])
                spans.append([explanation_start, explanation_start + len(explanation), "Content_Relation_Explanation"])
                if found_2:
                    spans.append([concept_2_start, concept_2_start + len(concept_2), "Content_Concept_2"])
                annotation = {
                    "text": paragraph_text,
                    "label": spans,
                    "source_id": source_id,
                    "speech_id": speech_id,
                    "paragraph_id": paragraph_title,
                }
                if not found_1:
                    annotation["missing concept 1"] = concept_1
                if not found_2:
                    annotation["missing concept 2"] = concept_2
                annotations.append(annotation)

            # Exactly one counter is incremented per row.
            if found_1 and found_2:
                counter = "all" if found_explanation else "both"
            elif not found_explanation:
                if pd.isna(explanation):
                    counter = "Content_Relation_Explanation NaN"
                else:
                    counter = "Content_Relation_Explanation"
            elif not found_1:
                counter = "Content_Concept_1"
            else:
                counter = "Content_Concept_2"
            misses[counter] += 1
    return annotations

In [162]:
def read_data(speeches, speech_contents, map_contents, text_directory):
    """Build annotations for all English speech files in a directory.

    Every file in text_directory is matched to a speech with
    get_speech_id(); files without a match and files whose guessed
    language is not "en" are skipped with a message. For the remaining
    files the labelled paragraphs are read and converted with
    make_doccano_data(). A summary and the miss counters are printed at
    the end.

    speeches: table passed to get_speech_id().
    speech_contents: table passed to get_paragraph_ids().
    map_contents: table passed to check_paragraphs() and make_doccano_data().
    text_directory: directory containing the speech text files.

    Returns the list of annotation dicts of all processed files.
    """
    nbr_of_files = 0
    nbr_of_skipped = 0
    annotations = []
    misses = { "Content_Concept_1": 0, "Content_Concept_2": 0, "Content_Relation_Explanation":0, "Content_Relation_Explanation NaN": 0, "all": 0, "both": 0 }
    for file_name in os.listdir(text_directory):
        speech_id = get_speech_id(file_name, speeches)
        if speech_id is None:
            print(f"skipping file {file_name}")
            nbr_of_skipped += 1
            continue
        paragraph_ids = get_paragraph_ids(speech_id, speech_contents)
        paragraph_values = check_paragraphs(speech_id, paragraph_ids, map_contents, file_name)
        # Read from the directory that was listed rather than a hard-coded
        # "txt/" path, so text_directory is honoured for reading as well.
        paragraph_list = read_paragraphs(os.path.join(text_directory, file_name))
        paragraph_texts = select_paragraphs(paragraph_list, paragraph_values, speech_id)
        language = guess_language(paragraph_texts)
        if language != "en":
            print(f"skipping file in language {language}: {file_name}")
            nbr_of_skipped += 1
            continue
        annotations.extend(make_doccano_data(speech_id, paragraph_texts, paragraph_ids, map_contents, misses))
        # select_paragraphs() adds file-only paragraphs to paragraph_values,
        # so a difference means the tables reference paragraphs that are
        # missing from the file.
        if len(paragraph_texts) != len(paragraph_values):
            print(colored(f"warning: mismatch meta data ({len(paragraph_values)}) vs file ({len(paragraph_texts)}) for file {file_name}", "red"))
        nbr_of_files += 1
    plural = "" if nbr_of_skipped == 1 else "s"
    print(f"read {nbr_of_files} files; skipped {nbr_of_skipped} file{plural}")
    print(misses)
    return annotations

In [87]:
def write_annotations(annotations, file_name):
    """Write annotations to a file as JSON Lines.

    annotations: iterable of JSON-serialisable objects.
    file_name: path of the output file; an existing file is overwritten.

    Each annotation is written as one JSON document on its own line.
    """
    # The context manager closes the file even when serialisation fails.
    with open(file_name, "w") as out_file:
        for annotation in annotations:
            print(json.dumps(annotation), file=out_file)

In [14]:
# Speech table; get_speech_id() reads its "Speech_Identifier" and "Speech_ID" columns.
speeches = read_data_file("csv/Speeches-20210520.txt")

In [15]:
# Relation table; read via its "Content_Speech_ID", "Content_Source_ID",
# "Content_Concept_1", "Content_Concept_2" and "Content_Relation_Explanation" columns.
map_contents = read_data_file("csv/Map_Contents-20200726.csv")

In [182]:
# NOTE(review): speech_contents is not assigned in any cell shown here; it must be
# loaded (presumably with read_data_file) before this cell runs, otherwise this
# call raises NameError.
annotations = read_data(speeches, speech_contents, map_contents, text_directory="./txt")

skipping file in language de: 2015-01-19 Merkel Bundesregerung ann g.txt
skipping file in language de: 2013-11-21 Merkel Bundesregerung ann g.txt
skipping file in language fr: 2013-04-17 Hollande SFM2020 ann fr.txt
skipping file in language fr: 2009-12-01 Sarkozy Elysee (Economy) ann fr.txt
skipping file in language nl: 2011-10-28 Knot dnb_01 ANN NL.txt
skipping file placeholder.txt
skipping file in language fr: 2009-12-14 Sarkozy Elysee (Economy) ann fr.txt
skipping file in language fr: 2010-04-20 Barroso European Commission ann fr.txt
skipping file in language nl: 2011-09-27 Rutte Rijksoverheid ann.txt
skipping file in language fr: 2013-02-19 Hollande SFM2020 ann fr.txt
skipping file in language fr: 2012-08-30 Hollande SFM2020 ann fr.txt
skipping file in language de: 2014-02-27 Merkel Bundesregerung ann g.txt
skipping file in language fr: 2011-01-13 Sarkozy gb ann.txt
skipping file in language nl: 2011-04-06 Rutte FD evenement ann NL.txt
skipping file in language de: 2012-01-06 Rutte

In [183]:
# write_annotations() emits one JSON object per line, despite the ".json" extension.
write_annotations(annotations, "annotations.json")