## Imports and settings

In [15]:
import ast
import pandas as pd
import re
import os
import csv
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('wordnet')

import spacy
nlp = spacy.load('en_core_web_sm')

pd.options.display.max_colwidth = None

print('DONE!')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anastasiiatodoshchuk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


DONE!


## [1] Data Cleaning and creating DataFrame with sentences using Tokenizer (function)

In [16]:
def data_rewrite(tsv_df):

    words = tsv_df .CHAPTER[1:].values
    text=' '.join(words)

    # text cleaning
    text = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', text)
    text = re.sub(r'\s([’](?:\s|$))', r'\1', text)
    text = re.sub(r'([‘])\s', r'\1', text)
    text = re.sub('\s([‘])', ' "', text)
    text = re.sub('([’])\s', '" ', text)
    text = re.sub('([’])[,]', '",', text)
    text = re.sub('([’])[;]', '";', text)

    # adding annotated event
    data_annotated = pd.DataFrame(sent_tokenize(text), columns=['sent_token'])
    data_annotated = get_events_annotated(tsv_df, data_annotated)

    # removing direct speech
    text_speech = re.sub("\".*?[.,!?;\\-]\"", ' "SPEECH."',  text)

    # creating DataFrame with sentences using tokenizer
    data_speechless = pd.DataFrame(sent_tokenize(text_speech), columns = ['sent_token'])

    return data_annotated, data_speechless


#### [1.1] Getting annotated events from tsv dataframe (sub-function)

In [17]:
def get_events_annotated(tsv_df, data):

    ind = tsv_df[tsv_df.O=='EVENT'].index

    events = []
    for i in range(len(ind)):
        events.append(list(tsv_df.loc[ind[i]-1:ind[i]+2, 'CHAPTER'].values))
    e = [" ".join([word for word in a]) for a in events]
    data['annotated_events'] = None

    for i in range(len(data)):
        sentence = data['sent_token'][i]
        list_of_events=[]
        for event in e:
            if event in sentence:
                list_of_events.append(event.split()[1])
        data.loc[i, 'annotated_events'] = str(list_of_events)

    data['annotated_events'] = data['annotated_events'].apply(ast.literal_eval)

    return data

## [2] Splitting complex sentences into simple sub-sentences (function)

In [18]:
def get_sub_sentences(data):

    data['sub_sentences'] = None
    previous_token = ''
    sub_sentences = []
    sub_sentence = ''


    for row in range(data.shape[0]):

        doc_sent = nlp(data.sent_token[row])
        for token in doc_sent:

            # token is a part of the next sentence
            # CCONJ is 'and', 'but'
            if token.pos_ == 'CCONJ' and previous_token:

                # exammple: ..., and ..."
                if previous_token.pos_ == 'PUNCT':
                    sub_sentences.append(sub_sentence)
                    sub_sentence = ''
                    sub_sentence = sub_sentence + token.text + ' '

            # SCONJ is 'because', 'that', 'so'
            elif token.pos_ == 'SCONJ' and previous_token:

                # token is a part of the current sentence
                if previous_token.pos_ == 'SCONJ':
                    sub_sentence = sub_sentence + token.text + ' '

                # token is a part of the current sentence
                if previous_token.pos_ == 'CCONJ':
                    sub_sentence = sub_sentence + token.text + ' '

                # token is a part of the current sentence (Row 3: "ones upon a time..." case)
                if len(sub_sentence.split())==1:
                    sub_sentence = sub_sentence + token.text + ' '

                # token is a part of the new sentence
                # example> Row 4: long ago THAT Pippi did n't remember her at all
                else:
                    sub_sentences.append(sub_sentence)
                    sub_sentence = ''
                    sub_sentence = sub_sentence + token.text + ' '

            # token is a part of the current sentence
            elif token.text == ';':
                #sub_sentence = sub_sentence + token.text
                sub_sentence = sub_sentence + '.'
                sub_sentences.append(sub_sentence)
                sub_sentence = ''

            else:
                sub_sentence = sub_sentence + token.text + ' '

            previous_token = token

        sub_sentences.append(sub_sentence)
        data.loc[row, 'sub_sentences'] = str(sub_sentences)
        previous_token = ''
        sub_sentences = []
        sub_sentence = ''


    data['sub_sentences'] = data['sub_sentences'].apply(ast.literal_eval)

    data['count_sub'] = None
    for row in range(data.shape[0]):
        data.loc[row, 'count_sub'] = len(data.sub_sentences[row])

    return data

## [3] Applying Q1 (agent), Q2 (action), and Q3(patient) to annotated dataset (function)


In [19]:
# This function is looking for agents, events, and patients on the level of sub-sentences.
def get_agents_events_patients(data):
    agent_roles = ['nsubj', 'nsubjpass']
    verb_roles = ['ROOT', 'conj', 'ccomp', 'advcl', 'relcl']
    patient_roles = ['dobj', 'attr']

    annex_agent_roles = ['conj', 'compound', 'advcl']

    data['agents_in_sent'] = None
    data['events_in_sent'] = None
    data['patients_in_sent'] = None

    agents_in_sent = []
    events_in_sent = []
    patients_in_sent = []
    event = ''
    agent = ''
    patient = None

    final_agents = []
    final_events = []
    final_patients = []

    for row in range(data.shape[0]):
        for subsent in data['sub_sentences'][row]:
            doc = nlp(subsent)
            for word in doc:

                if (word.dep_ in agent_roles) and (word.head.dep_ in verb_roles):

                    agent = word.text
                    event = word.head.text

                    agents_in_sent.append(agent)
                    events_in_sent.append(event)

                    # -- are there a PATIENT for the found AGENT and EVENT pair? --
                    for echild in word.head.children:
                        # -- if there's a patient and if it's the 1st one - add it
                        # -- (it goes in set with main agent and event).
                        if echild.dep_ in patient_roles:
                            patient = echild.text
                            patients_in_sent.append(patient)

                    # -- if no patient was found, than add the None. If it was found, make it None
                    if patient:
                        patient = None
                    else:
                        patients_in_sent.append(patient)

                    # -- different agent-children with the same parent event --
                    for achild in word.children:
                        if achild.dep_ in annex_agent_roles:
                            agent = achild.text
                            agents_in_sent.append(agent)
                            events_in_sent.append(event)

                    # -- different event-children with the same parent agent --
                    agent = word.text
                    for echild in word.head.children:
                        if echild.dep_ in annex_agent_roles:
                            event = echild.text
                            agents_in_sent.append(agent)
                            events_in_sent.append(event)

                    # -- combinations agent-children with event-children --
                    for achild in word.children:
                        if achild.dep_ in annex_agent_roles:
                            agent = achild.text
                            for echild in word.head.children:
                                if echild.dep_ in annex_agent_roles:
                                    event = echild.text
                                    agents_in_sent.append(agent)
                                    events_in_sent.append(event)


            final_agents.append(agents_in_sent)
            final_events.append(events_in_sent)
            final_patients.append(patients_in_sent)

            agents_in_sent = []
            events_in_sent = []
            patients_in_sent = []

        data['agents_in_sent'][row] = final_agents
        data['events_in_sent'][row] = final_events
        data['patients_in_sent'][row] = final_patients

        final_agents = []
        final_events = []
        final_patients = []

    return data

## [4] Evaluation of Q2 action (comparison with the annotated actions)

In [20]:
def evaluation(evaluation_df):

    evaluation_df['TruePositive'] = 0
    evaluation_df['FalseNegative'] = 0
    evaluation_df['FalsePositive'] = 0

    for i in range(evaluation_df.shape[0]):
        evaluation_df['events_in_sent'][i] = [item for sublist in evaluation_df['events_in_sent'][i] for item in sublist]
        evaluation_df['events_in_sent'][i] = list(dict.fromkeys(evaluation_df['events_in_sent'][i]))
    for i in range(evaluation_df.shape[0]):
        if evaluation_df['annotated_events'][i]:
            for true_event in evaluation_df['annotated_events'][i]:
                if true_event in evaluation_df['events_in_sent'][i]:
                    evaluation_df['TruePositive'][i]+=1
                else:
                    evaluation_df['FalseNegative'][i]+=1
        if evaluation_df['events_in_sent'][i]:
            for found_event in evaluation_df['events_in_sent'][i]:
                if found_event not in evaluation_df['annotated_events'][i]:
                    evaluation_df['FalsePositive'][i]+=1

    Precision = 0
    Recall = 0
    F1_score = 0
    TP_score = 0
    FP_score = 0
    FN_score = 0

    for i in range(evaluation_df.shape[0]):
        if evaluation_df['annotated_events'][i] and evaluation_df['events_in_sent'][i]:

            TP_score+= evaluation_df['TruePositive'][i]
            FP_score+= evaluation_df['FalsePositive'][i]
            FN_score+= evaluation_df['FalseNegative'][i]

    # we set F1_score, Precision, and Recall as nones in case they can't be calculated
    # (the case of only one literary story - "The Magnificent Ambersons")
    if TP_score == 0 and FP_score == 0 or TP_score == 0 and FN_score == 0:
        return None, None, None

    Precision = TP_score/ (TP_score + FP_score)
    Recall = TP_score / (TP_score + FN_score)
    F1_score = 2 * (Precision * Recall) / (Precision + Recall)

    return round(F1_score, 4), round(Precision, 4), round(Recall, 4)

#### Additional functions

In [21]:
def add_results(df, name, F_score, Precision, Recall):
    df = df.append({'annotated_file': name, 'F_score': F_score, 'Precision': Precision, 'Recall': Recall}, ignore_index=True)
    return df

def final_result(df, metric):
    sum = df[metric].sum()
    num = df[metric].notnull().sum()

    return round(sum/num, 4)

## [Results] Looping through annotated files and applying functions [1]-[4] + saving results

In [24]:
directory = 'litbank-master/events/tsv/'
res_df = pd.DataFrame(columns=['annotated_file', 'F_score', 'Precision', 'Recall'])
F_score = None
Precision = None
Recall = None
# loop through all the annotated files
for filename in os.listdir(directory):
    if filename.endswith(".tsv"):
        path = os.path.join(directory, filename)

        tsv_df = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
        tsv_df.columns = ['CHAPTER', 'O']

        data_annotated, data_speechless = data_rewrite(tsv_df)
        if data_annotated.shape[0] == data_speechless.shape[0] and not data_annotated[data_annotated["annotated_events"].astype(bool)].empty:
            data_speechless['annotated_events'] = data_annotated['annotated_events']

        data = get_sub_sentences(data_annotated)
        data = get_agents_events_patients(data)
        if 'annotated_events' in data.columns:
            evaluation_df = data.loc[:, ['annotated_events', 'events_in_sent']]
            F_score, Precision, Recall = evaluation(evaluation_df)
        res_df = add_results(res_df, filename, F_score, Precision, Recall)
        F_score = None
        Precision = None
        Recall = None

res_df.to_csv('evaluation_results_LitBank.csv', encoding='utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df['events_in_sent'][i] = [item for sublist in evaluation_df['events_in_sent'][i] for item in sublist]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df['events_in_sent'][i] = list(dict.fromkeys(evaluation_df['events_in_sent'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df['TruePositive'][i]+=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pa

#### Printing the results

In [23]:
results = pd.read_csv('evaluation_results.csv', index_col=None)
print(f'THE AVERAGE F-SCORE IS {final_result(res_df, "F_score")}')
print(f'THE AVERAGE PRECISION IS {final_result(res_df, "Precision")}')
print(f'THE AVERAGE RECALL IS {final_result(res_df, "Recall")}')

THE AVERAGE F-SCORE IS 0.4957
THE AVERAGE PRECISION IS 0.4235
THE AVERAGE RECALL IS 0.613
