# Import Data from Parser Output

In [4]:
import pandas as pd
# Show full cell contents instead of truncating long text columns.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# Allow up to 1000 rows to be rendered before pandas truncates the display.
pd.set_option("display.max_rows", 1000)

In [5]:
# Read the parser's CSV output into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../../data/parser_output/parser-output-pride-and-prejudice-jane-austen-2019-05-05.csv'
)

In [6]:
from ast import literal_eval

# These two columns come back from the CSV as strings; literal_eval turns each
# cell back into the Python object its text spells out.
# NOTE: this overwrites the columns in place, so re-running the cell on an
# already-converted frame will fail (literal_eval rejects non-string input).
df['tokenized_sent'] = df['tokenized_sent'].map(literal_eval)
df['raw_utter_list'] = df['raw_utter_list'].map(literal_eval)

In [7]:
# Preview the first 100 rows of the parsed frame.
df.head(n=100)

Unnamed: 0.1,Unnamed: 0,para_index,tag,para,num_utterances,raw_utter_list,tokenized_sent,chapter_tag
0,0,0,narrative,By Jane Austen,0,[],[By Jane Austen],
1,1,1,narrative,CONTENTS,0,[],[CONTENTS],
2,2,2,narrative,Chapter 1,0,[],[Chapter 1],Chapter 1
3,3,3,narrative,Chapter 2,0,[],[Chapter 2],Chapter 2
4,4,4,narrative,Chapter 3,0,[],[Chapter 3],Chapter 3
5,5,5,narrative,Chapter 4,0,[],[Chapter 4],Chapter 4
6,6,6,narrative,Chapter 5,0,[],[Chapter 5],Chapter 5
7,7,7,narrative,Chapter 6,0,[],[Chapter 6],Chapter 6
8,8,8,narrative,Chapter 7,0,[],[Chapter 7],Chapter 7
9,9,9,narrative,Chapter 8,0,[],[Chapter 8],Chapter 8


# Heuristic Rule-Based Conversation Miner

In [8]:
def is_utterance(sent):
    """Tell whether `sent` contains a non-zero, even number of double quotes.

    Args:
        sent: The sentence to inspect; it is handed to `get_num_quotes`.

    Returns:
        True when the quote count is even and greater than zero, else False.
    """
    quote_total = get_num_quotes(sent)
    if quote_total == 0:
        # No quotation marks at all: not an utterance.
        return False
    return quote_total % 2 == 0

def get_num_quotes(sent):
    """Count the straight double-quote characters (") in `sent`.

    Args:
        sent: Object exposing a `count` method (a string in this notebook).

    Returns:
        The number of occurrences reported by `sent.count`.
    """
    quote_char = '"'
    total = sent.count(quote_char)
    return total

def iob_tag_df(df):
    """Explode paragraphs into one row per sentence and attach a B/I/O label.

    For every row of `df`, each entry of its `tokenized_sent` value becomes one
    output row labelled:

    * 'B' - the sentence passes `is_utterance` and is the first sentence of
      its paragraph;
    * 'I' - the sentence passes `is_utterance` and is not the first sentence;
    * 'O' - the sentence does not pass `is_utterance`.

    Args:
        df: DataFrame with `tokenized_sent`, `para_index` and `chapter_tag`
            columns.

    Returns:
        A new DataFrame with columns `para_index`, `sent`, `label` and
        `chapter_tag`, one row per sentence, in input order.
    """
    para_ids, sentences, labels, chapters = [], [], [], []

    for row_key in df.index:
        # Fetch the paragraph row once and reuse it for every sentence.
        paragraph = df.loc[row_key]
        for position, sentence in enumerate(paragraph['tokenized_sent']):
            if not is_utterance(sentence):
                tag = 'O'
            elif position == 0:
                tag = 'B'
            else:
                tag = 'I'

            para_ids.append(paragraph['para_index'])
            sentences.append(sentence)
            labels.append(tag)
            chapters.append(paragraph['chapter_tag'])

    df_final = pd.DataFrame()
    df_final['para_index'] = para_ids
    df_final['sent'] = sentences
    df_final['label'] = labels
    df_final['chapter_tag'] = chapters

    return df_final


def _chapter_tags_match(left, right):
    """Return True when two chapter tags denote the same chapter.

    Two missing values (NaN/None) are treated as the same (untagged) chapter;
    a plain `!=` would report NaN as different from itself.
    """
    if pd.isna(left) and pd.isna(right):
        return True
    return left == right


def convo_miner_rb_heuristic(df_final):
    """Promote conversation-opening utterances from 'B' to 'B-START'.

    Rows are scanned in index order. A row labelled 'B' is relabelled
    'B-START' when either:

    * the most recent run of consecutive 'O' rows contains at least three
      rows and has not already triggered a 'B-START', or
    * its `chapter_tag` differs from the chapter of the previous 'B-START'
      row. The very first 'B' row always satisfies this, since no chapter
      has been recorded yet.

    Rows labelled 'I', and 'B' rows that are not promoted, do not reset the
    narrative counter. Consecutiveness of 'O' rows is judged by index
    arithmetic (`previous_index == current_index - 1`), so the frame is
    expected to carry an integer index.

    Args:
        df_final: DataFrame with `label` and `chapter_tag` columns, e.g. the
            output of `iob_tag_df`.

    Returns:
        The same `df_final` object, with its `label` column updated in place.
    """
    min_narrative_run = 3
    # Sentinel for "no conversation start seen yet"; it can never equal a
    # real chapter tag, so the first 'B' row is treated as opening a chapter.
    no_chapter_yet = object()

    start_indices = []
    count_consec_o = 0
    o_index = -1
    last_start_chapter = no_chapter_yet

    # Iterate over the columns directly rather than materialising a row
    # Series per index. Labels are only rewritten after the scan finishes.
    rows = zip(df_final.index, df_final['label'], df_final['chapter_tag'])
    for i, label, chapter_tag in rows:
        if label == 'O':
            # Extend the current narrative run, or start a new one.
            count_consec_o = count_consec_o + 1 if o_index == i - 1 else 1
            o_index = i
        elif label == 'B':
            enough_narrative = count_consec_o >= min_narrative_run
            new_chapter = (
                last_start_chapter is no_chapter_yet
                or not _chapter_tags_match(chapter_tag, last_start_chapter)
            )
            if enough_narrative or new_chapter:
                start_indices.append(i)
                last_start_chapter = chapter_tag
                count_consec_o = 0

    if start_indices:
        df_final.loc[start_indices, 'label'] = 'B-START'

    return df_final
 

In [9]:
# Label every sentence with B/I/O, then mark conversation starts as B-START.
df_final = convo_miner_rb_heuristic(iob_tag_df(df))

In [10]:
# Inspect the labelled sentences with index labels 50 through 150 (inclusive).
df_final.loc[50:150, :]

Unnamed: 0,para_index,sent,label,chapter_tag
50,50,Chapter 49,O,Chapter 49
51,51,Chapter 50,O,Chapter 50
52,52,Chapter 51,O,Chapter 51
53,53,Chapter 52,O,Chapter 52
54,54,Chapter 53,O,Chapter 53
55,55,Chapter 54,O,Chapter 54
56,56,Chapter 55,O,Chapter 55
57,57,Chapter 56,O,Chapter 56
58,58,Chapter 57,O,Chapter 57
59,59,Chapter 58,O,Chapter 58
