# Imports

In [73]:
import html

import pandas as pd
from IPython.core.display import HTML
#import matplotlib.pyplot as plt
#import seaborn as sns

# Utils

In [377]:
# CSS color used to render each discourse type. The insertion order matters:
# it is the order in which the legend entries are displayed.
# Claim, Counterclaim and Rebuttal use three shades of orange.
CLASSES_COLORS = dict(
    [
        ("Lead", "Grey"),
        ("Position", "YellowGreen"),
        ("Claim", "#F8C471"),
        ("Counterclaim", "#E67E22"),
        ("Rebuttal", "#D35400"),
        ("Evidence", "Blue"),
        ("Concluding Statement", "Green"),
    ]
)

In [336]:
def format_discourse(text, type_):
    """Wrap ``text`` in an HTML ``<span>`` colored according to its discourse type.

    Parameters
    ----------
    text : str
        Raw text to display. It is HTML-escaped so that characters such as
        ``<`` or ``&`` are shown literally, then every newline is turned
        into ``<br>``.
    type_ : str
        Key of ``CLASSES_COLORS``. Any other value (e.g. ``"Unlabelled"``)
        falls back to the CSS keyword ``inherit``, i.e. the color of the
        surrounding text.

    Returns
    -------
    str
        ``"|<span style='color:...'>...</span>"``, where the literal ``|``
        prefix shows up as a visual separator in front of the segment, or
        ``""`` when ``text`` is empty.
    """
    if not text:
        return ""
    # Escape first: doing it after inserting the <br> tags would escape them too
    body = html.escape(text, quote=False).replace("\n", "<br>")
    color = CLASSES_COLORS.get(type_, "inherit")
    return f"|<span style='color:{color}'>{body}</span>"

In [366]:
def display_classes(essay_id, train_df):
    """Render an essay as HTML with each annotated discourse element colored by type.

    The essay text is read from ``../raw_data/train/{essay_id}.txt`` and sliced
    with the ``discourse_start`` / ``discourse_end`` character offsets of the
    rows of ``train_df`` whose ``id`` equals ``essay_id``. Text lying before,
    between or after the annotated elements is rendered as "Unlabelled".

    Parameters
    ----------
    essay_id : str
        Value of the ``id`` column identifying the essay; also the stem of
        its text file.
    train_df : pandas.DataFrame
        Annotations with at least the columns ``id``, ``discourse_start``,
        ``discourse_end`` and ``discourse_type``. The offsets must be
        integers since they are used as slice bounds. Rows are processed in
        frame order, so they are presumably expected to be sorted by
        ``discourse_start`` -- TODO confirm.

    Returns
    -------
    IPython.core.display.HTML
        A color legend, a blank line, then the formatted essay.

    Raises
    ------
    OSError
        If the essay text file cannot be opened.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed as soon as the text is read
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    segments = []
    char_pointer = 0
    for _, element in elements_df.iterrows():
        unlabelled_element = essay_text[char_pointer:element['discourse_start']]
        # NOTE(review): the element slice runs one character past
        # discourse_end; presumably this swallows the separator that follows
        # the element -- confirm against the dataset's offset convention.
        char_pointer = element['discourse_end'] + 1
        discourse_element = essay_text[element['discourse_start']:char_pointer]
        segments.append(format_discourse(unlabelled_element, "Unlabelled"))
        segments.append(format_discourse(discourse_element, element['discourse_type']))

    # The trailing text is formatted like any other unlabelled span so that
    # its newlines are converted to <br> too
    segments.append(format_discourse(essay_text[char_pointer:], "Unlabelled"))

    color_labels = " ".join(format_discourse(class_, class_) for class_ in CLASSES_COLORS)
    return HTML(color_labels + "<br><br>" + "".join(segments))

# Exploration

In [112]:
# Load the annotations, forcing the identifier and the two offset columns
# (used as slice bounds by the display helpers) to integers
train_df = pd.read_csv(
    "../raw_data/train.csv",
    dtype={
        "discourse_id": int,
        "discourse_start": int,
        "discourse_end": int,
    },
)

In [331]:
# Reproducible random preview of the annotations
train_df.sample(n=20, random_state=42)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
53637,6FD0E82AB64D,1618606092372,498,778,There is also te question of what if the compu...,Evidence,Evidence 1,88 89 90 91 92 93 94 95 96 97 98 99 100 101 10...
5223,FAAC3CC5476F,1622490268014,0,115,Hello today I'm going to be talking about the ...,Lead,Lead 1,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
115556,FCBD4BC9112B,1621358090273,16,467,Missing lecture days can be devastating for a ...,Lead,Lead 1,2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
134628,A868395AEAAB,1617996273243,266,300,"it gives people a new perspective,",Claim,Claim 1,47 48 49 50 51 52
9557,F468E21A6DEE,1622928992790,123,174,There are advantages though to limiting car us...,Position,Position 1,25 26 27 28 29 30 31 32
107574,F8CDAD03758D,1620248430367,198,517,Parents pick up their children from school whe...,Evidence,Evidence 1,34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 4...
139323,7CED441887D8,1617678141596,349,367,or maybe a friend.,Claim,Claim 3,67 68 69 70
82397,E146DF9B10F9,1620703884758,131,172,you wont have to drive the car yourself.,Claim,Claim 1,22 23 24 25 26 27 28 29
5827,BD7D972ED5B5,1622489651919,481,561,"In my opinion, I believe that phones should no...",Position,Position 1,82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
116502,284130677785,1622231746013,4999,5653,To sum it up distance learning is not a wrong ...,Concluding Statement,Concluding Statement 1,864 865 866 867 868 869 870 871 872 873 874 87...


In [378]:
# Visual check of the color-coded rendering on one essay
display_classes(essay_id="FC79A21481C3", train_df=train_df)

# Unused utils

`display_classes_2` uses the `discourse_text` field of `train_df` instead of the character offsets. It is generally better than `display_classes`, but when an element's text occurs several times in the essay, every occurrence gets colored and things get messy.

In [344]:
def display_classes_2(essay_id, train_df):
    """Render an essay as HTML, locating each discourse element by its text.

    Instead of relying on character offsets, every (stripped)
    ``discourse_text`` of the essay is searched in the essay text and wrapped
    in a colored ``<span>``.

    Known limitation: ``str.replace`` substitutes every occurrence, so an
    element whose text appears several times in the essay (or inside markup
    inserted for a previous element) is colored everywhere it appears.

    Parameters
    ----------
    essay_id : str
        Value of the ``id`` column identifying the essay; also the stem of
        its text file under ``../raw_data/train/``.
    train_df : pandas.DataFrame
        Annotations with at least the columns ``id``, ``discourse_text`` and
        ``discourse_type``. Every ``discourse_type`` must be a key of
        ``CLASSES_COLORS``.

    Returns
    -------
    IPython.core.display.HTML or str
        A color legend followed by the formatted essay, or the plain string
        ``"Formatting failed"`` when an element's text cannot be found in the
        essay.

    Raises
    ------
    OSError
        If the essay text file cannot be opened.
    KeyError
        If a ``discourse_type`` is not a key of ``CLASSES_COLORS``.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed as soon as the text is read
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    for _, element in elements_df.iterrows():
        # The stripping is needed to make sure the replace below works
        element_text = element["discourse_text"].strip()
        if not element_text:
            # Replacing "" would insert the markup between every character
            continue
        if element_text not in essay_text:
            return "Formatting failed"
        color = CLASSES_COLORS[element['discourse_type']]
        essay_text = essay_text.replace(
            element_text,
            f"|<span style='color:{color}'>{element_text}</span>"
        )

    essay_text = essay_text.replace("\n", "<br>")
    color_labels = " - ".join(
        f"<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS
    )
    return HTML(color_labels + "<br><br>" + essay_text)