# Imports

In [509]:
import html

import pandas as pd
from IPython.core.display import HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
#import matplotlib.pyplot as plt
#import seaborn as sns

# Utils

In [384]:
# Display color (CSS color name or hex code) for each discourse class.
# The insertion order of this dict is also the order of the color legend
# built by the display helpers, which iterate over its keys.
CLASSES_COLORS = {
    "Lead": "Grey",
    "Position": "YellowGreen",
    "Claim": "#F1C40F",
    "Counterclaim": "#E67E22",
    "Rebuttal": "#873600",
    "Evidence": "#3498DB",
    "Concluding Statement": "Green"
}

In [336]:
def format_discourse(text, type_):
    """Render one essay fragment as a colored HTML ``<span>``.

    Parameters
    ----------
    text : str
        Raw essay fragment. Falsy values (empty string, ``None``) produce
        no markup at all.
    type_ : str
        Discourse class name, used to look up the color in ``CLASSES_COLORS``.

    Returns
    -------
    str
        ``"|<span style='color:...'>...</span>"`` (the leading ``|`` marks the
        start of each fragment), or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Escape *before* inserting <br> so that "<", ">" and "&" typed in an essay
    # are shown literally instead of being parsed as markup.
    body = html.escape(text, quote=False).replace("\n", "<br>")
    # Types without an entry (e.g. "Unlabelled") inherit the surrounding color
    # rather than producing the invalid declaration "color:None".
    color = CLASSES_COLORS.get(type_, "inherit")
    return f"|<span style='color:{color}'>{body}</span>"

In [366]:
def display_classes(essay_id, train_df):
    """Display an essay with each annotated discourse element colored by class.

    The essay is read from ``../raw_data/train/{essay_id}.txt`` and cut using
    the ``discourse_start`` / ``discourse_end`` offsets of the rows of
    ``train_df`` whose ``id`` equals ``essay_id``. Text between two annotated
    elements, and after the last one, is rendered as "Unlabelled".

    Parameters
    ----------
    essay_id : str
        Essay identifier; also the stem of the essay file name.
    train_df : pandas.DataFrame
        Annotations with at least the columns ``id``, ``discourse_start``,
        ``discourse_end`` and ``discourse_type``.
        NOTE(review): rows are assumed to be ordered by ``discourse_start``
        within an essay -- confirm against the data.

    Returns
    -------
    IPython HTML object
        A color legend followed by the colored essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed deterministically.
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    fragments = []
    char_pointer = 0
    for _, element in elements_df.iterrows():
        start = element['discourse_start']
        # Text between the previous element and this one carries no label.
        unlabelled_element = essay_text[char_pointer:start]
        # The slice end is discourse_end + 1, i.e. the character located at
        # discourse_end is kept inside the element.
        char_pointer = element['discourse_end'] + 1
        discourse_element = essay_text[start:char_pointer]
        fragments.append(format_discourse(unlabelled_element, "Unlabelled"))
        fragments.append(format_discourse(discourse_element, element['discourse_type']))
    # The tail goes through the same formatter as every other unlabelled
    # fragment so that its newlines are rendered as <br> too.
    fragments.append(format_discourse(essay_text[char_pointer:], "Unlabelled"))

    formatted_elements = "".join(fragments)
    color_labels = " ".join(format_discourse(class_, class_) for class_ in CLASSES_COLORS)
    return HTML(color_labels + "<br><br>" + formatted_elements)

# Exploration

## Loading data

In [112]:
# Read the annotation table; the id and the two character offsets are parsed
# as integers (the offsets are later used as string slice bounds).
train_df = pd.read_csv(
    "../raw_data/train.csv",
    dtype=dict.fromkeys(["discourse_id", "discourse_start", "discourse_end"], int),
)

In [437]:
# Preview the first rows to check the columns and the parsed dtypes.
train_df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


## Class balance

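The classes are heavily imbalanced: Claim and Evidence together account for about two thirds of all discourse elements, while Counterclaim and Rebuttal are each below 5%.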

In [508]:
# Share of each discourse class among all annotated elements.
train_df["discourse_type"].value_counts() / len(train_df)

Claim                   0.347959
Evidence                0.316731
Position                0.106859
Concluding Statement    0.093594
Lead                    0.064487
Counterclaim            0.040314
Rebuttal                0.030057
Name: discourse_type, dtype: float64

## Are discourse elements full sentences?

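Share of discourse elements that look like full sentences or groups of sentences (i.e. they start with an uppercase letter and end with a terminal punctuation mark, allowing for one stray leading or trailing character). The true share is probably higher, because some students omit capital letters or final punctuation.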

In [453]:
# An element is counted as sentence-like when one of its first two characters
# is uppercase and one of its last two characters is a terminal mark; the
# second position tolerates one stray leading/trailing character.
# Slices (rather than text[1] / text[-2]) keep elements shorter than two
# characters from raising IndexError: they are simply counted as False.
sentences_elements = [
    any(char.isupper() for char in text[:2])
    and any(char in ".?!" for char in text[-2:])
    for text in train_df["discourse_text"]
]
sentences_elements.count(True)/len(sentences_elements)

0.5753778769586883

Breakdown by discourse class:

In [451]:
# Same sentence-like test as for the global figure, computed per class.
sentences_ratio = {}
for class_ in train_df["discourse_type"].unique():
    class_df = train_df.loc[train_df["discourse_type"] == class_, "discourse_text"]
    # Slices (rather than text[1] / text[-2]) keep elements shorter than two
    # characters from raising IndexError: they are simply counted as False.
    sentences_elements = [
        any(char.isupper() for char in text[:2])
        and any(char in ".?!" for char in text[-2:])
        for text in class_df
    ]
    sentences_ratio[class_] = sentences_elements.count(True)/len(sentences_elements)
sentences_ratio

{'Lead': 0.7219774314884471,
 'Position': 0.510798365652766,
 'Evidence': 0.7472539494989279,
 'Claim': 0.43530911408540474,
 'Concluding Statement': 0.5780821917808219,
 'Counterclaim': 0.45212308750214886,
 'Rebuttal': 0.45768964722158173}

## Simplest baseline : tf-idf + statistics

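Average number of discourse elements of each class per essay (the average is taken over the essays that contain at least one element of that class)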

In [481]:
classes_ratio_df = (
    train_df[["id", "discourse_id", "discourse_type"]]
    # number of elements of each class inside each essay
    .groupby(["id", "discourse_type"])
    .count()
    # mean of those per-essay counts, class by class
    .groupby("discourse_type")
    .mean()
    .rename(columns={"discourse_id": "ratio"})
)
classes_ratio_df

Unnamed: 0_level_0,ratio
discourse_type,Unnamed: 1_level_1
Claim,3.363569
Concluding Statement,1.006484
Counterclaim,1.271198
Evidence,2.939035
Lead,1.00043
Position,1.003449
Rebuttal,1.205392


In [486]:
# Features: the raw text of each discourse element.
X = train_df["discourse_text"]
# Target: the discourse class of each element.
y = train_df["discourse_type"]

In [492]:
# TF-IDF with default settings; the vectorizer is fitted on every element of
# train_df (no held-out split is made before fitting).
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

We use `ComplementNB`, which is better suited than `MultinomialNB` to imbalanced classes (according to the scikit-learn documentation).

In [510]:
# Complement Naive Bayes with default hyper-parameters, fitted on the whole
# vectorized training set.
model = ComplementNB()
model.fit(X_vec, y)

ComplementNB()

In [511]:
# NOTE: this scores the model on the very data it was fitted on (X_vec, y),
# so the value is a training score, not a held-out estimate.
# TODO: evaluate on a split that keeps all elements of an essay together.
model.score(X_vec, y)

0.6153867477978835

In [485]:
# Visual check of the annotations on one essay (the first essay shown in the
# train_df.head() preview).
display_classes("423A1CA112E2", train_df)

# Unused utils

`display_classes_2` uses the `discourse_text` field of `train_df`. It is generally better than `display_classes`, but when an element's text appears several times in the essay, every occurrence is replaced and the output gets messy.

In [344]:
def display_classes_2(essay_id, train_df):
    """Display an essay colored by discourse class, matching on element text.

    Unlike ``display_classes``, the elements are located by searching for
    their ``discourse_text`` in the essay rather than by character offsets.
    Every occurrence of an element's text is wrapped, so an element whose
    text appears several times in the essay is colored several times.

    Parameters
    ----------
    essay_id : str
        Essay identifier; also the stem of the essay file name under
        ``../raw_data/train/``.
    train_df : pandas.DataFrame
        Annotations with at least the columns ``id``, ``discourse_text`` and
        ``discourse_type``.

    Returns
    -------
    IPython HTML object or str
        A color legend followed by the colored essay, or the string
        ``"Formatting failed"`` when an element's text cannot be found in the
        essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed deterministically.
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    for _, element in elements_df.iterrows():
        # Stripping is needed to make sure the replace below works.
        element_text = element["discourse_text"].strip()
        if not element_text:
            # str.replace("", x) would insert x between every character.
            continue
        if element_text not in essay_text:
            return "Formatting failed"
        color = CLASSES_COLORS[element['discourse_type']]
        essay_text = essay_text.replace(
            element_text,
            f"|<span style='color:{color}'>{element_text}</span>"
        )

    essay_text = essay_text.replace("\n", "<br>")
    color_labels = " - ".join(
        f"<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS
    )
    return HTML(color_labels + "<br><br>" + essay_text)

We may be able to simplify the code using the displaCy tool (https://explosion.ai/demos/displacy-ent); see this example:
```python
j = 40
ents = []
for i, row in df[df['id'] == df_f[j][35:-4]].iterrows():
    ents.append({
                    'start': int(row['discourse_start']), 
                     'end': int(row['discourse_end']), 
                     'label': row['discourse_type']
                })
with open(df_f[j], 'r') as file: data = file.read()

doc2 = {
    "text": data,
    "ents": ents,
}
cols = {'Lead': '#dad1f6','Position': '#f9d5de','Claim': '#adcfad','Evidence': '#fbbf9a','Counterclaim': '#bdf2fa','Concluding Statement': '#eea69e','Rebuttal': '#d1f8f4'}
options = {"ents": df.discourse_type.unique().tolist(), "colors": cols}
displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True);
```