# Imports

In [5]:
import html

import pandas as pd
from IPython.core.display import HTML
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
#import matplotlib.pyplot as plt
#import seaborn as sns

# Utils

In [6]:
# Colour (CSS colour name or hex code) used to highlight each discourse class.
# The key order is also the order in which the legend entries are rendered.
CLASSES_COLORS = {
    "Lead": "Grey",
    "Position": "YellowGreen",
    "Claim": "#F1C40F",
    "Counterclaim": "#E67E22",
    "Rebuttal": "#873600",
    "Evidence": "#3498DB",
    "Concluding Statement": "Green"
}

The following function prints the essay text (keeping its exact original formatting), using colors to highlight the discourse elements and their classes.

It uses only `predictionstring`, which makes it suitable for displaying model predictions as well.

In [7]:
# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

def _word_spans(text):
    """Return the (start, end) character offsets of every whitespace-separated word of `text`.

    Entry i describes word i of `text.split()`.
    """
    spans = []
    cursor = 0
    for word in text.split():
        # Only whitespace lies between `cursor` and the next word, and a word contains no
        # whitespace, so the first match from `cursor` is always the word itself.
        start = text.index(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))
    return spans


def display_classes(essay_id, train_df):
    """Render an essay with its discourse elements highlighted by class.

    Only the `id`, `predictionstring` and `discourse_type` columns of `train_df` are used.
    `predictionstring` holds space-separated indices into `essay_text.split()`.

    Args:
        essay_id: identifier of the essay; its text is read from
            `../raw_data/train/{essay_id}.txt`.
        train_df: DataFrame of discourse elements. It is not modified.

    Returns:
        An `HTML` object (colour legend followed by the highlighted essay), or the string
        "Formatting failed" when a word index falls outside the essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()
    word_spans = _word_spans(essay_text)

    # Reduce each discourse element to (first word index, last word index, class) and put
    # the elements in text order.
    elements = []
    for prediction_string, discourse_type in zip(
        elements_df["predictionstring"], elements_df["discourse_type"]
    ):
        word_indices = [int(word_index) for word_index in prediction_string.split()]
        if word_indices:
            elements.append((min(word_indices), max(word_indices), discourse_type))
    elements.sort(key=lambda element: element[:2])

    # Copy the essay piece by piece, wrapping the exact character range of each element in a
    # coloured span. Every piece is HTML-escaped so "<" or "&" in an essay is shown literally.
    formatted_essay = ""
    end_char = 0
    for first_word, last_word, discourse_type in elements:
        if first_word < 0 or last_word >= len(word_spans):
            return "Formatting failed"
        # Clamp to the end of the previous element so overlapping elements never repeat text
        start_char = max(word_spans[first_word][0], end_char)
        formatted_essay += html.escape(essay_text[end_char:start_char], quote=False)
        end_char = max(word_spans[last_word][1], start_char)
        highlighted_text = html.escape(essay_text[start_char:end_char], quote=False)
        formatted_essay += f"|<span style='color:{CLASSES_COLORS[discourse_type]}'>{highlighted_text}</span>"
    # Keep the text located after the last discourse element
    formatted_essay += html.escape(essay_text[end_char:], quote=False)
    formatted_essay = formatted_essay.replace("\n", "<br>")

    color_labels = " ".join([
        f"|<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + formatted_essay)

# Exploration

## Loading data

In [9]:
# Load the annotated discourse elements; the id and character-offset columns are read as integers.
train_df = pd.read_csv("../raw_data/train.csv", dtype = {"discourse_id": int, "discourse_start": int, "discourse_end": int})

In [10]:
# Preview the first rows: one row per discourse element.
train_df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


## Class balance

The classes are strongly imbalanced:

In [11]:
# Share of all discourse elements that belongs to each class
train_df["discourse_type"].value_counts().div(len(train_df))

Claim                   0.347959
Evidence                0.316731
Position                0.106859
Concluding Statement    0.093594
Lead                    0.064487
Counterclaim            0.040314
Rebuttal                0.030057
Name: discourse_type, dtype: float64

## Are discourse elements full sentences?

Share of discourse elements that look like a sentence or a group of sentences (i.e. they start with an uppercase letter and end with a final punctuation mark). The real share is higher, because some students forget uppercase letters or final punctuation.

In [12]:
# An element counts as sentence-like when one of its first two characters is uppercase and
# one of its last two characters is a final punctuation mark.
# Slices are used instead of indexing so that texts shorter than two characters are simply
# classified instead of raising IndexError.
sentences_elements = [
    any(char.isupper() for char in text[:2])
    and any(char in ".?!" for char in text[-2:])
    for text in train_df["discourse_text"]
]
sentences_elements.count(True)/len(sentences_elements)

0.5753778769586883

Breakdown by discourse class:

In [13]:
# Same sentence-like test as for the whole dataset, computed separately for each class.
# Slices (rather than indexing) keep very short texts from raising IndexError, and the
# per-class flags get their own name so the dataset-wide `sentences_elements` is not overwritten.
sentences_ratio = {}
for class_ in train_df["discourse_type"].unique():
    class_texts = train_df.loc[train_df["discourse_type"] == class_, "discourse_text"]
    class_flags = [
        any(char.isupper() for char in text[:2])
        and any(char in ".?!" for char in text[-2:])
        for text in class_texts
    ]
    sentences_ratio[class_] = class_flags.count(True)/len(class_flags)
sentences_ratio

{'Lead': 0.7219774314884471,
 'Position': 0.510798365652766,
 'Evidence': 0.7472539494989279,
 'Claim': 0.43530911408540474,
 'Concluding Statement': 0.5780821917808219,
 'Counterclaim': 0.45212308750214886,
 'Rebuttal': 0.45768964722158173}

## Simplest baseline : tf-idf + statistics

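Average number of elements of each discourse class per essay (each average is taken over the essays that contain at least one element of that class):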

In [14]:
# Count the elements of each class inside each essay, then average those counts per class.
# Only (essay, class) pairs that actually occur produce a count, so each mean is taken over
# the essays containing that class at least once.
classes_ratio = (
    train_df
    .groupby(["id", "discourse_type"])["discourse_id"]
    .count()
    .groupby(level="discourse_type")
    .mean()
    .squeeze()
)
classes_ratio

discourse_type
Claim                   3.363569
Concluding Statement    1.006484
Counterclaim            1.271198
Evidence                2.939035
Lead                    1.000430
Position                1.003449
Rebuttal                1.205392
Name: discourse_id, dtype: float64

The strategy is to label, in each new essay we "predict", the following number of discourse elements per class.

In [15]:
# Round the per-class averages to whole numbers: how many elements of each class to predict.
classes_ratio_rounded = classes_ratio.round().astype(int)
classes_ratio_rounded

discourse_type
Claim                   3
Concluding Statement    1
Counterclaim            1
Evidence                3
Lead                    1
Position                1
Rebuttal                1
Name: discourse_id, dtype: int64

In [16]:
# Features: the text of each discourse element. Target: its discourse class.
X = train_df["discourse_text"]
y = train_df["discourse_type"]

In [17]:
# Build the TF-IDF vocabulary without lowercasing, so the original casing of words is kept.
vectorizer = TfidfVectorizer(lowercase=False)
X_vec = vectorizer.fit_transform(X)

We use `ComplementNB`, which is better suited than `MultinomialNB` to imbalanced classes (according to the scikit-learn documentation).

In [18]:
# Fit the classifier on the TF-IDF features of every annotated discourse element.
model = ComplementNB()
model.fit(X_vec, y)

ComplementNB()

In [19]:
# Score on the very data the model was fitted on (there is no held-out set here), so this
# is an optimistic figure rather than an estimate of generalization.
model.score(X_vec, y)

0.6329620979534697

Prediction :

For each sentence, we compute the probability of belonging to each class, and store this into a Dataframe.

In [20]:
# Read one training essay and split it into sentences.
# The context manager closes the file as soon as its content has been read.
with open('../raw_data/train/423A1CA112E2.txt') as essay_file:
    essay_text = essay_file.read()
sentences = sent_tokenize(essay_text)

In [21]:
# One row per sentence of the essay, one column per class known to the model
predictions_matrix = pd.DataFrame(
    data=model.predict_proba(vectorizer.transform(sentences)),
    columns=model.classes_,
)

In [22]:
# Rows are sentences (in essay order), columns are the per-class probabilities.
predictions_matrix

Unnamed: 0,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal
0,0.131756,0.128818,0.134072,0.169805,0.165532,0.135662,0.134355
1,0.121082,0.13931,0.130993,0.199631,0.15459,0.119632,0.134763
2,0.139663,0.137054,0.135748,0.174339,0.138454,0.13859,0.136152
3,0.149383,0.136265,0.136745,0.178759,0.133124,0.128224,0.1375
4,0.127726,0.126587,0.142706,0.173493,0.139626,0.153853,0.136008
5,0.09777,0.118706,0.127384,0.273059,0.135602,0.117683,0.129797
6,0.126523,0.11662,0.123425,0.283009,0.121389,0.106709,0.122325
7,0.179811,0.14103,0.137283,0.131281,0.140885,0.134098,0.135611
8,0.140098,0.156537,0.138972,0.114874,0.157105,0.154759,0.137655
9,0.162999,0.137049,0.13389,0.150167,0.144898,0.136391,0.134606


Now, we want to assign a certain number of sentences to each class depending on `classes_ratio_rounded`.

In [23]:
# Greedy assignment: repeatedly take the highest remaining (sentence, class) probability,
# give that sentence that class, and retire the class once its quota is used up.
prediction = []
classes_count = classes_ratio_rounded.copy()
# Work on a copy so `predictions_matrix` stays intact and this cell can be re-run as is.
remaining_scores = predictions_matrix.copy()
# A class without any quota left must never be picked, so it is removed before starting.
remaining_scores = remaining_scores.drop(columns=[
    class_ for class_ in remaining_scores.columns if classes_count.get(class_, 0) <= 0
])
sentences_nb = min(classes_count[remaining_scores.columns].sum(), len(remaining_scores))
while len(prediction) < sentences_nb :
    # Class holding the best remaining score, then the sentence reaching that score
    highest_score_class = remaining_scores.max().idxmax()
    highest_score_element = remaining_scores[highest_score_class].idxmax()
    prediction.append((highest_score_element, highest_score_class))
    # A sentence receives a single class
    remaining_scores = remaining_scores.drop(index=highest_score_element)
    classes_count[highest_score_class] -= 1
    if classes_count[highest_score_class] == 0:
        remaining_scores = remaining_scores.drop(columns=highest_score_class)
prediction

[(6, 'Evidence'),
 (5, 'Evidence'),
 (16, 'Evidence'),
 (11, 'Claim'),
 (7, 'Claim'),
 (13, 'Concluding Statement'),
 (0, 'Lead'),
 (26, 'Claim'),
 (14, 'Counterclaim'),
 (8, 'Position'),
 (15, 'Rebuttal')]

# Unused utils

`display_classes_2` uses the `discourse_text` field of `train_df`. It is generally better than `display_classes_3` (which uses `discourse_start`/`discourse_end`), but when an element's text occurs several times in the essay, every occurrence gets highlighted and things get messy.

In [24]:
def display_classes_2(essay_id, train_df):
    """Render an essay with its discourse elements highlighted, matching on `discourse_text`.

    Each element's stripped text is looked up in the essay and every occurrence of it is
    wrapped in a coloured span.

    Args:
        essay_id: identifier of the essay; its text is read from
            `../raw_data/train/{essay_id}.txt`.
        train_df: DataFrame with `id`, `discourse_text` and `discourse_type` columns.

    Returns:
        An `HTML` object (colour legend followed by the highlighted essay), or the string
        "Formatting failed" when an element's text cannot be found in the essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        # Escape once up front so "<" or "&" written in the essay is shown literally
        essay_text = html.escape(essay_file.read(), quote=False)
    for _, element in elements_df.iterrows():
        # Stripping is needed to make sure the replace below works; the element is escaped
        # the same way as the essay so both still match.
        element_text = html.escape(element["discourse_text"].strip(), quote=False)
        if element_text not in essay_text:
            return "Formatting failed"
        essay_text = essay_text.replace(
            element_text,
            f"|<span style='color:{CLASSES_COLORS[element['discourse_type']]}'>{element_text}</span>"
        )
    essay_text = essay_text.replace("\n", "<br>")
    color_labels = " - ".join([
        f"<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + essay_text)

In [25]:
def format_discourse(text, type_, colors=None):
    """Wrap `text` in a span coloured according to its discourse type.

    Args:
        text: text to format; an empty or missing text yields an empty string.
        type_: discourse class used to look up the colour.
        colors: optional mapping from class to CSS colour; defaults to `CLASSES_COLORS`.

    Returns:
        The HTML-escaped text, with line breaks turned into `<br>`, prefixed by "|" and
        wrapped in a span. The span carries no style when `type_` has no colour.
    """
    if not text:
        return ""
    if colors is None:
        colors = CLASSES_COLORS
    text = html.escape(text, quote=False).replace("\n", "<br>")
    color = colors.get(type_)
    if color is None:
        # Unknown type (e.g. unlabelled text): omit the style instead of emitting "color:None"
        return f"|<span>{text}</span>"
    return f"|<span style='color:{color}'>{text}</span>"

In [26]:
def display_classes_3(essay_id, train_df):
    """Render an essay with its discourse elements highlighted, using character offsets.

    The essay is cut according to the `discourse_start` / `discourse_end` columns; the text
    between two elements is formatted with the "Unlabelled" type.

    Args:
        essay_id: identifier of the essay; its text is read from
            `../raw_data/train/{essay_id}.txt`.
        train_df: DataFrame with `id`, `discourse_start`, `discourse_end` and
            `discourse_type` columns.

    Returns:
        An `HTML` object: colour legend followed by the formatted essay.
    """
    # Walk the elements in text order, whatever the row order of `train_df`
    elements_df = train_df[train_df["id"] == essay_id].sort_values("discourse_start", kind="stable")
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()
    formatted_elements = ""
    char_pointer = 0
    for _, element in elements_df.iterrows():
        discourse_start = int(element['discourse_start'])
        unlabelled_element = essay_text[char_pointer:discourse_start]
        # NOTE(review): the "+ 1" treats `discourse_end` as an inclusive offset - confirm against the dataset
        char_pointer = int(element['discourse_end']) + 1
        discourse_element = essay_text[discourse_start:char_pointer]
        formatted_elements += (
            format_discourse(unlabelled_element, "Unlabelled")
            + format_discourse(discourse_element, element['discourse_type'])
        )
    # Text after the last element: escape it and convert its line breaks so it keeps its layout
    formatted_elements += html.escape(essay_text[char_pointer:], quote=False).replace("\n", "<br>")
    color_labels = " ".join([format_discourse(class_, class_) for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + formatted_elements)

We may be able to simplify this code with the displaCy tool (https://explosion.ai/demos/displacy-ent); see this example:
```python
j = 40
ents = []
for i, row in df[df['id'] == df_f[j][35:-4]].iterrows():
    ents.append({
                    'start': int(row['discourse_start']), 
                     'end': int(row['discourse_end']), 
                     'label': row['discourse_type']
                })
with open(df_f[j], 'r') as file: data = file.read()

doc2 = {
    "text": data,
    "ents": ents,
}
cols = {'Lead': '#dad1f6','Position': '#f9d5de','Claim': '#adcfad','Evidence': '#fbbf9a','Counterclaim': '#bdf2fa','Concluding Statement': '#eea69e','Rebuttal': '#d1f8f4'}
options = {"ents": df.discourse_type.unique().tolist(), "colors": cols}
displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True);
```