# Imports

In [1]:
import pandas as pd
import numpy as np
from IPython.core.display import HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from nltk.tokenize import sent_tokenize
#import matplotlib.pyplot as plt
#import seaborn as sns

# Utils

## Display classes on essay texts

In [2]:
# Display color (CSS color name or hex code) used for each discourse class.
# The key order is also the order of the color legend built by the display helpers,
# which iterate over `CLASSES_COLORS.keys()`.
CLASSES_COLORS = {
    "Lead": "Grey",
    "Position": "YellowGreen",
    "Claim": "#F1C40F",
    "Counterclaim": "#E67E22",
    "Rebuttal": "#873600",
    "Evidence": "#3498DB",
    "Concluding Statement": "Green"
}

The following function prints the essay text (keeping its exact original formatting), using colors to highlight the discourse elements and their classes.

It relies only on `predictionstring`, so it can also be used to display model predictions.

In [3]:
# Silences pandas' chained-assignment (SettingWithCopy) warning; this is a global option,
# so it applies to every cell executed after this one.
pd.options.mode.chained_assignment = None # Disabling some pandas warning

def display_classes(essay_id, train_df):
    """
    Render an essay as HTML, coloring each discourse element according to its class.

    Only the `id`, `predictionstring` and class columns are read, so the function works
    both on the training DataFrame (`discourse_type` column) and on a submission-format
    DataFrame (`class` column).

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `../raw_data/train/{essay_id}.txt`.
    train_df : pd.DataFrame
        Discourse elements. It is not modified (the function works on a copy).

    Returns
    -------
    IPython.core.display.HTML
        Color legend followed by the whole essay text, with every discourse element
        wrapped in a colored `<span>`.
    str
        The string "Formatting failed" when a `predictionstring` references a word
        index that does not exist in the essay.
    """
    # Handling submission format :
    discourse_type = "discourse_type" if "discourse_type" in train_df.columns else "class"

    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    # Character span (start, end) of every whitespace-separated word, so that
    # word_spans[i] locates essay_text.split()[i]. Searching each word from the end of
    # the previous one is exact: only whitespace separates two consecutive words.
    word_spans = []
    cursor = 0
    for word in essay_text.split():
        word_start = essay_text.index(word, cursor)
        cursor = word_start + len(word)
        word_spans.append((word_start, cursor))

    # Work on a copy so the caller's DataFrame is never written to,
    # then make sure discourse elements are in the text order.
    elements_df = train_df[train_df["id"] == essay_id].copy()
    elements_df["prediction_list"] = elements_df["predictionstring"].map(
        lambda x: [int(word_index) for word_index in x.split()]
    )
    elements_df["start_word_index"] = elements_df["prediction_list"].map(lambda x: x[0])
    elements_df = elements_df.sort_values("start_word_index")

    # For each discourse element, copy the unlabelled text that precedes it as is,
    # then highlight the exact part of the essay covered by its words.
    formatted_essay = ""
    end_char = 0
    for _, element in elements_df.iterrows():
        word_indexes = element["prediction_list"]
        if min(word_indexes) < 0 or max(word_indexes) >= len(word_spans):
            return "Formatting failed"
        # Clamping to `end_char` keeps the output free of duplicated text
        # if two elements overlap.
        start_char = max(word_spans[word_indexes[0]][0], end_char)
        formatted_essay += essay_text[end_char:start_char]
        end_char = max(start_char, max(word_spans[word_index][1] for word_index in word_indexes))
        formatted_essay += f"|<span style='color:{CLASSES_COLORS[element[discourse_type]]}'>{essay_text[start_char:end_char]}</span>"
    # Keep the text located after the last discourse element.
    formatted_essay += essay_text[end_char:]

    formatted_essay = formatted_essay.replace("\n", "<br>")
    color_labels = " ".join([
        f"|<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + formatted_essay)

## Evaluation metric

Code adapted from Rob Mulla (@robikscube) (https://www.kaggle.com/robikscube/student-writing-competition-twitch)

Can most probably be optimized (if we have performance issues).

In [54]:
def calc_overlap(row):
    """
    Compute the two overlap ratios between a prediction and a ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, two
    space-separated strings of word indexes.

    Returns `[overlap_1, overlap_2]` where `overlap_1` is the number of shared
    indexes divided by the number of ground-truth indexes and `overlap_2` is the
    same count divided by the number of predicted indexes.
    """
    pred_words = set(row.predictionstring_pred.split(" "))
    gt_words = set(row.predictionstring_gt.split(" "))
    shared = len(gt_words & pred_words)
    return [shared / len(gt_words), shared / len(pred_words)]


def get_scores(pred_df, gt_df):
    """
    Returns precision, recall and f1 scores.

    Parameters
    ----------
    pred_df : pd.DataFrame
        Predictions, with at least the columns `id`, `class` and `predictionstring`.
    gt_df : pd.DataFrame
        Ground truth, with at least the columns `id`, `discourse_type` and
        `predictionstring`.

    Returns
    -------
    dict
        Keys `precision`, `recall` and `f1`. A metric whose denominator is zero
        (e.g. precision when there is no prediction at all) is reported as 0.0.
    """
    zero_scores = {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    if joined.empty:
        # Neither predictions nor ground truths: nothing to score.
        return zero_scores
    # Rows coming from only one side of the outer merge have no counterpart:
    # a blank predictionstring gives them a zero overlap.
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    overlaps = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = overlaps.map(lambda overlap: overlap[0])
    joined["overlap2"] = overlaps.map(lambda overlap: overlap[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer merge leaves NaN in `pred_id` (ground truth without candidate prediction)
    # and in `gt_id` (prediction without candidate ground truth): these NaN are neither
    # predictions nor ground truths, so they are dropped before counting.
    all_pred_ids = set(joined["pred_id"].dropna().unique())
    fp_pred_ids = all_pred_ids - set(tp_pred_ids)

    matched_gt_ids = set(joined.loc[joined["potential_TP"], "gt_id"].unique())
    all_gt_ids = set(joined["gt_id"].dropna().unique())
    unmatched_gt_ids = all_gt_ids - matched_gt_ids

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)

    # return metrics (guarding against empty denominators)
    return {
        "precision": TP / (TP + FP) if (TP + FP) else 0.0,
        "recall": TP / (TP + FN) if (TP + FN) else 0.0,
        "f1": TP / (TP + 0.5 * (FP + FN)) if (TP + FP + FN) else 0.0,
    }


def kaggle_score(pred_df, gt_df, return_details=False):
    """
    Score predictions for the kaggle Student Writing Competition.

    Follows the steps of the evaluation page:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    The scores of `get_scores` are computed separately for every class found in
    `pred_df["class"]` or `gt_df["discourse_type"]`, and the returned score is the
    mean of the per-class f1 values. With `return_details=True` the per-class
    score dictionaries are returned as a second value.
    """
    # The classes are collected dynamically, otherwise the code would break
    # on classes missing from one of the two DataFrames.
    predicted_classes = set(pred_df["class"].unique())
    true_classes = set(gt_df["discourse_type"].unique())

    detailed_scores = {}
    for class_ in predicted_classes | true_classes:
        class_predictions = pred_df.loc[pred_df["class"] == class_].reset_index(drop=True).copy()
        class_truths = gt_df.loc[gt_df["discourse_type"] == class_].reset_index(drop=True).copy()
        detailed_scores[class_] = get_scores(class_predictions, class_truths)

    f1_score = np.mean([scores["f1"] for scores in detailed_scores.values()])
    return (f1_score, detailed_scores) if return_details else f1_score

## Generating predictionstring from discourse_start/end

The following snippet of code was copy pasted from [this Kaggle thread](https://www.kaggle.com/c/feedback-prize-2021/discussion/297591) and is the "official" way `predictionstring` is computed from `discourse_start/end`. It can be useful for models that output a prediction with character index.

In [None]:
def compute_predictionstring(full_text, discourse_start, discourse_end):
    """
    Convert character offsets of a discourse element into a `predictionstring`.

    Parameters
    ----------
    full_text : str
        Whole essay text.
    discourse_start : int
        Character index where the element starts.
    discourse_end : int
        Character index where the element ends (used as an exclusive slice bound).

    Returns
    -------
    str
        Space-separated indexes of the whitespace-separated words covered by
        `full_text[discourse_start:discourse_end]`.
    """
    # Offsets are cast so that float values (e.g. 8.0) can be used as slice bounds.
    char_start = int(discourse_start)
    char_end = int(discourse_end)
    # Index of the first word = number of words located before the element.
    word_start = len(full_text[:char_start].split())
    word_end = word_start + len(full_text[char_start:char_end].split())
    # A start offset falling inside a word counts that word twice: clamp to the essay length.
    word_end = min( word_end, len(full_text.split()) )
    predictionstring = " ".join( [str(x) for x in range(word_start,word_end)] )
    return predictionstring

# Exploration

## Loading data

In [6]:
# Load the discourse annotations, forcing integer dtypes for the element id
# and for the character offsets.
train_df = pd.read_csv("../raw_data/train.csv", dtype = {"discourse_id": int, "discourse_start": int, "discourse_end": int})

In [7]:
train_df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


## Class balance

The classes are strongly imbalanced: `Claim` and `Evidence` together account for about two thirds of the discourse elements.

In [8]:
train_df["discourse_type"].value_counts()/train_df.shape[0]

Claim                   0.347959
Evidence                0.316731
Position                0.106859
Concluding Statement    0.093594
Lead                    0.064487
Counterclaim            0.040314
Rebuttal                0.030057
Name: discourse_type, dtype: float64

## Are discourse elements full sentences?

Share of discourse elements that are sentences or groups of sentences (i.e. that start with an uppercase letter and end with a punctuation mark). The real share is higher, because some students forget uppercase letters or final punctuation marks.

In [9]:
# An element counts as "sentence-like" when one of its first two characters is an
# uppercase letter and one of its last two characters is a final punctuation mark.
# Slices are used instead of fixed indexes so that texts shorter than two characters
# do not raise an IndexError.
sentences_elements = [
    any(char.isupper() for char in text[:2])
    and any(char in ".?!" for char in text[-2:])
    for text in train_df["discourse_text"]
]
sentences_elements.count(True)/len(sentences_elements)

0.5753778769586883

Breakdown by discourse class:

In [10]:
# Share of "sentence-like" elements (one of the first two characters is uppercase and
# one of the last two characters is a final punctuation mark) for each discourse class.
sentences_ratio = {}
for class_ in train_df["discourse_type"].unique():
    class_texts = train_df.loc[train_df["discourse_type"] == class_, "discourse_text"]
    # Slices instead of fixed indexes: texts shorter than two characters
    # must not raise an IndexError.
    sentences_elements = [
        any(char.isupper() for char in text[:2])
        and any(char in ".?!" for char in text[-2:])
        for text in class_texts
    ]
    sentences_ratio[class_] = sentences_elements.count(True)/len(sentences_elements)
sentences_ratio

{'Lead': 0.7219774314884471,
 'Position': 0.510798365652766,
 'Evidence': 0.7472539494989279,
 'Claim': 0.43530911408540474,
 'Concluding Statement': 0.5780821917808219,
 'Counterclaim': 0.45212308750214886,
 'Rebuttal': 0.45768964722158173}

## Super Naive Bayesline

Average number of elements of each discourse class per essay (computed over the essays that contain at least one element of that class)

In [11]:
# Count the elements of every (essay, class) pair, then average these counts per class.
classes_ratio = (
    train_df[["id", "discourse_id", "discourse_type"]]
    .groupby(["id", "discourse_type"])
    .count()
    .groupby("discourse_type")
    .mean()
    .squeeze()
)
classes_ratio

discourse_type
Claim                   3.363569
Concluding Statement    1.006484
Counterclaim            1.271198
Evidence                2.939035
Lead                    1.000430
Position                1.003449
Rebuttal                1.205392
Name: discourse_id, dtype: float64

The strategy is to identify the following numbers of discourse classes when we "predict" a new essay.

In [12]:
# Round the per-class averages to whole numbers: these are the numbers of sentences
# that `bayesline_predict` assigns to each class.
classes_ratio_rounded = classes_ratio.round().astype(int)
classes_ratio_rounded

discourse_type
Claim                   3
Concluding Statement    1
Counterclaim            1
Evidence                3
Lead                    1
Position                1
Rebuttal                1
Name: discourse_id, dtype: int64

In [13]:
# Features are the raw texts of the discourse elements, targets are their classes.
X = train_df["discourse_text"]
y = train_df["discourse_type"]

In [14]:
# TF-IDF representation of the element texts; `lowercase=False` keeps the original casing.
vectorizer = TfidfVectorizer(lowercase=False)
X_vec = vectorizer.fit_transform(X)

We use `ComplementNB` which is better suited than `MultinomialNB` for imbalanced classes (according to Sklearn doc).

In [15]:
# Fit the naive Bayes classifier on the TF-IDF features of all discourse elements.
model = ComplementNB()
model.fit(X_vec, y)

ComplementNB()

In [16]:
# Score computed on the very data the model was fitted on (X_vec, y):
# this is a training score, not a held-out estimate.
model.score(X_vec, y)

0.6329620979534697

Prediction :

In [17]:
def bayesline_predict(essay_id):
    """
    Predict the discourse elements of an essay with the naive Bayes baseline.

    The essay `../raw_data/train/{essay_id}.txt` is split into sentences and each
    selected sentence becomes one predicted discourse element. The function relies on
    the notebook-level objects `model`, `vectorizer` and `classes_ratio_rounded`.

    Parameters
    ----------
    essay_id : str
        Essay identifier.

    Returns
    -------
    pd.DataFrame
        Submission-format DataFrame with the columns `id`, `class` and
        `predictionstring` (empty when no sentence is found in the essay).
    """
    submission_columns = ["id", "class", "predictionstring"]

    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    # For each sentence, we compute the probability of belonging to each class, and store this into a Dataframe
    sentences = sent_tokenize(essay_text)
    if not sentences:
        return pd.DataFrame(columns=submission_columns)
    predictions_matrix = pd.DataFrame(
        model.predict_proba(vectorizer.transform(sentences)), columns=model.classes_
    )

    # Now, we assign a defined number of sentences to each class using the numbers in `classes_ratio_rounded`.
    # To do so, we take the sentence with the highest probability of belonging to a class, assign it to this class,
    # and repeat until all classes have the defined number of sentences assigned.
    classes_count = classes_ratio_rounded.copy()
    # Classes without any sentence to assign must not take part in the selection.
    classes_without_quota = [
        class_ for class_ in predictions_matrix.columns if classes_count.get(class_, 0) <= 0
    ]
    predictions_matrix = predictions_matrix.drop(columns=classes_without_quota)

    prediction = []
    sentences_nb = min(classes_count.sum(), len(sentences))
    # `.empty` stops the loop as soon as no sentence or no class is left to pick from.
    while len(prediction) < sentences_nb and not predictions_matrix.empty:
        highest_score_class = predictions_matrix.max().idxmax()
        highest_score_element = predictions_matrix[highest_score_class].idxmax()
        prediction.append((highest_score_element, highest_score_class))
        # A sentence is assigned only once
        predictions_matrix = predictions_matrix.drop(index=highest_score_element)
        classes_count[highest_score_class] -= 1
        if classes_count[highest_score_class] == 0:
            predictions_matrix = predictions_matrix.drop(columns=highest_score_class)

    # To generate the submission DataFrame, we need to match sentences number and words index.
    sentences_words_index = []
    word_index = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        sentences_words_index.append(" ".join([str(i) for i in range(word_index, word_index+sentence_length)]))
        word_index += sentence_length

    # Returning the submission DataFrame
    submission = [[essay_id, element[1], sentences_words_index[element[0]]] for element in prediction]
    return pd.DataFrame(submission, columns=submission_columns)

In [55]:
kaggle_score(bayesline_predict("423A1CA112E2"), train_df[train_df["id"] == "423A1CA112E2"], return_details=True)

(0.0,
 {'Position': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Rebuttal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Evidence': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Concluding Statement': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Claim': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Counterclaim': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
  'Lead': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}})

In [56]:
# Sanity check of the metric: the ground truth of one essay is scored against itself
# (its `discourse_type` is copied into the `class` column expected for predictions).
test = train_df[train_df["id"] == "423A1CA112E2"][["id", "discourse_type", "predictionstring"]].copy()
test["class"] = test["discourse_type"]
kaggle_score(test, train_df[train_df["id"] == "423A1CA112E2"], return_details=True)

(1.0,
 {'Lead': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'Position': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'Evidence': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'Concluding Statement': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'Claim': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}})

In [57]:
display_classes("423A1CA112E2", bayesline_predict("423A1CA112E2"))

In [58]:
display_classes("423A1CA112E2", train_df)

# Unused utils

`display_classes_2` uses the `discourse_text` field of `train_df`. It is generally better than `display_classes_3` (which uses `discourse_start/end`), but when an element is present several times in the essay text, things get messy.

In [None]:
def display_classes_2(essay_id, train_df):
    """
    Render an essay as HTML, locating discourse elements through their `discourse_text`.

    Every occurrence of an element text in the essay is wrapped in a `<span>` colored
    according to the element's `discourse_type`.

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `../raw_data/train/{essay_id}.txt`.
    train_df : pd.DataFrame
        Discourse elements with the columns `id`, `discourse_text` and `discourse_type`.

    Returns
    -------
    IPython.core.display.HTML
        Color legend followed by the formatted essay.
    str
        The string "Formatting failed" when an element text is not found in the essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()
    for _, element in elements_df.iterrows():
        element_text = element["discourse_text"].strip()
        # The stripping above is needed to make sure the replace below works
        if not element_text:
            # Replacing the empty string would insert a span between every character.
            continue
        if element_text not in essay_text:
            return "Formatting failed"
        essay_text = essay_text.replace(
            element_text,
            f"|<span style='color:{CLASSES_COLORS[element['discourse_type']]}'>{element_text}</span>"
        )
    essay_text = essay_text.replace("\n", "<br>")
    color_labels = " - ".join([
        f"<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + essay_text)

In [None]:
def format_discourse(text, type_):
    """
    Wrap `text` in a `<span>` colored with the `CLASSES_COLORS` entry of `type_`.

    Newlines are converted to `<br>`. An empty (or None) `text` gives an empty string.
    A `type_` missing from `CLASSES_COLORS` is formatted with the color `None`.
    """
    if not text:
        return ""
    html_text = text.replace("\n", "<br>")
    return f"|<span style='color:{CLASSES_COLORS.get(type_)}'>{html_text}</span>"

In [None]:
def display_classes_3(essay_id, train_df):
    """
    Render an essay as HTML, locating discourse elements through `discourse_start/end`.

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `../raw_data/train/{essay_id}.txt`.
    train_df : pd.DataFrame
        Discourse elements with the columns `id`, `discourse_start`, `discourse_end`
        and `discourse_type`. Elements are processed in the DataFrame order.

    Returns
    -------
    IPython.core.display.HTML
        Color legend followed by the essay, where discourse elements are colored by
        class and the remaining text is formatted as "Unlabelled".
    """
    elements_df = train_df[train_df["id"] == essay_id]
    with open(f'../raw_data/train/{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()
    formatted_elements = ""
    char_pointer = 0
    for _, element in elements_df.iterrows():
        discourse_start = int(element['discourse_start'])
        # `discourse_end` is an exclusive bound, as in the predictionstring snippet
        # (`full_text[char_start:char_end]`): adding 1 would swallow the next character.
        discourse_end = int(element['discourse_end'])
        unlabelled_element = essay_text[char_pointer:discourse_start]
        discourse_element = essay_text[discourse_start:discourse_end]
        char_pointer = discourse_end
        formatted_elements +=\
            format_discourse(unlabelled_element, "Unlabelled") +\
            format_discourse(discourse_element, element['discourse_type'])
    # The text after the last element goes through the same formatting as the other
    # unlabelled parts, so that its newlines are converted too.
    formatted_elements += format_discourse(essay_text[char_pointer:], "Unlabelled")
    color_labels = " ".join([format_discourse(class_, class_) for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + formatted_elements)

We may be able to simplify this code with the displaCy tool (https://explosion.ai/demos/displacy-ent); see this example:
```python
j = 40
ents = []
for i, row in df[df['id'] == df_f[j][35:-4]].iterrows():
    ents.append({
                    'start': int(row['discourse_start']), 
                     'end': int(row['discourse_end']), 
                     'label': row['discourse_type']
                })
with open(df_f[j], 'r') as file: data = file.read()

doc2 = {
    "text": data,
    "ents": ents,
}
cols = {'Lead': '#dad1f6','Position': '#f9d5de','Claim': '#adcfad','Evidence': '#fbbf9a','Counterclaim': '#bdf2fa','Concluding Statement': '#eea69e','Rebuttal': '#d1f8f4'}
options = {"ents": df.discourse_type.unique().tolist(), "colors": cols}
displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True);
```