# Imports & constants

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from nltk.tokenize import sent_tokenize
from evalstudent import metrics
from evalstudent import utils
from tqdm import tqdm
from sklearn.model_selection import KFold
%load_ext autoreload
%autoreload 2

In [2]:
# Directory containing one `<essay_id>.txt` file per essay. The trailing slash
# matters: bayesline_predict builds file paths by plain string concatenation.
DATA_PATH = "../../raw_data/train/"

# Exploration

## Loading data

In [3]:
# Explicit 64-bit integers: the platform-dependent `int` alias can resolve to a
# 32-bit type (e.g. NumPy < 2 on Windows), which cannot hold 13-digit discourse ids.
train_df = pd.read_csv(
    "../../raw_data/train.csv",
    dtype={"discourse_id": "int64", "discourse_start": "int64", "discourse_end": "int64"},
)

In [4]:
# Preview the first rows: each row is one annotated discourse element of an essay.
train_df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


## Class balance

The classes are strongly imbalanced:

In [5]:
# Share of each discourse class among all annotated elements.
train_df["discourse_type"].value_counts().div(len(train_df))

Claim                   0.347959
Evidence                0.316731
Position                0.106859
Concluding Statement    0.093594
Lead                    0.064487
Counterclaim            0.040314
Rebuttal                0.030057
Name: discourse_type, dtype: float64

## Are discourse elements full sentences?

Share of discourse elements that look like full sentences or groups of sentences (i.e., they start with an uppercase letter and end with a punctuation mark). The real share is higher, because some students omit the initial capital or the final punctuation mark.

In [6]:
# Heuristic: an element "looks like a sentence" when one of its first two characters
# is uppercase and one of its last two characters is a terminal mark.
# Slices are used instead of text[1] / text[-2] so that elements shorter than two
# characters yield False (or are checked on their single character) rather than
# raising IndexError.
sentences_elements = [
    any(char.isupper() for char in text[:2])
    and any(char in ".?!" for char in text[-2:])
    for text in train_df["discourse_text"]
]
sentences_elements.count(True)/len(sentences_elements)

0.5753778769586883

Breakdown by discourse class:

In [7]:
# Same "looks like a sentence" heuristic as above, computed separately for each
# discourse class. Keys follow the order in which classes first appear in train_df.
sentences_ratio = {}
for class_ in train_df["discourse_type"].unique():
    class_texts = train_df.loc[train_df["discourse_type"] == class_, "discourse_text"]
    # Slices (not text[1] / text[-2]) so very short elements cannot raise IndexError.
    # A dedicated name is used so the overall list from the previous cell is not
    # overwritten when this cell runs.
    class_flags = [
        any(char.isupper() for char in text[:2])
        and any(char in ".?!" for char in text[-2:])
        for text in class_texts
    ]
    sentences_ratio[class_] = class_flags.count(True) / len(class_flags)
sentences_ratio

{'Lead': 0.7219774314884471,
 'Position': 0.510798365652766,
 'Evidence': 0.7472539494989279,
 'Claim': 0.43530911408540474,
 'Concluding Statement': 0.5780821917808219,
 'Counterclaim': 0.45212308750214886,
 'Rebuttal': 0.45768964722158173}

# Modeling: Super Naive Bayesline

## Preliminary calculation

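Average count of each discourse class per essay, computed over the essays that contain that class at least once (essays without a given class do not contribute a zero to the average).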

In [8]:
# Step 1: count the elements of each class inside each essay.
# Step 2: average those per-essay counts class by class.
# Only (essay, class) pairs that actually occur are counted, so an essay lacking a
# class does not pull that class's average down.
classes_ratio = (
    train_df[["id", "discourse_id", "discourse_type"]]
    .groupby(["id", "discourse_type"]).count()
    .groupby("discourse_type").mean()
    .squeeze()  # single remaining column -> Series
)
classes_ratio

discourse_type
Claim                   3.363569
Concluding Statement    1.006484
Counterclaim            1.271198
Evidence                2.939035
Lead                    1.000430
Position                1.003449
Rebuttal                1.205392
Name: discourse_id, dtype: float64

The strategy is to identify the following numbers of discourse classes when we "predict" a new essay.

In [9]:
# Round each per-class average to a whole number of elements; bayesline_predict
# reads this Series as the number of sentences to assign to each class.
classes_ratio_rounded = classes_ratio.round().astype(int)
classes_ratio_rounded

discourse_type
Claim                   3
Concluding Statement    1
Counterclaim            1
Evidence                3
Lead                    1
Position                1
Rebuttal                1
Name: discourse_id, dtype: int64

## Training

We use `ComplementNB`, which is better suited than `MultinomialNB` to imbalanced classes (see the scikit-learn documentation).

In [10]:
def bayesline_train(X, y):
    """Fit the TF-IDF + Complement Naive Bayes baseline.

    Parameters
    ----------
    X : iterable of str
        Training texts.
    y : array-like
        Class label of each text in `X`.

    Returns
    -------
    tuple
        `(model, vectorizer)`: the fitted `ComplementNB` classifier and the fitted
        `TfidfVectorizer` needed to transform new texts before prediction.
    """
    # Case is preserved (lowercase=False), so "The" and "the" are distinct tokens.
    tfidf = TfidfVectorizer(lowercase=False)
    features = tfidf.fit_transform(X)
    classifier = ComplementNB().fit(features, y)
    return classifier, tfidf

## Inferring

In [11]:
def bayesline_predict(essay_id, data_path, model, vectorizer):
    """Predict discourse elements for one essay with the greedy Naive Bayes baseline.

    The essay is split into sentences and every sentence is scored against every
    class. Sentences are then assigned greedily, highest remaining probability
    first, until each class has received the number of sentences given by the
    notebook-level `classes_ratio_rounded` or the essay runs out of sentences.

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `f"{data_path}{essay_id}.txt"`.
    data_path : str
        Directory prefix, concatenated as-is (it must end with a path separator).
    model : object
        Fitted classifier exposing `predict_proba` and `classes_`.
    vectorizer : object
        Fitted vectorizer exposing `transform`.

    Returns
    -------
    pandas.DataFrame
        Columns `id`, `class` and `predictionstring`, one row per assigned sentence,
        in assignment order. Empty when the essay contains no sentence.
    """
    columns = ["id", "class", "predictionstring"]

    # Context manager: the file handle is closed as soon as the text is read,
    # instead of lingering until garbage collection across thousands of essays.
    with open(f'{data_path}{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    # For each sentence, we compute the probability of belonging to each class,
    # and store this into a DataFrame (rows: sentences, columns: classes).
    sentences = sent_tokenize(essay_text)
    if not sentences:
        # Nothing to score; scikit-learn estimators reject a zero-sample batch.
        return pd.DataFrame(columns=columns)
    predictions_matrix = pd.DataFrame(
        model.predict_proba(vectorizer.transform(sentences)),
        columns=model.classes_,
    )

    # Now, we assign a defined number of sentences to each class using the numbers
    # in `classes_ratio_rounded`. We take the sentence with the highest probability
    # of belonging to a class, assign it to this class, and repeat until all classes
    # have the defined number of sentences assigned.
    prediction = []
    classes_count = classes_ratio_rounded.copy()
    sentences_nb = min(classes_count.sum(), len(sentences))
    while len(prediction) < sentences_nb:
        highest_score_class = predictions_matrix.max().idxmax()
        highest_score_element = predictions_matrix[highest_score_class].idxmax()
        prediction.append((highest_score_element, highest_score_class))
        # A sentence can be assigned only once.
        predictions_matrix = predictions_matrix.drop(index=highest_score_element)
        classes_count[highest_score_class] -= 1
        if classes_count[highest_score_class] == 0:
            # Quota reached: the class no longer competes for sentences.
            predictions_matrix = predictions_matrix.drop(columns=highest_score_class)

    # To generate the submission DataFrame, we need to match sentence numbers and
    # word indices: words are whitespace-separated tokens numbered from 0 across
    # the whole essay, in sentence order.
    sentences_words_index = []
    word_index = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        sentences_words_index.append(
            " ".join(str(i) for i in range(word_index, word_index + sentence_length))
        )
        word_index += sentence_length

    # Returning the submission DataFrame
    submission = [
        [essay_id, class_, sentences_words_index[sentence_position]]
        for sentence_position, class_ in prediction
    ]
    return pd.DataFrame(submission, columns=columns)

## Evaluating

In [12]:
def evaluate(test_df, model_prediction_method, data_path, model, vectorizer):
    """Score a prediction method essay by essay.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Ground-truth annotations; must contain an `id` column identifying the essay.
    model_prediction_method : callable
        Called as `model_prediction_method(essay_id, data_path, model, vectorizer)`;
        its result is passed to `metrics.kaggle_score` as the prediction.
    data_path, model, vectorizer
        Forwarded unchanged to `model_prediction_method`.

    Returns
    -------
    tuple
        `(kaggle_scores, score_details)`: a Series of one score per essay indexed by
        essay id, and the per-essay detail frames concatenated under essay-id keys.
    """
    essay_ids = test_df["id"].unique()
    per_essay_scores = []
    per_essay_details = []
    for current_id in tqdm(essay_ids):
        predicted = model_prediction_method(current_id, data_path, model, vectorizer)
        ground_truth = test_df[test_df["id"] == current_id]
        score, detail = metrics.kaggle_score(predicted, ground_truth, return_details=True)
        per_essay_scores.append(score)
        per_essay_details.append(pd.DataFrame(detail))
    return (
        pd.Series(per_essay_scores, index=essay_ids),
        pd.concat(per_essay_details, keys=essay_ids),
    )

In [13]:
# Training on the entire dataset for now
# Training on the entire dataset for now. Any essay from train_df that is scored
# with this model was therefore also part of its training data.
X = train_df["discourse_text"]
y = train_df["discourse_type"]
model, vectorizer = bayesline_train(X,  y)

In [14]:
# Only predicting on a subset
# Only predicting on a subset: the first 10 essay ids, in order of appearance.
ids_subset = train_df["id"].unique()[:10]
# Index by essay id (keeping the `id` column) so `.loc` returns every row of those essays.
subset_ = train_df.set_index("id", drop=False).loc[ids_subset]
evaluate(subset_, bayesline_predict, DATA_PATH, model, vectorizer)

100%|██████████| 10/10 [00:00<00:00, 14.17it/s]


(423A1CA112E2    0.000000
 A8445CABFECE    0.142857
 6B4F7A0165B9    0.000000
 E05C7F5C1156    0.031746
 50B3435E475B    0.190476
 DBF7EB6A9E02    0.238095
 810B70E80E1D    0.231293
 CE98789F502B    0.057143
 A97DE0D49AEA    0.142857
 48D3F4243F0F    0.057143
 dtype: float64,
                         Lead  Position     Claim  Counterclaim  Rebuttal  \
 423A1CA112E2 precision   0.0       0.0  0.000000           0.0       0.0   
              recall      0.0       0.0  0.000000           NaN       NaN   
              f1          0.0       0.0  0.000000           0.0       0.0   
 A8445CABFECE precision   0.0       1.0  0.000000           0.0       0.0   
              recall      NaN       1.0  0.000000           NaN       NaN   
              f1          0.0       1.0  0.000000           0.0       0.0   
 6B4F7A0165B9 precision   0.0       0.0  0.000000           0.0       0.0   
              recall      0.0       0.0  0.000000           0.0       0.0   
              f1          0.0 

Generating submission file on a subset of the data:

In [15]:
# Only submitting a subset
# Only submitting a subset: one prediction DataFrame per essay, then concatenated.
submission = [bayesline_predict(essay_id, DATA_PATH, model, vectorizer) for essay_id in tqdm(ids_subset)]
submission_df = pd.concat(submission)

# Export is disabled; uncomment the next line to write the submission file.
#submission_df.to_csv("submission.csv", index=False)

100%|██████████| 10/10 [00:00<00:00, 52.29it/s]


## Cross-validation

In [35]:
# Fixed seed so the essay shuffle, and therefore the folds and the reported scores,
# are the same on every run of the notebook.
CV_SEED = 42
ids_subset_2 = train_df["id"].unique()
np.random.default_rng(CV_SEED).shuffle(ids_subset_2)

# Cross val on the entire dataset
train_small = train_df.set_index("id", drop=False).loc[ids_subset_2]
X_small = train_small["discourse_text"]
y_small = train_small["discourse_type"]

# Folds are built over essay ids rather than rows, so all the elements of an essay
# land on the same side of each split.
kf = KFold(n_splits=5)
all_scores = []
for train, test in kf.split(ids_subset_2):
    train_ids, test_ids = ids_subset_2[train], ids_subset_2[test]
    X_train, y_train = X_small.loc[train_ids], y_small.loc[train_ids]
    test_df = train_small.loc[test_ids]
    model_cv, vectorizer_cv = bayesline_train(X_train, y_train)
    all_scores.append(evaluate(test_df, bayesline_predict, DATA_PATH, model_cv, vectorizer_cv))

100%|██████████| 3119/3119 [03:22<00:00, 15.42it/s]
100%|██████████| 3119/3119 [03:21<00:00, 15.49it/s]
100%|██████████| 3119/3119 [03:22<00:00, 15.42it/s]
100%|██████████| 3119/3119 [03:18<00:00, 15.68it/s]
100%|██████████| 3118/3118 [03:19<00:00, 15.65it/s]


In [38]:
# Mean per-essay score of each fold; the per-class detail frames are ignored here.
[fold_scores.mean() for fold_scores, _fold_details in all_scores]

[0.1534513670313835,
 0.15258110749419776,
 0.15195740790471243,
 0.14920631059720968,
 0.14989431062165415]

# Unused utils

`display_classes_2` uses the `discourse_text` field of `train_df`. It is generally better than `display_classes_3` (which uses `discourse_start`/`discourse_end`), but when an element appears several times in the essay text, things get messy.

In [18]:
def display_classes_2(essay_id, train_df, data_path='../raw_data/train/'):
    """Render an essay as HTML with each discourse element coloured by its class.

    Elements are located by searching their `discourse_text` in the essay text.

    NOTE(review): relies on `HTML` and `CLASSES_COLORS`, neither of which is defined
    in the visible part of this notebook; confirm where they come from before use.

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `f"{data_path}{essay_id}.txt"`.
    train_df : pandas.DataFrame
        Annotations with `id`, `discourse_text` and `discourse_type` columns.
    data_path : str, optional
        Directory prefix of the essay files, concatenated as-is. The default keeps
        the path this helper originally hard-coded, which differs from the
        notebook's `DATA_PATH`; pass `DATA_PATH` to read from the same place as
        the rest of the notebook.

    Returns
    -------
    HTML or str
        The coloured essay preceded by a colour legend, or the string
        "Formatting failed" when an element's text cannot be found in the essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed right after reading.
    with open(f'{data_path}{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()
    for _, element in elements_df.iterrows():
        element_text = element["discourse_text"].strip()
        # The stripping above is needed to make sure the replace below works
        if element_text not in essay_text:
            return "Formatting failed"
        # str.replace substitutes every occurrence, so an element whose text appears
        # several times in the essay is wrapped at each occurrence.
        essay_text = essay_text.replace(
            element_text,
            f"|<span style='color:{CLASSES_COLORS[element['discourse_type']]}'>{element_text}</span>"
        )
    essay_text = essay_text.replace("\n", "<br>")
    color_labels = " - ".join([
        f"<span style='color:{CLASSES_COLORS[class_]}'>{class_}</span>"
        for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + essay_text)

In [19]:
def format_discourse(text, type_):
    """Wrap `text` in a coloured HTML span for the discourse class `type_`.

    Newlines become `<br>` and the span is prefixed with `|`. The colour is looked
    up with `CLASSES_COLORS.get`, so a class missing from the mapping renders as
    `color:None`. Empty or missing text yields an empty string.
    """
    if not text:
        return ""
    html_text = text.replace("\n", "<br>")
    colour = CLASSES_COLORS.get(type_)
    return f"|<span style='color:{colour}'>{html_text}</span>"

In [20]:
def display_classes_3(essay_id, train_df, data_path='../raw_data/train/'):
    """Render an essay as HTML, colouring elements by their character offsets.

    The essay is walked left to right using each element's `discourse_start` and
    `discourse_end`; text between elements is formatted as "Unlabelled".

    NOTE(review): relies on `HTML` and `CLASSES_COLORS`, neither of which is defined
    in the visible part of this notebook; confirm where they come from before use.

    Parameters
    ----------
    essay_id : str
        Essay identifier; the text is read from `f"{data_path}{essay_id}.txt"`.
    train_df : pandas.DataFrame
        Annotations with `id`, `discourse_start`, `discourse_end` and
        `discourse_type` columns; offsets must be integers usable as slice bounds.
    data_path : str, optional
        Directory prefix of the essay files, concatenated as-is. The default keeps
        the path this helper originally hard-coded, which differs from the
        notebook's `DATA_PATH`; pass `DATA_PATH` to read from the same place as
        the rest of the notebook.

    Returns
    -------
    HTML
        A colour legend followed by the formatted essay.
    """
    elements_df = train_df[train_df["id"] == essay_id]
    # Context manager so the file handle is closed right after reading.
    with open(f'{data_path}{essay_id}.txt') as essay_file:
        essay_text = essay_file.read()

    # Pieces are collected in a list and joined once, rather than growing a string.
    pieces = []
    char_pointer = 0
    for _, element in elements_df.iterrows():
        start = element['discourse_start']
        unlabelled_element = essay_text[char_pointer:start]
        # The character at `discourse_end` is included in the element's slice.
        char_pointer = element['discourse_end'] + 1
        discourse_element = essay_text[start:char_pointer]
        pieces.append(format_discourse(unlabelled_element, "Unlabelled"))
        pieces.append(format_discourse(discourse_element, element['discourse_type']))
    # Text after the last element goes through the same formatter as the other
    # unlabelled spans, so its newlines are also rendered as <br>.
    pieces.append(format_discourse(essay_text[char_pointer:], "Unlabelled"))
    formatted_elements = "".join(pieces)

    color_labels = " ".join([format_discourse(class_, class_) for class_ in CLASSES_COLORS.keys()])
    return HTML(color_labels + "<br><br>" + formatted_elements)