# Imports & constants

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from nltk.tokenize import sent_tokenize
#from evalstudent import metrics
#from evalstudent import utils
from tqdm import tqdm
%load_ext autoreload
%autoreload 2
import os

In [None]:
# Directory (written with a trailing slash) holding the essay .txt files;
# it is passed to `bayesline_predict` when generating the submission.
DATA_PATH = "../input/feedback-prize-2021/test/"

# Exploration

## Loading data

In [None]:
# Load the training annotations, forcing integer dtypes on the discourse id
# and on the start/end offset columns.
train_df = pd.read_csv(
    "../input/feedback-prize-2021/train.csv",
    dtype={
        "discourse_id": int,
        "discourse_start": int,
        "discourse_end": int,
    },
)

In [None]:
# Preview the first rows of the training annotations.
train_df.head()

# Modeling: Super Naive Bayesline

Average count of each discourse class per essay (the average is taken over the essays in which the class appears at least once):

In [None]:
# Count the discourse elements of each type within each essay, then average
# those counts per type. Only (essay, type) pairs present in the data form a
# group, so each mean is taken over the essays that contain that type.
classes_ratio = (
    train_df[["id", "discourse_id", "discourse_type"]]
    .groupby(["id", "discourse_type"])
    .count()
    .groupby("discourse_type")
    .mean()
    .squeeze()
)
classes_ratio

The strategy is to assign the following number of sentences to each discourse class when we "predict" a new essay.

In [None]:
# Round each per-class average to a whole number; these integers are used as
# the per-class sentence quota in `bayesline_predict`.
classes_ratio_rounded = classes_ratio.round().astype(int)
classes_ratio_rounded

In [None]:
# Features are the raw discourse texts; targets are their discourse types.
X, y = train_df["discourse_text"], train_df["discourse_type"]

In [None]:
# Fit the TF-IDF vocabulary on the training discourse texts and vectorize them.
# `lowercase=False` keeps the original casing of the tokens.
vectorizer = TfidfVectorizer(lowercase=False)
X_vec = vectorizer.fit_transform(X)

We use `ComplementNB`, which is better suited than `MultinomialNB` to imbalanced classes (according to the scikit-learn documentation).

In [None]:
# Train the Complement Naive Bayes classifier on the TF-IDF features.
model = ComplementNB()
model.fit(X_vec, y)

In [None]:
# Scored on the very data the model was fitted on, so this is a training
# score, not an estimate of performance on unseen essays.
model.score(X_vec, y)

Prediction:

In [None]:
def bayesline_predict(essay_id, data_path):
    """Predict a discourse class for a fixed number of sentences of one essay.

    The essay is split into sentences; every sentence gets a probability for
    each class from the module-level `model` (after vectorization with the
    module-level `vectorizer`). Sentences are then greedily assigned to
    classes, highest probability first, until every class has received the
    number of sentences given by the module-level `classes_ratio_rounded`
    (or until the essay runs out of sentences).

    Parameters
    ----------
    essay_id : str
        Name of the essay file without its ``.txt`` extension.
    data_path : str
        Directory containing ``<essay_id>.txt`` (with or without a trailing
        path separator).

    Returns
    -------
    pandas.DataFrame
        One row per assigned sentence, with columns ``id`` (the essay id),
        ``class`` (the assigned class) and ``predictionstring`` (the
        space-separated word indices of the sentence). Empty, with the same
        columns, when the essay contains no sentence or no class has a
        positive quota.
    """
    submission_columns = ["id", "class", "predictionstring"]

    # `with` guarantees the file handle is closed; `os.path.join` makes the
    # function independent of a trailing separator in `data_path`.
    with open(os.path.join(data_path, f"{essay_id}.txt")) as essay_file:
        essay_text = essay_file.read()

    sentences = sent_tokenize(essay_text)
    if not sentences:
        # Nothing to classify: skip the vectorizer/model call entirely.
        return pd.DataFrame(columns=submission_columns)

    # For each sentence, we compute the probability of belonging to each class, and store this into a DataFrame
    # (rows: sentence position in `sentences`, columns: class labels).
    predictions_matrix = pd.DataFrame(
        model.predict_proba(vectorizer.transform(sentences)),
        columns=model.classes_,
    )

    # Work on a copy so the module-level quotas are not consumed by this call.
    classes_count = classes_ratio_rounded.copy()

    # A class whose quota is already 0 must never be assigned. Without this,
    # its counter would go to -1 on first use, never hit 0 again, and the
    # class could absorb an unlimited number of sentences.
    exhausted_classes = [
        class_name
        for class_name in predictions_matrix.columns
        if classes_count.get(class_name, 0) <= 0
    ]
    predictions_matrix = predictions_matrix.drop(columns=exhausted_classes)

    # Now, we assign a defined number of sentences to each class using the quotas in `classes_count`.
    # To do so, we take the sentence with the highest probability of belonging to a class, assign it to this class,
    # and repeat until all classes have the defined number of sentences assigned.
    prediction = []
    total_quota = int(classes_count[predictions_matrix.columns].sum())
    sentences_nb = min(total_quota, len(sentences))
    while len(prediction) < sentences_nb:
        highest_score_class = predictions_matrix.max().idxmax()
        highest_score_element = predictions_matrix[highest_score_class].idxmax()
        prediction.append((highest_score_element, highest_score_class))
        # A sentence can only be assigned once.
        predictions_matrix = predictions_matrix.drop(index=highest_score_element)
        classes_count[highest_score_class] -= 1
        if classes_count[highest_score_class] == 0:
            # Quota reached: the class no longer competes for sentences.
            predictions_matrix = predictions_matrix.drop(columns=highest_score_class)

    # To generate the submission DataFrame, we need to match sentences number and words index.
    # Word indices are cumulative whitespace-token positions across the sentences
    # (assumes they line up with the word positions of the whole essay - TODO confirm).
    sentences_words_index = []
    word_index = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        sentences_words_index.append(
            " ".join(str(i) for i in range(word_index, word_index + sentence_length))
        )
        word_index += sentence_length

    # Returning the submission DataFrame
    submission = [
        [essay_id, class_name, sentences_words_index[sentence_position]]
        for sentence_position, class_name in prediction
    ]
    return pd.DataFrame(submission, columns=submission_columns)

Generating submission file on a subset of the data:

In [None]:
# List the essays from DATA_PATH, the same directory that is handed to
# `bayesline_predict`, so the listing and the reads cannot point at
# different locations.
_, _, essay_list = next(os.walk(DATA_PATH))
# Keep only the .txt files and strip the extension to obtain the essay ids;
# sorting makes the row order of the submission reproducible.
essay_ids = sorted(
    os.path.splitext(essay_name)[0]
    for essay_name in essay_list
    if essay_name.endswith(".txt")
)
submission = [bayesline_predict(essay_id, DATA_PATH) for essay_id in tqdm(essay_ids)]
# `ignore_index=True` gives the combined frame a clean 0..n-1 index; the CSV
# is written without the index either way.
submission_df = pd.concat(submission, ignore_index=True)
submission_df.to_csv("submission.csv", index=False)