In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style
from cycler import cycler
from copy import deepcopy, copy
import warnings
import os


# Notebook-wide matplotlib theme, applied in a single update call.
plt.rcParams.update(
    {
        # Canvas: white figure and axes background.
        "figure.facecolor": "#fff",
        "axes.facecolor": "#fff",
        "axes.edgecolor": "#000",
        # Grid: enabled, black and very faint.
        "axes.grid": True,
        "grid.color": "#000",
        "grid.alpha": 0.1,
        # Spines: all four hidden.
        "axes.spines.right": False,
        "axes.spines.left": False,
        "axes.spines.top": False,
        "axes.spines.bottom": False,
        # Axis labels.
        "axes.labelcolor": "#000",
        "axes.labelweight": "normal",
        "axes.labelpad": 5,
        "axes.labelsize": 12.0,
        # Ticks: black labels, zero-length tick marks.
        "xtick.color": "#000",
        "xtick.labelcolor": "#000",
        "xtick.major.size": 0,
        "xtick.major.pad": 5,
        "xtick.major.width": 1,
        "ytick.color": "#000",
        "ytick.labelcolor": "#000",
        "ytick.major.size": 0,
        "ytick.major.width": 1,
        # Titles: large and left-aligned.
        "axes.titlecolor": "#000",
        "axes.titlelocation": "left",
        "axes.titlepad": 20.0,
        "axes.titlesize": 20.0,
        # Fonts.
        "font.style": "normal",
    }
)

# Colour palette reused by the plots in this notebook.
palette = sns.color_palette("mako")

# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
def read_file(path, encoding="utf-8"):
    """Return the entire text content of the file at ``path``.

    Parameters
    ----------
    path : str
        Location of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the result
        does not depend on the platform's locale encoding.

    Returns
    -------
    str
        The full file content.
    """
    with open(path, "r", encoding=encoding) as file:
        return file.read()

def preprocess_data_frame(data_frame, directory):
    """Return a copy of ``data_frame`` with essay file paths and essay texts attached.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing an ``essay_id`` column.
    directory : str
        Folder that holds one ``<essay_id>.txt`` file per essay.

    Returns
    -------
    pandas.DataFrame
        A copy of the input (the original is left untouched) with two added
        columns: ``essay_path`` and ``essay_text``.
    """
    data_frame = data_frame.copy()
    data_frame["essay_path"] = data_frame["essay_id"].apply(
        lambda essay_id: os.path.join(directory, f"{essay_id}.txt")
    )

    # The same essay can appear on many rows, so read each distinct file once
    # and broadcast its text to every row that points at it.
    essay_texts = {
        essay_path: read_file(essay_path)
        for essay_path in data_frame["essay_path"].unique()
    }
    data_frame["essay_text"] = data_frame["essay_path"].map(essay_texts)

    return data_frame

In [None]:
# Training table and the folder holding the training essay text files.
train_path = "../input/feedback-prize-effectiveness/train.csv"
train_directory = "../input/feedback-prize-effectiveness/train"

# Test table and the folder holding the test essay text files.
test_path = "../input/feedback-prize-effectiveness/test.csv"
test_directory = "../input/feedback-prize-effectiveness/test"

# Example submission file.
sample_submission_path = "../input/feedback-prize-effectiveness/sample_submission.csv"

In [None]:
# Load the three CSV tables in one pass: training rows, test rows, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

<img src="https://i.ibb.co/2k9CjjY/Screenshot-37.png" alt="Screenshot-37" border="0">

<h1 style="font-family: Verdana; font-weight: bold;">Overview</h1>

<h2 style="font-family: Verdana; font-weight: bold;">Task</h2>

<a style="font-size: 15px; font-family: Verdana;" href="https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview/description"> Description page says: </a><i style="font-size: 15px; font-family: Verdana;">"The goal of this competition is to classify argumentative elements in student writing as "effective," "adequate," or "ineffective." You will create a model trained on data that is representative of the 6th-12th grade population in the United States in order to minimize bias. Models derived from this competition will help pave the way for students to receive enhanced feedback on their argumentative writing. With automated guidance, students can complete more assignments and ultimately become more confident, proficient writers."</i>

<h2 style="font-family: Verdana; font-weight: bold;">Metric</h2>

<a style="font-size: 15px; font-family: Verdana;" href="https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview/evaluation">Evaluation page says: </a><i style="font-size: 15px; font-family: Verdana;">"Submissions for this track are evaluated using multi-class logarithmic loss. Each row in the dataset has been labeled with one true effectiveness label. For each row, you must submit the predicted probabilities that the product belongs to each quality label."</i>

<p style="font-size: 20px; font-family: Verdana;">
 $ \text{log loss} = -\frac{1}{N} \sum_{i=1}^N \sum_{j=1}^M y_{ij} \log(p_{ij}) $ 
</p>

<p style="font-size: 15px; font-family: Verdana;">
$ N $ is the number of rows in the test set<br>$ M $ is the number of class labels<br> $ log $ is the natural logarithm<br>$ y_{ij} $ is 1 if observation $ i $  is in class $ j $ and 0 otherwise<br>$ p_{ij} $  is the predicted probability that observation $ i $ belongs to class $ j $.
</p>


<i style="font-size: 15px; font-family: Verdana;">"The submitted probabilities for a given discourse element are not required to sum to one: they are rescaled prior to being scored, each row being divided by the row sum. In order to avoid the extremes of the $ log $  function, predicted probabilities are replaced with $ max(min(p, 1-10^{-15}), 10^{-15}) $."</i>

<h2 style="font-family: Verdana; font-weight: bold;">Sample Submission</h2>

In [None]:
# Display the sample submission frame loaded from sample_submission_path.
sample_submission

<p style="font-size: 15px; font-family: Verdana;">The submission requires 4 columns: 1 for the identifier and 3 for the predictions, one per class. Each prediction column holds the predicted probability of that class (rows do not have to sum to one, because they are rescaled before scoring), which is used to compute $ log loss $.</p>

<h1 style="font-family: Verdana; font-weight: bold;">Exploratory Data Analysis</h1>

<h2 style="font-family: Verdana; font-weight: bold;">Data overview</h2>

In [None]:
# Add the essay_path and essay_text columns to both frames, reading essays from the matching folder.
train = preprocess_data_frame(train, train_directory)
test = preprocess_data_frame(test, test_directory)

In [None]:
# Display the training frame after preprocessing.
train

<p style="font-size: 15px; font-family: Verdana;"><b>train.csv</b> - Contains the annotated discourse elements for all essays in the training set.</p>

<ul>
    <li style="font-size: 15px; font-family: Verdana;"><b>discourse_id</b> - ID code for discourse element.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>essay_id</b> - ID code for essay response. This ID code corresponds to the name of the full-text file in the train/ folder.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>discourse_text</b> - Text of discourse element.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>discourse_type</b> - Class label of discourse element.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>discourse_type_num</b> - Enumerated class label of discourse element.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>discourse_effectiveness</b> - Quality rating of discourse element, the target.</li>
</ul>

In [None]:
# Bar chart of how many discourse elements fall into each discourse type.
fig, ax = plt.subplots(figsize=(10, 5))

# sns.color_palette("mako") returns 6 colours by default; size the palette to the
# number of discourse types so that no two bars share a colour.
type_palette = sns.color_palette("mako", n_colors=train["discourse_type"].nunique())

sns.countplot(x="discourse_type", data=train, palette=type_palette, ec="#000", linewidth=1.25, alpha=1, ax=ax, zorder=2)
ax.set_title("Discourse type distribution")
ax.set_xlabel("Discourse type")
ax.set_ylabel("Count")

# Figure.show() is meant for GUI backends and only emits a warning in a notebook;
# plt.show() is the supported way to render the figure.
plt.show()

<ul>
    <li style="font-size: 15px; font-family: Verdana;"><b>Lead</b> - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Position</b> - an opinion or conclusion on the main question.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Claim</b> - a claim that supports the position.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Counterclaim</b> - a claim that refutes another claim or gives an opposing reason to the position.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Rebuttal</b> - a claim that refutes a counterclaim.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Evidence</b> - ideas or examples that support claims, counterclaims, or rebuttals.</li>
    <li style="font-size: 15px; font-family: Verdana;"><b>Concluding Statement</b> - a concluding statement that restates the claims.</li>
</ul>

<h3 style="font-family: Verdana; font-weight: bold;">Insights</h3>

<ul>
    <li style="font-size: 15px; font-family: Verdana;">The distribution is not balanced.</li>
    <li style="font-size: 15px; font-family: Verdana;"><i>Claim</i> and <i>Evidence</i> are the most "popular" types, while <i>Rebuttal</i> and <i>Lead</i> are "unpopular".</li>
</ul>

In [None]:
# Bar chart of how many discourse elements received each effectiveness rating (the target).
fig, ax = plt.subplots(figsize=(10, 5))

sns.countplot(x="discourse_effectiveness", data=train, palette=palette, ec="#000", linewidth=1.25, alpha=1, ax=ax, zorder=2)
ax.set_title("Discourse effectiveness distribution")
ax.set_xlabel("Discourse effectiveness")
ax.set_ylabel("Count")

# Figure.show() is meant for GUI backends and only emits a warning in a notebook;
# plt.show() is the supported way to render the figure.
plt.show()

<h3 style="font-family: Verdana; font-weight: bold;">Insights</h3>

<ul>
    <li style="font-size: 15px; font-family: Verdana;">The distribution is not balanced. We can use Weighted Cross Entropy to partially mitigate that problem.</li>
    <li style="font-size: 15px; font-family: Verdana;">People write <i>Adequate</i> text more often than <i>Ineffective</i> text.</li>
</ul>

In [None]:
# Length is counted in whitespace-separated tokens (words), not characters.
train["essay_text_length"] = train["essay_text"].map(
    lambda essay: len(essay.split())
)
train["discourse_text_length"] = train["discourse_text"].map(
    lambda discourse: len(discourse.split())
)

In [None]:
# Density of essay lengths; essay_text_length counts whitespace-separated words.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(x="essay_text_length", data=train, ec="#000", color=palette[-1], fill=True, alpha=1, ax=ax, zorder=2)
ax.set_title("Essay text length distribution")
# State the unit explicitly: the raw column name does not say words vs. characters.
ax.set_xlabel("Essay length (words)")

# Figure.show() is meant for GUI backends and only emits a warning in a notebook;
# plt.show() is the supported way to render the figure.
plt.show()

<h3 style="font-family: Verdana; font-weight: bold;">Insights</h3>

<ul>
    <li style="font-size: 15px; font-family: Verdana;">The distribution is right-skewed: most of the mass sits on the left side, with a tail extending to the right.</li>
    <li style="font-size: 15px; font-family: Verdana;">There are some outliers with essay lengths above 1200 words.</li>
    <li style="font-size: 15px; font-family: Verdana;">The Longformers return!</li>
</ul>

In [None]:
# Density of discourse element lengths; discourse_text_length counts whitespace-separated words.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(x="discourse_text_length", data=train, ec="#000", color=palette[2], fill=True, alpha=1, ax=ax, zorder=2)
ax.set_title("Discourse text length distribution")
# State the unit explicitly: the raw column name does not say words vs. characters.
ax.set_xlabel("Discourse length (words)")

# Figure.show() is meant for GUI backends and only emits a warning in a notebook;
# plt.show() is the supported way to render the figure.
plt.show()

<h3 style="font-family: Verdana; font-weight: bold;">Insights</h3>

<ul>
    <li style="font-size: 15px; font-family: Verdana;">The distribution is right-skewed: most of the mass sits on the left side.</li>
    <li style="font-size: 15px; font-family: Verdana;">Most discourse elements are between 0 and 100 words long.</li>
    <li style="font-size: 15px; font-family: Verdana;">We observe a long tail.</li>
</ul>