In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
from nltk import sent_tokenize, word_tokenize

In [2]:
# Run from the project root so the `lib` package imported in the next cell resolves.
# Guarded on the presence of `lib` so that re-running this cell does not climb
# two more directories on every execution.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.paths import Paths
from lib.data_tools.utils import data_preprocessing

In [4]:
def plot(fig, name, row, col, data=None):
    """Add one box trace per score (1-6) for column ``name`` to a subplot cell.

    Parameters
    ----------
    fig
        Figure with a subplot grid (e.g. from ``make_subplots``); modified in place.
    name : str
        Column whose values are drawn on the x-axis; also used as the x-axis title.
    row, col : int
        1-based position of the target subplot.
    data : pandas.DataFrame, optional
        Frame containing a ``"score"`` column and the ``name`` column. When
        omitted, the notebook-level ``df`` is looked up at call time, which
        keeps existing ``plot(fig, name, row, col)`` calls working.
    """
    # Explicit frame wins; otherwise fall back to the notebook global.
    frame = df if data is None else data

    # Fixed colour per score so a score looks the same in every subplot.
    palette = {
        1: "blue",
        2: "red",
        3: "green",
        4: "purple",
        5: "orange",
        6: "pink",
    }
    for score, shade in palette.items():
        fig.add_trace(
            go.Box(
                x=frame.loc[frame["score"] == score, name],
                name=score,
                legendgroup=score,
                marker=dict(color=shade),
                showlegend=False,
            ),
            row=row,
            col=col,
        )

    fig.update_yaxes(title_text="Score", row=row, col=col)
    fig.update_xaxes(title_text=name, row=row, col=col)

In [5]:
def plot_all(prefix):
    """Show a 2x2 grid of per-score box plots for the ``{prefix}_*`` columns.

    The four panels are ``{prefix}_sum``, ``{prefix}_max``, ``{prefix}_min``
    and ``{prefix}_mean`` (in that order, filled row by row); each panel is
    drawn by ``plot``.
    """
    names = [f"{prefix}_{suffix}" for suffix in ("sum", "max", "min", "mean")]

    fig = make_subplots(rows=2, cols=2, subplot_titles=names, shared_yaxes=True)

    for position, name in enumerate(names):
        # divmod maps 0..3 onto the (row, col) cells of the 2x2 grid.
        grid_row, grid_col = divmod(position, 2)
        plot(fig, name, grid_row + 1, grid_col + 1)

    legend_style = dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5)
    fig.update_layout(
        title_text=prefix,
        title_x=0.5,
        width=1920 // 2,
        height=1080 // 2,
        legend=legend_style,
        barmode="group",
    )
    fig.show()

In [6]:
# Load only the columns this notebook uses.
train_df = pd.read_csv(
    Paths.TRAIN_CSV_PATH,
    usecols=["essay_id", "full_text", "score"],
)

In [7]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``full_text`` with a ``paragraphs`` column of preprocessed paragraphs.

    Each ``full_text`` value is split on two consecutive newlines and every
    piece is passed through ``data_preprocessing``; the resulting list is
    stored in ``paragraphs``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``full_text`` column of strings. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame without ``full_text`` and with ``paragraphs`` appended as
        the last column.
    """
    paragraphs = df["full_text"].map(
        lambda text: [data_preprocessing(part) for part in text.split("\n\n")]
    )
    # drop() and assign() both return new frames, so the caller's frame is left intact.
    return df.drop(columns=["full_text"]).assign(paragraphs=paragraphs)

In [8]:
# Preprocess a copy so `train_df` keeps the raw essay text (copy() is deep by default).
df = preprocess(train_df.copy())
df.columns

Index(['essay_id', 'score', 'paragraphs'], dtype='object')

In [9]:
def calculate_stats(df, column):
    """Add per-row sum/min/mean/max summaries of a list-valued column.

    For every row, the sequence stored in ``df[column]`` is reduced with
    ``np.sum``, ``np.min``, ``np.mean`` and ``np.max``; the results are written
    to ``{column}_sum``, ``{column}_min``, ``{column}_mean`` and
    ``{column}_max`` (added in that order).

    The frame is modified in place and also returned.
    """
    reducers = (
        ("sum", np.sum),
        ("min", np.min),
        ("mean", np.mean),
        ("max", np.max),
    )
    for suffix, reducer in reducers:
        df[f"{column}_{suffix}"] = df[column].map(reducer)
    return df

## Counts

### Paragraphs

In [10]:
# Number of paragraphs per essay (length of each `paragraphs` list).
df["paragraph_count"] = df["paragraphs"].apply(len)

In [11]:
# Box plot of paragraph count, one box per score.
fig = px.box(df, x="paragraph_count", color="score")
fig.update_layout(
    title=dict(text="Number of Paragraphs", x=0.5),
    width=1920 // 2,
    height=1080 // 2,
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

### Sentences

In [12]:
# For each essay: a list with the number of sentences in every paragraph.
df["paragraph_sentenceCount"] = df["paragraphs"].map(
    lambda paragraphs: [len(sent_tokenize(text)) for text in paragraphs]
)

In [13]:
# Reduce the per-paragraph sentence counts to sum/min/mean/max, then plot each by score.
df = calculate_stats(df, column="paragraph_sentenceCount")
plot_all(prefix="paragraph_sentenceCount")

### Words

In [14]:
# For each essay: a list with the number of tokens `word_tokenize` finds in every paragraph.
df["paragraph_wordCount"] = df["paragraphs"].map(
    lambda paragraphs: [len(word_tokenize(text)) for text in paragraphs]
)

In [15]:
# Reduce the per-paragraph word counts to sum/min/mean/max, then plot each by score.
df = calculate_stats(df, column="paragraph_wordCount")
plot_all(prefix="paragraph_wordCount")

## Lengths

### Paragraph Length

In [23]:
# For each essay: a list with the `len` of every preprocessed paragraph.
df["paragraph_lengths"] = df["paragraphs"].map(
    lambda paragraphs: [len(text) for text in paragraphs]
)

In [25]:
# Reduce the per-paragraph lengths to sum/min/mean/max, then plot each by score.
df = calculate_stats(df, column="paragraph_lengths")
plot_all(prefix="paragraph_lengths")

### Introduction Paragraph Length

In [None]:
# The first paragraph is treated as the introduction.
df["paragraph_introductionLength"] = df["paragraphs"].map(
    lambda paragraphs: len(paragraphs[0])
)

In [None]:
# Box plot of introduction length, one box per score.
fig = px.box(df, x="paragraph_introductionLength", color="score")
fig.update_layout(
    title=dict(text="Introduction Paragraph Length", x=0.5),
    width=1920 // 2,
    height=1080 // 2,
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

### Conclusion Paragraph Length

In [None]:
# The last paragraph is treated as the conclusion (same as the first when there is only one).
df["paragraph_conclusionLength"] = df["paragraphs"].map(
    lambda paragraphs: len(paragraphs[-1])
)

In [None]:
# Box plot of conclusion length, one box per score.
fig = px.box(df, x="paragraph_conclusionLength", color="score")
fig.update_layout(
    title=dict(text="Conclusion Paragraph Length", x=0.5),
    width=1920 // 2,
    height=1080 // 2,
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()

### Body Paragraph Length

In [None]:
# Total length of everything between the first and last paragraph
# (0 for essays with one or two paragraphs, where the slice is empty).
df["paragraph_bodyLength"] = df["paragraphs"].map(
    lambda paragraphs: sum(len(text) for text in paragraphs[1:-1])
)

In [None]:
# Box plot of combined body-paragraph length, one box per score.
fig = px.box(df, x="paragraph_bodyLength", color="score")
fig.update_layout(
    title=dict(text="Body Paragraph Length", x=0.5),
    width=1920 // 2,
    height=1080 // 2,
    legend={"orientation": "h", "yanchor": "top", "xanchor": "center", "y": -0.2, "x": 0.5},
)
fig.show()