In [1]:
import pandas as pd
import os
import plotly.express as px
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# Move two directory levels up from where the notebook was started
# (presumably the project root - the `lib` import and the relative `data/...`
# paths in later cells are resolved from there).
# The starting directory is recorded only once, so re-running this cell keeps
# pointing at the same target instead of climbing two more levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))

In [3]:
from lib.paths import Paths

In [4]:
# Read the grammar feature table from the feature-engineering output folder
# and show its (rows, columns) shape as a quick sanity check.
grammar_features = pd.read_csv(
    filepath_or_buffer="data/feature_engg/grammar_scores.csv",
)
grammar_features.shape

(17307, 29)

In [5]:
# Only the join key (`essay_id`) and the `score` column are read from the
# training CSV; every other column is skipped at load time.
score_df = pd.read_csv(
    filepath_or_buffer=Paths.COMPETITION_TRAIN_CSV_PATH,
    usecols=["essay_id", "score"],
)

In [6]:
# Attach the `score` column to the grammar features by `essay_id`.
# This cell overwrites `grammar_features` with the merged frame, so on a
# re-run the left side would already contain `score`; pandas would then
# rename the clashing columns to `score_x` / `score_y` and every later
# `grammar_features["score"]` lookup would fail. Dropping a leftover `score`
# column first keeps the cell safe to run repeatedly (on the first run the
# drop is a no-op thanks to `errors="ignore"`).
grammar_features = pd.merge(
    left=grammar_features.drop(columns=["score"], errors="ignore"),
    right=score_df,
    on="essay_id",
    how="inner",
)
grammar_features.shape

(17307, 30)

In [7]:
# List the column names currently present in `grammar_features`.
grammar_features.columns

Index(['essay_id', 'grammar_levenshtein_distance_mean',
       'grammar_levenshtein_distance_max', 'grammar_levenshtein_distance_sum',
       'grammar_levenshtein_distance_min',
       'grammar_levenshtein_distance_median', 'grammar_jaccard_distance_mean',
       'grammar_jaccard_distance_max', 'grammar_jaccard_distance_sum',
       'grammar_jaccard_distance_min', 'grammar_jaccard_distance_median',
       'grammar_hamming_distance_mean', 'grammar_hamming_distance_max',
       'grammar_hamming_distance_sum', 'grammar_hamming_distance_min',
       'grammar_hamming_distance_median', 'grammar_cosine_distance_mean',
       'grammar_cosine_distance_max', 'grammar_cosine_distance_sum',
       'grammar_cosine_distance_min', 'grammar_cosine_distance_median',
       'grammar_levenshtein_distance_q1', 'grammar_levenshtein_distance_q3',
       'grammar_jaccard_distance_q1', 'grammar_jaccard_distance_q3',
       'grammar_hamming_distance_q1', 'grammar_hamming_distance_q3',
       'grammar_cosine_distance_q1', 'grammar_cosine_distance_q3', 'score'],
      dtype='object')

In [8]:
def plot(fig, name, row, col):
    """Add one box trace per score value (1-6) for a single feature column.

    The values come from the module-level ``grammar_features`` frame: for
    each score, the rows with that score are selected and their ``name``
    column is passed as the ``x`` data of a ``go.Box`` trace.

    Args:
        fig: Figure with a subplot grid; the traces and axis titles are
            added to it in place.
        name: Column of ``grammar_features`` to plot; also used as the
            x-axis title of the subplot.
        row: Subplot row passed through to plotly (callers use 1-based values).
        col: Subplot column passed through to plotly (callers use 1-based values).
    """
    # Fixed colour per score so a score looks the same in every subplot.
    palette = {
        1: "blue",
        2: "red",
        3: "green",
        4: "purple",
        5: "orange",
        6: "pink",
    }
    score_column = grammar_features["score"]

    for score, shade in palette.items():
        values = grammar_features.loc[score_column == score, name]
        trace = go.Box(
            x=values,
            name=score,
            legendgroup=score,
            marker={"color": shade},
            showlegend=False,
        )
        fig.add_trace(trace, row=row, col=col)

    fig.update_xaxes(title_text=name, row=row, col=col)
    fig.update_yaxes(title_text="Score", row=row, col=col)

In [9]:
def plot_all(postfix):
    """Show a 2x2 grid of per-score box plots, one panel per distance feature.

    Each panel plots the column ``<feature>_<postfix>`` via ``plot``.

    Args:
        postfix: Suffix selecting which variant of every distance feature to
            plot (for example ``"sum"`` or ``"mean"``).
    """
    n_cols = 2
    names = [
        "grammar_levenshtein_distance",
        "grammar_jaccard_distance",
        "grammar_hamming_distance",
        "grammar_cosine_distance",
    ]

    fig = make_subplots(rows=2, cols=n_cols, subplot_titles=names, shared_yaxes=True)

    # Fill the grid left-to-right, top-to-bottom; plotly grid indices start at 1.
    for position, feature in enumerate(names):
        grid_row, grid_col = divmod(position, n_cols)
        plot(fig, f"{feature}_{postfix}", grid_row + 1, grid_col + 1)

    legend_settings = {
        "orientation": "h",
        "yanchor": "top",
        "xanchor": "center",
        "y": -0.2,
        "x": 0.5,
    }
    fig.update_layout(
        title_text=f"Grammar Distances ({postfix})",
        title_x=0.5,
        width=1920,
        height=1080,
        legend=legend_settings,
        barmode="group",
    )
    fig.show()

In [10]:
# Per-score box plots of the `_sum` variant of the four distance features.
plot_all("sum")

In [11]:
# Per-score box plots of the `_max` variant of the four distance features.
plot_all("max")

In [12]:
# Per-score box plots of the `_mean` variant of the four distance features.
plot_all("mean")

In [13]:
# Per-score box plots of the `_median` variant of the four distance features.
plot_all("median")

In [30]:
def plot_all_types(prefix):
    """Show a 3x2 grid of per-score box plots for one distance feature.

    The six panels are the ``sum``, ``max``, ``q1``, ``q3``, ``min`` and
    ``mean`` columns of the feature, each drawn via ``plot``.

    Args:
        prefix: Common column-name prefix of the feature, e.g.
            ``"grammar_levenshtein_distance"``.
    """
    n_cols = 2
    names = [
        f"{prefix}_sum",
        f"{prefix}_max",
        f"{prefix}_q1",
        f"{prefix}_q3",
        f"{prefix}_min",
        f"{prefix}_mean",
    ]

    fig = make_subplots(rows=3, cols=n_cols, subplot_titles=names, shared_yaxes=True)

    # Fill the grid left-to-right, top-to-bottom; plotly grid indices start at 1.
    for position, column in enumerate(names):
        grid_row, grid_col = divmod(position, n_cols)
        plot(fig, column, grid_row + 1, grid_col + 1)

    legend_settings = {
        "orientation": "h",
        "yanchor": "top",
        "xanchor": "center",
        "y": -0.2,
        "x": 0.5,
    }
    fig.update_layout(
        title_text=f"Grammar Distances ({prefix})",
        title_x=0.5,
        width=1920,
        height=1080,
        legend=legend_settings,
        barmode="group",
    )
    fig.show()

In [31]:
# Compare the sum / max / q1 / q3 / min / mean columns of the Levenshtein
# distance feature across scores.
plot_all_types("grammar_levenshtein_distance")