In [1]:
import pandas as pd
import os
import numpy as np
from nltk import sent_tokenize, word_tokenize
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import re

In [2]:
# Step two directories up (presumably to the project root) so that the `lib`
# package imported in the following cell can be found.
# The guard makes this cell safe to re-run: once the working directory
# already contains `lib`, it no longer climbs further up the tree.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.paths import Paths
from lib.data_tools.feature_engineering import data_preprocessing
from lib.data_tools.word_feature_engineering import (
    noun_words,
    verb_words,
    pronoun_words,
    adj_words,
    adv_words,
    deter_words,
    conj_words,
    numerical_words,
    all_words,
)

In [4]:
# Read the CSV at Paths.TRAIN_CSV_PATH, keeping only the three columns used
# below and stopping after the first 500 rows.
df = pd.read_csv(
    Paths.TRAIN_CSV_PATH,
    usecols=["essay_id", "full_text", "score"],
    nrows=500,
)

In [6]:
def split_sentences(df, drop_full_text=True):
    """Attach a ``sentences`` column containing each row's list of sentences.

    ``full_text`` is overwritten with the result of ``data_preprocessing`` and
    then split with NLTK's ``sent_tokenize``. The frame is modified in place
    and the same object is returned.

    Args:
        df: DataFrame with a ``full_text`` column.
        drop_full_text: When true, ``full_text`` is removed after splitting.

    Returns:
        The DataFrame that was passed in.
    """
    cleaned_text = df["full_text"].map(data_preprocessing)
    df["full_text"] = cleaned_text
    df["sentences"] = cleaned_text.map(sent_tokenize)

    if not drop_full_text:
        return df

    # Remove the column from the caller's frame, not from a copy.
    del df["full_text"]
    return df

In [7]:
# split_sentences modifies the frame it receives, so hand it a deep copy and
# rebind `df` to the result. The trailing expression displays the new shape.
df = split_sentences(df.copy(deep=True))
df.shape

(500, 3)

## Total Sentences

In [8]:
# Number of sentences found in each row's sentence list.
df["sentence_count"] = df["sentences"].map(len)

In [9]:
# Show three random rows. No random_state is passed, so the rows displayed
# differ from run to run.
df.sample(3)

Unnamed: 0,essay_id,score,sentences,sentence_count
239,042dc86,3,"[in the article ""driveless cars are coming"", t...",15
467,07b6457,2,[my argument is about how the landform is just...,16
210,03afc1e,2,[the facial action coding system would be a go...,12


In [10]:
# Box plot of the per-essay sentence count, one box per score.
# The title is set only once, in update_layout(): a `title=` given to px.box
# would be overwritten by `title_text` there and never be displayed.
fig = px.box(
    df,
    x="sentence_count",
    color="score",
)
fig.update_layout(
    height=1080 // 2,
    width=1920 // 2,
    title_x=0.5,  # centre the title horizontally
    title_text="Number of Sentences",
    # Horizontal legend centred underneath the plot area.
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
)
fig.show()

Somewhat useful.

## Mistakes per Sentence (Useful)

In [11]:
def mistakes_in_sentence(sentences):
    """Count, for every sentence, the tokens that are missing from ``all_words``.

    Each sentence is tokenized with ``word_tokenize`` and every token is
    lower-cased before the membership check.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one integer per sentence, in input order.
    """
    counts = []
    for sentence in sentences:
        unknown = 0
        for token in word_tokenize(sentence):
            if token.lower() not in all_words:
                unknown += 1
        counts.append(unknown)
    return counts

In [12]:
# Per-sentence counts of tokens not found in `all_words` (a list per essay) ...
df["sentence_word_mistakes"] = df["sentences"].map(mistakes_in_sentence)
# ... reduced to one number per essay. max()/min() raise ValueError on an
# empty list, so this relies on every essay having at least one sentence.
df["sentence_word_mistakes_sum"] = df["sentence_word_mistakes"].map(sum)
df["sentence_word_mistakes_max"] = df["sentence_word_mistakes"].map(max)
df["sentence_word_mistakes_min"] = df["sentence_word_mistakes"].map(min)
df["sentence_word_mistakes_mean"] = df["sentence_word_mistakes"].map(np.mean)

In [13]:
def plot(fig, name, row, col):
    """Add one box trace per score (1 through 6) for column ``name``.

    The values come from the notebook-level ``df`` and are placed on the
    x axis of the subplot at (``row``, ``col``). Traces are kept out of the
    legend.

    Args:
        fig: Figure created with ``make_subplots``; modified in place.
        name: Column of ``df`` to plot; also used as the x-axis title.
        row: 1-based subplot row.
        col: 1-based subplot column.
    """
    # Position in the tuple + 1 is the score the colour belongs to.
    palette = ("blue", "red", "green", "purple", "orange", "pink")

    for score, shade in enumerate(palette, start=1):
        values = df.loc[df["score"] == score, name]
        trace = go.Box(
            x=values,
            name=score,
            legendgroup=score,
            marker=dict(color=shade),
            showlegend=False,
        )
        fig.add_trace(trace, row=row, col=col)

    fig.update_xaxes(title_text=name, row=row, col=col)
    fig.update_yaxes(title_text="Score", row=row, col=col)

In [14]:
def plot_all(prefix):
    """Show a 2x2 grid of box plots for the four aggregates of ``prefix``.

    The columns ``{prefix}_sum``, ``{prefix}_max``, ``{prefix}_min`` and
    ``{prefix}_mean`` are drawn with ``plot``, filling the grid row by row.

    Args:
        prefix: Common column-name prefix; also used as the figure title.
    """
    names = [f"{prefix}_{stat}" for stat in ("sum", "max", "min", "mean")]

    fig = make_subplots(rows=2, cols=2, subplot_titles=names, shared_yaxes=True)

    for position, name in enumerate(names):
        # divmod yields 0-based (row, col); subplot coordinates are 1-based.
        grid_row, grid_col = divmod(position, 2)
        plot(fig, name, grid_row + 1, grid_col + 1)

    fig.update_layout(
        title_text=prefix,
        title_x=0.5,
        width=1920 // 2,
        height=1080 // 2,
        barmode="group",
        legend=dict(x=0.5, y=-0.2, xanchor="center", yanchor="top", orientation="h"),
    )
    fig.show()

In [15]:
# Box plots of the sum/max/min/mean of per-sentence unknown-token counts, by score.
plot_all("sentence_word_mistakes")

Not really useful

## Part of Speech

### Variety (Not Useful)

In [16]:
def count_unique_pos_per_sentence(sentences):
    """Count how many of the part-of-speech word lists each sentence touches.

    A word list counts once for a sentence as soon as any token of that
    sentence is contained in it, so each result lies between 0 and 8.
    Tokens are compared as-is (no lower-casing here), which presumably relies
    on the text having been normalised upstream - TODO confirm.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one integer per sentence, in input order.
    """
    pos_vocabularies = [
        noun_words,
        verb_words,
        pronoun_words,
        adj_words,
        adv_words,
        deter_words,
        conj_words,
        numerical_words,
    ]

    counts = []
    for sentence in sentences:
        # Tokenize once per sentence rather than once per word list.
        tokens = word_tokenize(sentence)
        counts.append(
            sum(
                any(token in vocabulary for token in tokens)
                for vocabulary in pos_vocabularies
            )
        )

    return counts

In [17]:
# Per-sentence number of distinct word lists hit (a list per essay) ...
df["sentence_unique_POS"] = df["sentences"].map(count_unique_pos_per_sentence)
# ... reduced to one number per essay. max()/min() raise ValueError on an
# empty list, so this relies on every essay having at least one sentence.
df["sentence_unique_POS_sum"] = df["sentence_unique_POS"].map(sum)
df["sentence_unique_POS_max"] = df["sentence_unique_POS"].map(max)
df["sentence_unique_POS_min"] = df["sentence_unique_POS"].map(min)
df["sentence_unique_POS_mean"] = df["sentence_unique_POS"].map(np.mean)

In [18]:
# Box plots of the sum/max/min/mean of the per-sentence word-list variety, by score.
plot_all("sentence_unique_POS")

### Individual Counts (Useful)

In [19]:
def count_type(sentences, type):
    """Count, for every sentence, the tokens contained in ``type``.

    Args:
        sentences: Iterable of sentence strings.
        type: Container of words to test membership against (the parameter
            name shadows the builtin inside this function).

    Returns:
        A list with one integer per sentence, in input order.
    """
    totals = []
    for sentence in sentences:
        hits = 0
        for token in word_tokenize(sentence):
            if token in type:
                hits += 1
        totals.append(hits)
    return totals

# Word lists to count per sentence, keyed by the label used in the column names.
type_dict = {
    "noun": noun_words,
    "pronoun": pronoun_words,
    "verb": verb_words,
    "determiner": deter_words,
    "adjective": adj_words
}

# For every word list: per-sentence counts first, then sum/max/min/mean per essay.
for key, val in type_dict.items():
    # .map() calls the lambda immediately, within this iteration, so it
    # always sees the current `val`.
    df[f"sentence_{key}_count"] = df["sentences"].map(lambda x: count_type(x, val))
    # max()/min() raise ValueError on an empty list, so this relies on every
    # essay having at least one sentence.
    df[f"sentence_{key}_count_sum"] = df[f"sentence_{key}_count"].map(sum)
    df[f"sentence_{key}_count_max"] = df[f"sentence_{key}_count"].map(max)
    df[f"sentence_{key}_count_min"] = df[f"sentence_{key}_count"].map(min)
    df[f"sentence_{key}_count_mean"] = df[f"sentence_{key}_count"].map(np.mean)

In [20]:
# Only the determiner counts are plotted here; pass another
# "sentence_<label>_count" prefix to inspect a different word list.
plot_all("sentence_determiner_count")

Sum of nouns is somewhat useful.

## Punctuation

### Variety (Useful)

In [21]:
def count_unique_punctuation_per_sentence(sentence):
    """Return the number of distinct non-alphanumeric characters in a sentence.

    The sentence is tokenized with ``word_tokenize`` and every character of
    every token is inspected; anything for which ``str.isalnum`` is false is
    treated as punctuation.

    Args:
        sentence: A single sentence string.

    Returns:
        The count of distinct characters that are not alphanumeric.
    """
    distinct_marks = {
        char
        for token in word_tokenize(sentence)
        for char in token
        if not char.isalnum()
    }
    return len(distinct_marks)

In [22]:
def count_unique_punctuations(sentences):
    """Apply ``count_unique_punctuation_per_sentence`` to every sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one integer per sentence, in input order.
    """
    return list(map(count_unique_punctuation_per_sentence, sentences))

In [23]:
# Per-sentence number of distinct non-alphanumeric characters (a list per essay) ...
df["sentence_unique_punctuations"] = df["sentences"].map(count_unique_punctuations)
# ... reduced to one number per essay. max()/min() raise ValueError on an
# empty list, so this relies on every essay having at least one sentence.
df["sentence_unique_punctuations_sum"] = df["sentence_unique_punctuations"].map(sum)
df["sentence_unique_punctuations_max"] = df["sentence_unique_punctuations"].map(max)
df["sentence_unique_punctuations_min"] = df["sentence_unique_punctuations"].map(min)
df["sentence_unique_punctuations_mean"] = df["sentence_unique_punctuations"].map(np.mean)

In [24]:
# Box plots of the sum/max/min/mean of per-sentence distinct punctuation counts, by score.
plot_all("sentence_unique_punctuations")

The sum and max of unique punctuation counts are useful.

### Quotes (Not useful)

In [25]:
def count_quotations_with_regex(sentences):
    """Count quoted spans in each sentence.

    A span is either ``"..."`` or ``'...'``. For single quotes, the opening
    quote must not directly follow a word character and the closing quote
    must not directly precede one. Without that restriction the apostrophes
    of contractions and possessives ("don't ... it's", "dogs' ... cats'")
    pair up and are miscounted as a quotation.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one integer per sentence, in input order.
    """
    double_quoted = re.compile(r'"[^"]*"')
    # Lazy match up to the first apostrophe that can close a quote, so an
    # apostrophe inside the quoted text ('don't go') does not end it early.
    # DOTALL keeps newlines inside a quote matchable.
    single_quoted = re.compile(r"(?<!\w)'.*?'(?!\w)", re.DOTALL)

    return [
        len(double_quoted.findall(sentence)) + len(single_quoted.findall(sentence))
        for sentence in sentences
    ]

In [26]:
# Per-sentence number of quoted spans (a list per essay) ...
df["sentence_quotes"] = df["sentences"].map(count_quotations_with_regex)
# ... reduced to one number per essay. max()/min() raise ValueError on an
# empty list, so this relies on every essay having at least one sentence.
df["sentence_quotes_sum"] = df["sentence_quotes"].map(sum)
df["sentence_quotes_max"] = df["sentence_quotes"].map(max)
df["sentence_quotes_min"] = df["sentence_quotes"].map(min)
df["sentence_quotes_mean"] = df["sentence_quotes"].map(np.mean)

In [27]:
# Box plots of the sum/max/min/mean of per-sentence quotation counts, by score.
plot_all("sentence_quotes")

Isn't useful.

### Sentence Types

In [28]:
def is_compound(sentence, words):
    """Heuristically decide whether a sentence is compound.

    A sentence is flagged when its token set contains "and", "or" or "but",
    or a semicolon token.

    Args:
        sentence: The sentence text (not used by this check).
        words: Set of the sentence's tokens.

    Returns:
        True if any compound marker is present, otherwise False.
    """
    coordinators = {"and", "or", "but"}
    has_coordinator = bool(words.intersection(coordinators))
    return has_coordinator or ";" in words

In [29]:
def is_complex(sentence, words):
    """Heuristically decide whether a sentence is complex.

    A sentence is flagged when its token set contains one of the single-word
    subordinating markers below or a comma token, or when the sentence text
    contains the phrase "so that". Matching is case-sensitive, so callers
    should pass lower-cased text and tokens.

    Args:
        sentence: The sentence text; searched for the phrase "so that".
        words: Set of the sentence's tokens.

    Returns:
        True if any complex marker is present, otherwise False.
    """
    complex_words = {
        "because",
        "while",
        "though",
        "since",
        "if",
        "which",
        "who",
        "although",
        "unless",
        "when",
        "after",
        "until",
    }

    if words.intersection(complex_words):
        return True
    if "," in words:
        return True

    # "so that" spans two tokens, so it can never be a member of the token
    # set; look for it in the text instead. The word boundaries stop
    # "also that" from matching.
    return re.search(r"\bso\s+that\b", sentence) is not None

In [30]:
def classify_sentence_structure_type(sentences):
    """Label every sentence as Simple, Compound, Complex or ComplexCompound.

    Each sentence is lower-cased and tokenized; the label follows from the
    results of ``is_complex`` and ``is_compound`` on that text and token set.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one label string per sentence, in input order.
    """
    # Keyed by (is complex, is compound).
    labels = {
        (True, True): "ComplexCompound",
        (True, False): "Complex",
        (False, True): "Compound",
        (False, False): "Simple",
    }

    result = []
    for raw_sentence in sentences:
        lowered = raw_sentence.lower()
        tokens = set(word_tokenize(lowered))
        key = (
            bool(is_complex(lowered, tokens)),
            bool(is_compound(lowered, tokens)),
        )
        result.append(labels[key])

    return result

In [31]:
# Quick check of the classifier on four hand-written sentences, one per label.
# The third sentence's own wording says "compound", but it is labelled
# "Complex": it contains "while" and a comma and none of and/or/but/";".
classify_sentence_structure_type([
    "This is a simple sentence",
    "This is a text and a compound text",
    "While this looks simple, it's actually compound.",
    "This is a simple and this one is a complex sentence while attached with a compound."
])

['Simple', 'Compound', 'Complex', 'ComplexCompound']

In [32]:
# One structure label per sentence, then the share of each label per essay
# (label count divided by the essay's sentence count).
df["sentence_structure_type_list"] = df["sentences"].map(classify_sentence_structure_type)
df["sentence_structure_simple_ratio"] = (
    df["sentence_structure_type_list"].map(lambda labels: labels.count("Simple"))
    / df["sentence_count"]
)
df["sentence_structure_compound_ratio"] = (
    df["sentence_structure_type_list"].map(lambda labels: labels.count("Compound"))
    / df["sentence_count"]
)
df["sentence_structure_complex_ratio"] = (
    df["sentence_structure_type_list"].map(lambda labels: labels.count("Complex"))
    / df["sentence_count"]
)
df["sentence_structure_complexCompound_ratio"] = (
    df["sentence_structure_type_list"].map(lambda labels: labels.count("ComplexCompound"))
    / df["sentence_count"]
)

In [33]:
# One row of four subplots, one per sentence-structure ratio, each with a
# box per score.
fig = make_subplots(rows=1, cols=4)

plot(fig, "sentence_structure_simple_ratio", 1, 1)
plot(fig, "sentence_structure_compound_ratio", 1, 2)
plot(fig, "sentence_structure_complex_ratio", 1, 3)
plot(fig, "sentence_structure_complexCompound_ratio", 1, 4)

fig.update_layout(
    height=1080 // 2,
    width=1920,  # full width: four panels side by side
    title_x=0.5,
    title_text="Sentence Structure Ratios",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)

fig.show()

Seems useful.

## Sentence Length

In [34]:
# Length of every sentence in characters (len() of the sentence string),
# as a list per essay ...
df["sentence_length"] = df["sentences"].map(lambda essay: list(map(len, essay)))
# ... reduced to one number per essay and plotted by score.
df["sentence_length_sum"] = df["sentence_length"].map(sum)
df["sentence_length_max"] = df["sentence_length"].map(max)
df["sentence_length_min"] = df["sentence_length"].map(min)
df["sentence_length_mean"] = df["sentence_length"].map(np.mean)
plot_all("sentence_length")

Mean and max sentence length seem useful.