In [1]:
import os
import pandas as pd

In [2]:
os.chdir("../../")

In [3]:
from lib.data_tools.word_feature_engineering import all_words

### Scoring Tools

In [4]:
def scrabble_score(word):
    """Assigns difficulty score based on how uncommon letters it contains."""
    SCORES = {
        'a': 1, 'b': 3, 'c': 3, 'd': 2, 'e': 1, 'f': 4, 'g': 2,
        'h': 4, 'i': 1, 'j': 8, 'k': 5, 'l': 1, 'm': 3, 'n': 1,
        'o': 1, 'p': 3, 'q': 10, 'r': 1, 's': 1, 't': 1, 'u': 1,
        'v': 4, 'w': 4, 'x': 8, 'y': 4, 'z': 10
    }
    total_score = sum(SCORES.get(letter.lower(), 0) for letter in word)
    return total_score

for word in list(all_words)[:10]:
    print(word, scrabble_score(word))

Repeat 8
Serve 8
despitefully 21
tile 4
Osama 7
appearance 16
exempt 17
it.30 2
PROFESSIONAL 17
e're 3


In [5]:
def consonent_score(word):
    """Assigns difficulty score based on the number of consequent consonents."""
    max_score = score = 0

    for letter in word:
        if letter in 'aeiou':
            score += 1
            max_score = max(score, max_score)
        else:
            score = 0

    return max_score

for word in list(all_words)[:10]:
    print(word, consonent_score(word))

Repeat 2
Serve 1
despitefully 1
tile 1
Osama 1
appearance 2
exempt 1
it.30 1
PROFESSIONAL 0
e're 1


In [6]:
import json

if not os.path.exists("data/syllable.json"):
    syllable_dict = {}

    with open("data/cmudict.rep") as file:
        line_count = 0

        for line in file:
            line_count += 1

            if line_count < 107:
                continue
            
            line = line.strip()

            parts = line.split(' ')
            word, syllables = parts[0].lower(), parts[2:]
            syllable_dict[word] = syllables

    json.dump(syllable_dict, open("data/syllable.json", "w"))
else:
    syllable_dict = json.load(open("data/syllable.json", "r"))

In [7]:
def syllable_score(word):
    """Assigns score based on number of syllables there are in the word."""
    syllables = syllable_dict.get(word.lower(), [])
    return len(syllables)

for word in list(all_words)[:10]:
    print(word, syllable_score(word))

Repeat 6
Serve 3
despitefully 0
tile 3
Osama 7
appearance 9
exempt 8
it.30 0
PROFESSIONAL 13
e're 0


In [8]:
from collections import Counter

In [9]:
if not os.path.exists("syllable_counter.json"):
    all_syllables = []

    for syllables in syllable_dict.values():
        all_syllables += syllables

    syllable_count = Counter(all_syllables)
    syllable_count = {syl: count for syl, count in syllable_count.most_common()}
    json.dump(syllable_count, open("data/syllable_counter.json", "w"))
else:
    syllable_count = json.load(open("data/syllable_counter.json", "r"))

In [10]:
class WordDifficultyScorer:
    def __init__(self):
        self.syllable_dict = json.load(open("data/syllable.json", "r"))
        self.syllable_counter = json.load(open("data/syllable_counter.json", "r"))
        self.SCORES = {
            'a': 1, 'b': 3, 'c': 3, 'd': 2, 'e': 1, 'f': 4, 'g': 2,
            'h': 4, 'i': 1, 'j': 8, 'k': 5, 'l': 1, 'm': 3, 'n': 1,
            'o': 1, 'p': 3, 'q': 10, 'r': 1, 's': 1, 't': 1, 'u': 1,
            'v': 4, 'w': 4, 'x': 8, 'y': 4, 'z': 10
        }

    def total_syllable_score(self, word):
        """Assigns score based on number of syllables."""
        return len(self.syllable_dict.get(word.lower(), []))
    

    def unique_syllable_score(self, word):
        """Assigns score based on number of unique syllables."""
        return len(set(self.syllable_dict.get(word.lower(), [])))
    
    
    def syllable_rarity_score(self, word):
        """Assigns score based on syllable rarity. Lower value means rarer."""
        total_score = 0

        for syllable in self.syllable_dict.get(word.lower(), []):
            count = self.syllable_counter.get(syllable, 0)
            
            if count < 1000:
                total_score += 1
            elif count < 10000:
                total_score += 5
            else:
                total_score += 25

        return total_score
    
    
    def consonent_score(self, word):
        """Assigns difficulty score based on the number of consequent consonents."""
        max_score = score = 0

        for letter in word:
            if letter in 'aeiou':
                score += 1
                max_score = max(score, max_score)
            else:
                score = 0

        return max_score
    
    
    def scrabble_score(self, word):
        """Assigns difficulty score based on how uncommon letters it contains."""
        return sum(self.SCORES.get(letter.lower(), 0) for letter in word)

In [11]:
scorer = WordDifficultyScorer()

for word in list(all_words)[:10] + ["Levenshtein"]:
    print(
        word,
        scorer.scrabble_score(word),
        scorer.consonent_score(word),
        scorer.total_syllable_score(word),
        scorer.unique_syllable_score(word),
        scorer.syllable_rarity_score(word)
    )

Repeat 8 2 6 6 130
Serve 8 1 3 3 55
despitefully 21 1 0 0 0
tile 4 1 3 3 55
Osama 7 1 7 6 155
appearance 16 2 9 7 225
exempt 17 1 8 8 200
it.30 2 1 0 0 0
PROFESSIONAL 17 0 13 9 305
e're 3 1 0 0 0
Levenshtein 17 2 0 0 0


### EDA on Difficulty Scoring

In [12]:
from lib.data_tools.word_feature_engineering import process_word
from lib.paths import Paths

In [13]:
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
train_df.shape

(17307, 3)

In [14]:
words = process_word(train_df.copy(deep=True))
words.shape

(17307, 4)

In [15]:
words.drop(columns=["full_text"], inplace=True)

In [16]:
words.columns

Index(['essay_id', 'score', 'words'], dtype='object')

In [17]:
words["scrabble_scores"] = words["words"].map(lambda x: [scorer.scrabble_score(y) for y in x])
words["consonent_score"] = words["words"].map(lambda x: [scorer.consonent_score(y) for y in x])
words["total_syllable_score"] = words["words"].map(lambda x: [scorer.total_syllable_score(y) for y in x])
words["unique_syllable_score"] = words["words"].map(lambda x: [scorer.unique_syllable_score(y) for y in x])
words["syllable_rarity_score"] = words["words"].map(lambda x: [scorer.syllable_rarity_score(y) for y in x])

In [18]:
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [19]:
def plot(fig, name, row, col):
    color = {
        1: "blue",
        2: "red",
        3: "green",
        4: "purple",
        5: "orange",
        6: "pink",
    }
    for score in range(1, 7):
        fig.add_trace(
            go.Box(
                x=words.loc[words["score"] == score, name],
                name=score,
                legendgroup=score,
                marker=dict(color=color[score]),
                showlegend=False,
            ),
            row=row,
            col=col,
        )

    fig.update_yaxes(title_text="Score", row=row, col=col)
    fig.update_xaxes(title_text=name, row=row, col=col)

#### Mean

In [20]:
words["mean_scrabble_score"] = words["scrabble_scores"].map(np.mean)
words["mean_consonent_score"] = words["consonent_score"].map(np.mean)
words["mean_total_syllable_score"] = words["total_syllable_score"].map(np.mean)
words["mean_unique_syllable_score"] = words["unique_syllable_score"].map(np.mean)
words["mean_syllable_rarity_score"] = words["syllable_rarity_score"].map(np.mean)

In [21]:
names = [
    "mean_scrabble_score",
    "mean_consonent_score",
    "mean_total_syllable_score",
    "mean_unique_syllable_score",
    "mean_syllable_rarity_score",
]

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=names,
    shared_yaxes=True,
)
for i, name in enumerate(names):
    plot(fig, name, i // 3 + 1, i % 3 + 1)

fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    title_text=f"Mean Word Difficulty Score",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)
fig.show()

#### Sum

In [22]:
words["sum_scrabble_score"] = words["scrabble_scores"].map(np.sum)
words["sum_consonent_score"] = words["consonent_score"].map(np.sum)
words["sum_total_syllable_score"] = words["total_syllable_score"].map(np.sum)
words["sum_unique_syllable_score"] = words["unique_syllable_score"].map(np.sum)
words["sum_syllable_rarity_score"] = words["syllable_rarity_score"].map(np.sum)

In [23]:
names = [
    "sum_scrabble_score",
    "sum_consonent_score",
    "sum_total_syllable_score",
    "sum_unique_syllable_score",
    "sum_syllable_rarity_score",
]

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=names,
    shared_yaxes=True,
)
for i, name in enumerate(names):
    plot(fig, name, i // 3 + 1, i % 3 + 1)

fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    title_text=f"Sum Word Difficulty Score",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)
fig.show()

#### Normalized Sum

In [24]:
words["word_count"] = words["words"].map(len)
words["norm_scrabble_score"] = words["scrabble_scores"].map(np.sum) / words["word_count"]
words["norm_consonent_score"] = words["consonent_score"].map(np.sum) / words["word_count"]
words["norm_total_syllable_score"] = words["total_syllable_score"].map(np.sum) / words["word_count"]
words["norm_unique_syllable_score"] = words["unique_syllable_score"].map(np.sum) / words["word_count"]
words["norm_syllable_rarity_score"] = words["syllable_rarity_score"].map(np.sum) / words["word_count"]

In [25]:
names = [
    "norm_scrabble_score",
    "norm_consonent_score",
    "norm_total_syllable_score",
    "norm_unique_syllable_score",
    "norm_syllable_rarity_score",
]

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=names,
    shared_yaxes=True,
)
for i, name in enumerate(names):
    plot(fig, name, i // 3 + 1, i % 3 + 1)

fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    title_text=f"Normalized Word Difficulty Score",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)
fig.show()

#### Max

In [26]:
words["max_scrabble_score"] = words["scrabble_scores"].map(np.max)
words["max_consonent_score"] = words["consonent_score"].map(np.max)
words["max_total_syllable_score"] = words["total_syllable_score"].map(np.max)
words["max_unique_syllable_score"] = words["unique_syllable_score"].map(np.max)
words["max_syllable_rarity_score"] = words["syllable_rarity_score"].map(np.max)

In [27]:
names = [
    "max_scrabble_score",
    "max_consonent_score",
    "max_total_syllable_score",
    "max_unique_syllable_score",
    "max_syllable_rarity_score",
]

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=names,
    shared_yaxes=True,
)
for i, name in enumerate(names):
    plot(fig, name, i // 3 + 1, i % 3 + 1)

fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    title_text=f"Max Word Difficulty Score",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)
fig.show()

#### Min

In [28]:
words["min_scrabble_score"] = words["scrabble_scores"].map(np.min)
words["min_consonent_score"] = words["consonent_score"].map(np.min)
words["min_total_syllable_score"] = words["total_syllable_score"].map(np.min)
words["min_unique_syllable_score"] = words["unique_syllable_score"].map(np.min)
words["min_syllable_rarity_score"] = words["syllable_rarity_score"].map(np.min)

In [29]:
names = [
    "min_scrabble_score",
    "min_consonent_score",
    "min_total_syllable_score",
    "min_unique_syllable_score",
    "min_syllable_rarity_score",
]

fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=names,
    shared_yaxes=True,
)
for i, name in enumerate(names):
    plot(fig, name, i // 3 + 1, i % 3 + 1)

fig.update_layout(
    height=1080,
    width=1920,
    title_x=0.5,
    title_text=f"Min Word Difficulty Score",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    barmode="group",
)
fig.show()