# 📊 Data Preparation

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from transformers import AutoTokenizer
from tokenizers import AddedToken
import plotly.express as px

In [2]:
# Move the working directory two levels up, presumably to the repository root
# so that the `lib.*` imports and the relative data/output paths used below
# resolve — confirm against where this notebook lives.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel climbs two more levels each time.
os.chdir("../../")

Importing user-defined packages

In [3]:
from lib.utils.utils import seed_everything
from lib.config import config
from lib.paths import Paths
from lib.data_tools.data import (
    get_data_loaders,
    clean_text,
    sliding_window,
    negative_sample_df,
    save_data_loaders,
)

### 🌱 Setting Random Seeds

In [4]:
# Seed the random number generators through the project helper. No seed is
# passed here, so the helper's own default applies (presumably taken from
# `config` — verify in lib.utils.utils).
seed_everything()

### 💽 Loading Data

In [5]:
# Read the train and test CSVs from the locations declared in `Paths`.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (Paths.TRAIN_CSV_PATH, Paths.TEST_CSV_PATH)
)

# Show both shapes as (rows, columns) to confirm the load.
train_df.shape, test_df.shape

((17307, 4), (3, 2))

## ⌛ Data Processing

Shifting the score classes down by one so that they range from 0 to 5 instead of 1 to 6.

In [6]:
# Shift every label down by one.
# NOTE: the column is overwritten, so running this cell twice shifts the
# labels twice — restart the kernel and run all cells after editing above.
train_df["score"] = train_df["score"].sub(1)

Cleaning text.

In [7]:
# Run every essay through the project's `clean_text` helper. The function is
# handed to `map` directly; no wrapper lambda is needed for a one-argument call.
train_df["full_text"] = train_df["full_text"].map(clean_text)

## 🪙 Tokenizer

Sources:
1. [MOTH's Notebook](https://www.kaggle.com/code/alejopaullier/aes-2-multi-class-classification-train?scriptVersionId=170290107&cellId=14)

In [8]:
# Set TOKENIZERS_PARALLELISM for this kernel before any tokenizer is created
# (the Hugging Face tokenizers library reads this variable to decide whether
# it may parallelise).
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [9]:
# Load the tokenizer for the checkpoint named in `config.model`.
tokenizer = AutoTokenizer.from_pretrained(config.model)



[Idea of adding special tokens from Chris Deotte](https://www.kaggle.com/code/cdeotte/deberta-v3-small-starter-cv-0-820-lb-0-800?scriptVersionId=174239814&cellId=17)

In [10]:
# Register the newline and the double space as added tokens, presumably so the
# essays' paragraph and spacing structure survives tokenization (see the linked
# notebook). `normalized=False` matches them against the raw, un-normalised text.
# `add_tokens` returns how many tokens were actually added; the cell displays
# the value from the last call only.
# NOTE(review): the vocabulary grows here, so any model using this tokenizer
# needs its embedding matrix resized — confirm the training code does that.
tokenizer.add_tokens([AddedToken("\n", normalized=False)])
tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])

1

In [11]:
# Persist the extended tokenizer under `Paths.TOKENIZER_PATH`; the call returns
# the paths of the files it wrote, which the cell displays.
tokenizer.save_pretrained(Paths.TOKENIZER_PATH)

('output/microsoft/deberta-v3-xsmall/tokenizer_v2/tokenizer_config.json',
 'output/microsoft/deberta-v3-xsmall/tokenizer_v2/special_tokens_map.json',
 'output/microsoft/deberta-v3-xsmall/tokenizer_v2/spm.model',
 'output/microsoft/deberta-v3-xsmall/tokenizer_v2/added_tokens.json',
 'output/microsoft/deberta-v3-xsmall/tokenizer_v2/tokenizer.json')

In [12]:
# Print the tokenizer summary to check that the new tokens were appended after
# the base vocabulary (the newline shows up as id 128001 in the output).
print(tokenizer)

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-xsmall', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("
", rstrip=False, lstrip

## ✂️ Train-Validation Splitting

Sources
1. [MOTH's Notebook](https://www.kaggle.com/code/alejopaullier/aes-2-multi-class-classification-train?scriptVersionId=170290107&cellId=12)
2. [Martin's post](https://www.kaggle.com/competitions/learning-agency-lab-automated-essay-scoring-2/discussion/499959)

To split competition and external data

In [13]:
# NOTE: disabled alternative to the active `split` defined in the next cell,
# kept for reference only. It handles competition and external essays
# separately: competition topics are split with StratifiedGroupKFold (grouped
# by topic), external ones with plain StratifiedKFold. Both halves get a fresh
# 0..n-1 index first, so the positional fold indices line up with `.loc`.
# def split(df):
#     df["fold"] = -1
#     competition_topics = [0, 1, 2, 4, 5, 6, 10]
#     comp_df = df[df.topic.isin(competition_topics)].reset_index(drop=True)
#     external_df = df[~df.topic.isin(competition_topics)].reset_index(drop=True)

#     skf = StratifiedGroupKFold(
#         n_splits=config.n_folds,
#         shuffle=True,
#         random_state=config.random_seed,
#     )
#     X, y = comp_df["full_text"], comp_df["score"]
#     for i, (_, valid_idx) in enumerate(skf.split(X, y, groups=comp_df["topic"])):
#         comp_df.loc[valid_idx, "fold"] = i

#     skf = StratifiedKFold(
#         n_splits=config.n_folds,
#         shuffle=True,
#         random_state=config.random_seed,
#     )
#     X, y = external_df["full_text"], external_df["score"]
#     for i, (_, valid_idx) in enumerate(skf.split(X, y)):
#         external_df.loc[valid_idx, "fold"] = i

#     return pd.concat([comp_df, external_df]).reset_index(drop=True)

To split competition data only

In [14]:
def split(df):
    """Assign a cross-validation fold to every row of ``df``.

    Folds come from ``StratifiedGroupKFold``: stratified on ``score`` and
    grouped by ``topic``, so all rows of one topic land in the same fold.

    Args:
        df: DataFrame with ``full_text``, ``score`` and ``topic`` columns.
            Its index may be anything; it does not have to be 0..n-1.

    Returns:
        The same ``df`` object, with an integer ``fold`` column added (or
        overwritten) in place. Values run from 0 to ``config.n_folds - 1``.
    """
    X, y = df["full_text"], df["score"]

    skf = StratifiedGroupKFold(
        n_splits=config.n_folds,
        shuffle=True,
        random_state=config.random_seed,
    )

    # `skf.split` yields *positional* row numbers. Collect them in a plain
    # array instead of feeding them to the label-based `df.loc`, which would
    # hit the wrong rows (or raise) whenever the index is not 0..n-1.
    folds = np.full(len(df), -1, dtype="int64")
    for i, (_, valid_idx) in enumerate(skf.split(X, y, groups=df["topic"])):
        folds[valid_idx] = i

    # Assigning an array writes by position, independent of the index labels.
    df["fold"] = folds
    return df

In [15]:
# Attach fold ids, then show how many rows each fold received. The split keeps
# each topic inside a single fold, which is presumably why the sizes are uneven.
train_df = split(train_df)
train_df["fold"].value_counts()

fold
4    3499
2    3043
0    3017
5    2094
6    2046
1    1960
3    1648
Name: count, dtype: int64

In [16]:
# Number of non-null scores per (fold, topic) pair, in long format for plotly.
fold_topic_counts = (
    train_df.groupby(["fold", "topic"])["score"]
    .count()
    .reset_index()
    .rename(columns={"score": "count"})
)

fig = px.bar(
    fold_topic_counts,
    x="fold",
    y="count",
    color="topic",
    text="count",
    barmode="group",
)
# `barmode="group"` is already set by `px.bar` above, so the layout call only
# handles size, title and legend placement.
fig.update_layout(
    height=1080 // 2,
    width=1920 // 2,
    title_x=0.5,
    # Plain string: there is nothing to interpolate, so no f-prefix.
    title_text="Data Distribution across folds<br><sup>Colored by topics</sup>",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
)
fig.show()

In [17]:
# How often each score occurs within every fold (columns: fold, score, count).
# The "count" column name used below is what pandas 2.x produces here.
score_counts_by_fold = train_df.groupby(["fold"])["score"].value_counts().reset_index()

fig = px.bar(
    score_counts_by_fold,
    x="score",
    y="count",
    color="fold",
    text="count",
    barmode="group",
)
# `barmode="group"` is already set by `px.bar` above, so the layout call only
# handles size, title and legend placement.
fig.update_layout(
    height=1080 // 2,
    width=1920 // 2,
    title_x=0.5,
    # Plain string: there is nothing to interpolate, so no f-prefix.
    title_text="Data Distribution across scores<br><sup>Colored by Folds</sup>",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
)
fig.show()

## 🪟 Sliding Window

### Token Sequence Lengths before sliding window

In [18]:
# Token count of every essay before windowing. Special tokens are excluded
# (`add_special_tokens=False`) and missing essays are treated as empty strings.
lengths_original = [
    len(tokenizer(text, add_special_tokens=False)["input_ids"])
    for text in tqdm(train_df["full_text"].fillna(""), total=len(train_df))
]

100%|██████████| 17307/17307 [00:08<00:00, 1989.85it/s]


Most sequences have length exceeding `config.max_length`. Instead of truncating and keeping only the first `config.max_length` tokens, I'll use sliding window.

1. In training, the model gets to see more data.
2. In inference, the model gets to see the entire essay.

In [19]:
# Replace the essays by their sliding-window pieces using the project helper,
# then show the new shape.
# NOTE: `train_df` is overwritten, so re-running this cell on its own would
# window the already-windowed rows — re-run from the loading cell instead.
train_df = sliding_window(train_df, tokenizer)
train_df.shape

100%|██████████| 17307/17307 [00:11<00:00, 1546.72it/s]


(17442, 5)

### Token Sequence Lengths after sliding window

In [20]:
# Same measurement as before the sliding window, now on the windowed rows:
# token count per row, special tokens excluded, missing text counted as empty.
lengths_sliding = [
    len(tokenizer(text, add_special_tokens=False)["input_ids"])
    for text in tqdm(train_df["full_text"].fillna(""), total=len(train_df))
]

100%|██████████| 17442/17442 [00:10<00:00, 1716.61it/s]


In [21]:
# Build one long table of token-length frequencies, tagged by whether the
# lengths were measured before or after the sliding window.
length_sources = (
    ("original", lengths_original),
    ("sliding_window", lengths_sliding),
)
l3 = pd.concat(
    [
        pd.DataFrame(lengths)
        .value_counts()
        .reset_index()
        .rename(columns={0: "length"})
        .assign(type=label)
        for label, lengths in length_sources
    ],
    axis=0,
)
# Free the raw length lists; the plot in the next cell reads only `l3`.
del length_sources, lengths_original, lengths_sliding

In [27]:
# `l3` is already aggregated (one row per distinct length and type), so
# `y="count"` makes each bar add up the stored counts — presumably via plotly's
# default "sum" aggregation when `y` is given; confirm against the plotly docs.
fig = px.histogram(
    l3,
    x="length",
    y="count",
    color="type",
    text_auto=True,
)
# Half-HD canvas, centred title, horizontal legend below the plot, and explicit
# axis titles.
fig.update_layout(
    height=1080 // 2,
    width=1920 // 2,
    title_x=0.5,
    title_text="Token length histogram<br><sup>Colored by before and after applying sliding window</sup>",
    legend=dict(orientation="h", yanchor="top", xanchor="center", y=-0.2, x=0.5),
    yaxis_title="count",
    xaxis_title="Token Count",
)
fig.show()

In [23]:
# Score distribution per fold. `train_df` holds the sliding-window rows at this
# point, so the bars count windows rather than original essays.
px.histogram(train_df, x="score", color="fold", text_auto=True, barmode="group")

## 🗃️ Torch Dataset

Sources
1. [MOTH's Notebook](https://www.kaggle.com/code/alejopaullier/aes-2-multi-class-classification-train?scriptVersionId=170290107&cellId=16)

In [24]:
# Debug-only smoke test: build the loaders with fold 0 held out and inspect
# the first training batch.
if config.debug:
    train_fold = train_df[train_df["fold"] != 0].reset_index(drop=True)
    valid_fold = train_df[train_df["fold"] == 0].reset_index(drop=True)

    train_loader, valid_loader = get_data_loaders(train_fold, valid_fold, tokenizer)

    # Only the first batch is needed, hence the immediate `break`.
    for sample in train_loader:
        # `print`, not `pprint`: pprint shows a string's repr, i.e. wrapped in
        # quotes and with a literal "\n" instead of the intended blank line.
        print(f"Encoding keys: {sample.keys()} \n")
        pprint(sample)
        print(sample["inputs"]["attention_mask"].shape)
        print(sample["inputs"]["input_ids"].shape)
        break

In [25]:
# Create the output directory if needed. `exist_ok=True` replaces the separate
# existence check and stays correct if the directory appears in between.
os.makedirs(Paths.DATA_LOADER_PATH, exist_ok=True)

# One train/validation pair per fold: the fold itself is held out for
# validation and all remaining folds form the training set.
for fold in train_df.fold.unique():
    train_fold = train_df[train_df["fold"] != fold].reset_index(drop=True)
    valid_fold = train_df[train_df["fold"] == fold].reset_index(drop=True)

    # Keeping only competition data in validation set.
    # train_fold = pd.concat(
    #     [
    #         train_fold,
    #         valid_fold[~valid_fold.source.isin(["competition", "both"])],
    #     ],
    # ).reset_index(drop=True)
    # valid_fold = valid_fold[valid_fold.source.isin(["competition", "both"])]

    if config.negative_sample:
        # `negative_sample_df` yields (index, sampled training frame) pairs;
        # each sample is saved separately against the same validation fold.
        for i, train_sampled_df in negative_sample_df(train_fold):
            save_data_loaders(tokenizer, fold, train_sampled_df, valid_fold, i)
    else:
        save_data_loaders(tokenizer, fold, train_fold, valid_fold)

Saved data/dataloader_v3/train_1.pth with 15416 samples 
Saved data/dataloader_v3/valid_1.pth with 2026 samples 
Saved data/dataloader_v3/valid_1.csv
Saved data/dataloader_v3/train_5.pth with 15346 samples 
Saved data/dataloader_v3/valid_5.pth with 2096 samples 
Saved data/dataloader_v3/valid_5.csv
Saved data/dataloader_v3/train_4.pth with 13931 samples 
Saved data/dataloader_v3/valid_4.pth with 3511 samples 
Saved data/dataloader_v3/valid_4.csv
Saved data/dataloader_v3/train_0.pth with 14418 samples 
Saved data/dataloader_v3/valid_0.pth with 3024 samples 
Saved data/dataloader_v3/valid_0.csv
Saved data/dataloader_v3/train_6.pth with 15369 samples 
Saved data/dataloader_v3/valid_6.pth with 2073 samples 
Saved data/dataloader_v3/valid_6.csv
Saved data/dataloader_v3/train_2.pth with 14381 samples 
Saved data/dataloader_v3/valid_2.pth with 3061 samples 
Saved data/dataloader_v3/valid_2.csv
Saved data/dataloader_v3/train_3.pth with 15791 samples 
Saved data/dataloader_v3/valid_3.pth with 1