# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from transformers import AutoTokenizer
import torch
import sys

In [2]:
# Switch to the project root (two levels up) so the relative "lib/..." and
# "output/..." paths used below resolve. The chdir is relative, so it is
# guarded: once the root's "lib" directory is visible from the current
# directory, re-running this cell must not climb two more levels.
if not os.path.isdir("lib"):
    os.chdir("../../")

# Expose the textstat sources under lib/ to the import system; skip the append
# on re-runs so sys.path does not accumulate duplicate entries.
if "lib/textstat/textstat" not in sys.path:
    sys.path.append("lib/textstat/textstat")

In [22]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    generate_tfidf_features,
)
from lib.model.inference import ensemble_inference
from lib.data_tools.data import sliding_window
from lib.textstat import textstat

In [4]:
# Called with its default arguments; presumably seeds the random number
# generators used downstream (see lib.utils.utils.seed_everything to confirm).
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Read the competition training CSV; the trailing expression displays (rows, columns).
train_df = pd.read_csv(filepath_or_buffer=Paths.COMPETITION_TRAIN_CSV_PATH)
train_df.shape

(17307, 3)

### Feature Engineering

#### Paragraph Level

In [6]:
# Location of the cached paragraph-level table; the next cell reads it if it
# exists and writes it otherwise.
paragraph_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "paragraphs.csv")

In [7]:
# Reuse the cached paragraph table when it is on disk; otherwise build it from
# a private copy of the training frame, check coverage, and cache it.
if os.path.exists(paragraph_csv_path):
    paragraph_features = pd.read_csv(paragraph_csv_path)
else:
    paragraph_features = process_paragraph(train_df.copy(deep=True))
    # Every training essay must be represented before the table is cached.
    assert len(paragraph_features["essay_id"].unique()) == len(train_df["essay_id"].unique())
    print(paragraph_features.shape)
    paragraph_features.to_csv(paragraph_csv_path, index=False)

In [8]:
# Turn the paragraph table into the engineered paragraph feature frame.
# NOTE: this rebinds `paragraph_features`, so re-running this cell alone would
# feed the already-engineered frame back in; re-run the load cell first.
paragraph_features = paragraph_feature_engineering(paragraph_features)
# No training essay may be lost by the engineering step.
assert len(paragraph_features["essay_id"].unique()) == len(train_df["essay_id"].unique())
paragraph_features.shape

(17307, 37)

#### Sentence Level

In [9]:
# Location of the cached sentence-level table; the next cell reads it if it
# exists and writes it otherwise.
sentence_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "sentences.csv")

In [10]:
# Reuse the cached sentence table when it is on disk; otherwise build it from
# a private copy of the training frame, check coverage, and cache it.
if os.path.exists(sentence_csv_path):
    sentence_features = pd.read_csv(sentence_csv_path)
else:
    sentence_features = process_sentence(train_df.copy(deep=True))
    # Every training essay must be represented before the table is cached.
    assert len(sentence_features["essay_id"].unique()) == len(train_df["essay_id"].unique())
    print(sentence_features.shape)
    sentence_features.to_csv(sentence_csv_path, index=False)

In [11]:
# Turn the sentence table into the engineered sentence feature frame.
# NOTE: this rebinds `sentence_features`, so re-running this cell alone would
# feed the already-engineered frame back in; re-run the load cell first.
sentence_features = sentence_feature_engineering(sentence_features)
# No training essay may be lost by the engineering step.
assert len(sentence_features["essay_id"].unique()) == len(train_df["essay_id"].unique())
sentence_features.shape

(17307, 32)

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [6]:
# Returns both the vectorizer and the per-essay TF-IDF frame. The helper gets a
# deep copy of the training frame; the second argument is None, which
# presumably tells it to fit a new vectorizer rather than reuse one
# (TODO confirm in lib.data_tools.feature_engineering).
vectorizer, tfidf_features = generate_tfidf_features(train_df.copy(deep=True), None)



In [7]:
# The TF-IDF frame must cover exactly as many distinct essays as the training set.
assert len(tfidf_features["essay_id"].unique()) == len(train_df["essay_id"].unique())

In [8]:
# Persist the TF-IDF vectorizer so it can be loaded again outside this notebook.
vectorizer_pkl_path = "output/LGBM/vectorizer.pkl"
# open(..., "wb") does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(vectorizer_pkl_path), exist_ok=True)
with open(vectorizer_pkl_path, "wb") as file:
    pkl.dump(vectorizer, file)

#### DeBERTa Predictions

Predictions from the DeBERTa-v3-large fold models are used as additional features.

In [26]:
# Location of the cached DeBERTa predictions; the next cell reads it if it
# exists and writes it otherwise.
deberta_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "deberta.csv")

In [19]:
# Inference over the whole training set is expensive (the recorded run shows
# roughly 15 minutes per fold model), so the result is cached as a CSV and
# only recomputed when that file is missing.
if not os.path.exists(deberta_csv_path):
    tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)
    root_model_path = "output/microsoft/deberta-v3-large"
    # One "best" checkpoint per fold, each mapped to an equal weight of 1 / n_folds.
    model_paths = {
        os.path.join(root_model_path, f"microsoft_deberta-v3-large_fold_{f}_best.pth"): 1 / config.n_folds
        for f in range(config.n_folds)
    }
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # sliding_window receives a private copy of the training frame plus the
    # tokenizer; presumably it splits long essays into token windows
    # (TODO confirm in lib.data_tools.data).
    df = sliding_window(train_df.copy(deep=True), tokenizer)
    # NOTE(review): every fold model scores the full training set, which may
    # include essays that fold was trained on - confirm the downstream model
    # accounts for this (e.g. by using only out-of-fold predictions).
    deberta_features = ensemble_inference(
        df,
        tokenizer,
        model_paths,
        device,
        logits=True,
        overall=True,
        model_wise_reduction=False,
    )
    # Every training essay must have predictions before the result is cached.
    assert deberta_features.essay_id.unique().shape == train_df.essay_id.unique().shape
    print(deberta_features.shape)
    deberta_features.to_csv(deberta_csv_path, index=False)
else:
    deberta_features = pd.read_csv(deberta_csv_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 0 Inference: 100%|██████████| 779/779 [16:15<00:00,  1.25s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 1 Inference: 100%|██████████| 779/779 [15:54<00:00,  1.23s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 2 Inference: 100%|██████████| 779/779 [14:20<00:00,  1.10s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 3 Inference: 100%|██████████| 779/779 [14:20<00:00,  1.10s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 4 Inference: 100%|██████████| 779/779 [14:20<00:00,  1.10s/test_batch]
Special tokens have been 

(17307, 43)

#### Text Stat Features

[Credits to this repo by andrei-papou](https://github.com/andrei-papou/textstat)

In [11]:
# Output column name -> textstat callable applied to each essay's full text.
# NOTE(review): textstat documents fernandez_huerta, szigriszt_pazos,
# gutierrez_polini and crawford as Spanish formulas, gulpease_index as Italian
# and osman as Arabic - confirm they are meant to be applied to these essays.
feature_map = {
    "flesch_reading_ease": textstat.flesch_reading_ease,
    "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
    "smog_index": textstat.smog_index,
    "coleman_liau_index": textstat.coleman_liau_index,
    "automated_readability_index": textstat.automated_readability_index,
    "dale_chall_readability_score": textstat.dale_chall_readability_score,
    "difficult_words": textstat.difficult_words,
    "linsear_write_formula": textstat.linsear_write_formula,
    "gunning_fog": textstat.gunning_fog,
    # float_output=True requests a numeric result from text_standard.
    "text_standard": lambda x: textstat.text_standard(x, float_output=True),
    "fernandez_huerta": textstat.fernandez_huerta,
    "szigriszt_pazos": textstat.szigriszt_pazos,
    "gutierrez_polini": textstat.gutierrez_polini,
    "crawford": textstat.crawford,
    "gulpease_index": textstat.gulpease_index,
    "osman": textstat.osman,
}

# Work on a copy of the two needed columns so train_df itself is never modified.
text_state_features = train_df[["essay_id", "full_text"]].copy()

# One new column per metric, computed essay by essay.
for metric_name, metric_fn in feature_map.items():
    text_state_features[metric_name] = text_state_features["full_text"].map(metric_fn)

# NOTE(review): this frame is not part of the merge in the "Combining Features"
# cell - confirm whether that omission is intentional.
assert len(text_state_features["essay_id"].unique()) == len(train_df["essay_id"].unique())
text_state_features.shape

(17307, 18)

#### Combining Features

In [22]:
# Start from the DeBERTa predictions and attach each feature table by essay_id.
all_features = deberta_features.copy(deep=True)

# validate="one_to_one" makes pandas raise MergeError if either side repeats
# an essay_id. Without it, a duplicated key would silently multiply rows, and
# the unique-count assertion below would not notice.
for feature_df in [paragraph_features, sentence_features, tfidf_features]:
    all_features = pd.merge(
        all_features,
        feature_df,
        on="essay_id",
        how="inner",
        validate="one_to_one",
    )

# Attach the target column last.
all_features = pd.merge(
    all_features,
    train_df[["essay_id", "score"]],
    on="essay_id",
    how="inner",
    validate="one_to_one",
)
# The inner joins drop essays missing from any table; make sure none were lost.
assert all_features.essay_id.unique().shape == train_df.essay_id.unique().shape
all_features.shape

(17307, 211)

In [24]:
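# Saving the combined feature table is currently disabled; uncomment the line
# below to write it to Paths.FEATURE_ENGG_CSV_PATH.
# all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)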