# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer
import torch

In [2]:
# Work from the repository root: the cells below import `lib` and use relative
# "data/..." and "output/..." paths. The guard makes the cell idempotent, so
# re-running it after the first execution does not climb two more directories up.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
)
from lib.data_tools.word_feature_engineering import engineer_word_features
from lib.data_tools.grammar_feature_engineering import engineer_grammar_feature
from lib.model.inference import ensemble_inference
from lib.data_tools.data import sliding_window

In [4]:
# Called with no arguments, so whatever defaults lib.utils.utils.seed_everything
# defines are used. NOTE(review): presumably seeds the RNGs for reproducibility;
# confirm which libraries it covers.
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Load the competition training table. Every feature table built below is checked
# against its essay_id column, and its score column is merged into the final table.
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
train_df.shape  # displayed as (rows, columns)

(17307, 3)

### Feature Engineering

#### Paragraph Level

In [6]:
# CSV file under the feature directory that caches the paragraph-level table.
paragraph_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "paragraphs.csv")

In [7]:
# Reuse the cached paragraph table when it exists; otherwise build it from a deep
# copy of the training data, verify it, and write the cache for later runs.
if os.path.exists(paragraph_csv_path):
    paragraph_features = pd.read_csv(paragraph_csv_path)
else:
    paragraph_features = process_paragraph(train_df.copy(deep=True))
    # The number of distinct essays must match the training data.
    assert len(paragraph_features.essay_id.unique()) == len(train_df.essay_id.unique())
    print(paragraph_features.shape)
    paragraph_features.to_csv(paragraph_csv_path, index=False)

In [8]:
# Derive the final paragraph feature frame from the paragraph table.
# NOTE(review): this rebinds `paragraph_features` to its own transformed value, so
# re-running this cell without re-running the loading cell feeds already-engineered
# data back into paragraph_feature_engineering.
paragraph_features = paragraph_feature_engineering(paragraph_features)
# The number of distinct essays must still match the training data.
assert len(paragraph_features.essay_id.unique()) == len(train_df.essay_id.unique())
paragraph_features.shape

(17307, 37)

#### Sentence Level

In [9]:
# CSV file under the feature directory that caches the sentence-level table.
sentence_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "sentences.csv")

In [10]:
# Reuse the cached sentence table when it exists; otherwise build it from a deep
# copy of the training data, verify it, and write the cache for later runs.
if os.path.exists(sentence_csv_path):
    sentence_features = pd.read_csv(sentence_csv_path)
else:
    sentence_features = process_sentence(train_df.copy(deep=True))
    # The number of distinct essays must match the training data.
    assert len(sentence_features.essay_id.unique()) == len(train_df.essay_id.unique())
    print(sentence_features.shape)
    sentence_features.to_csv(sentence_csv_path, index=False)

In [11]:
# Derive the final sentence feature frame from the sentence table.
# NOTE(review): this rebinds `sentence_features` to its own transformed value, so
# re-running this cell without re-running the loading cell feeds already-engineered
# data back into sentence_feature_engineering.
sentence_features = sentence_feature_engineering(sentence_features)
# The number of distinct essays must still match the training data.
assert len(sentence_features.essay_id.unique()) == len(train_df.essay_id.unique())
sentence_features.shape

(17307, 32)

#### Word Features

In [12]:
# CSV file under the feature directory that caches the word-level feature table.
word_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "word.csv")

In [13]:
# Build the word-level feature table once and cache it as CSV; later runs simply
# reload the cached file.
if not os.path.exists(word_csv_path):
    word_features = engineer_word_features(train_df.copy(deep=True))
    # Sanity check against the training data (not against word_features itself,
    # which would always pass): the number of distinct essays must be unchanged.
    assert word_features.essay_id.unique().shape == train_df.essay_id.unique().shape
    print(word_features.shape)
    word_features.to_csv(word_csv_path, index=False)
else:
    word_features = pd.read_csv(word_csv_path)

(17307, 9)


#### DeBERTa Predictions

Predictions from the per-fold DeBERTa-v3-large checkpoints, used here as additional features.

In [14]:
# CSV file under the feature directory that caches the DeBERTa predictions.
deberta_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "deberta.csv")

In [15]:
# Run DeBERTa inference only when no cached predictions exist; otherwise reload
# the cached CSV.
if not os.path.exists(deberta_csv_path):
    tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)
    root_model_path = "output/microsoft/deberta-v3-large"
    # One checkpoint per fold, each mapped to the same weight 1 / n_folds.
    model_paths = {
        os.path.join(root_model_path, f"microsoft_deberta-v3-large_fold_{f}_best.pth"): 1 / config.n_folds
        for f in range(config.n_folds)
    }
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Pre-process a deep copy of the training data with the project's sliding_window helper.
    df = sliding_window(train_df.copy(deep=True), tokenizer)
    # NOTE(review): the meaning of logits / overall / model_wise_reduction is defined in
    # lib.model.inference; model_wise_reduction=False presumably keeps each model's
    # outputs as separate columns - confirm.
    deberta_features = ensemble_inference(
        df,
        tokenizer,
        model_paths,
        device,
        logits=True,
        overall=True,
        model_wise_reduction=False,
    )
    # The number of distinct essays must match the training data.
    assert deberta_features.essay_id.unique().shape == train_df.essay_id.unique().shape
    print(deberta_features.shape)
    deberta_features.to_csv(deberta_csv_path, index=False)
else:
    deberta_features = pd.read_csv(deberta_csv_path)

#### Grammar Features

In [6]:
# CSV file under the feature directory that caches the grammar feature table.
grammar_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "grammar_scores.csv")

In [7]:
# Reuse the cached grammar feature table when it exists; otherwise derive it from
# the grammar_correct.csv input, verify it, and write the cache for later runs.
if os.path.exists(grammar_csv_path):
    grammar_features = pd.read_csv(grammar_csv_path)
else:
    grammar_features = engineer_grammar_feature(
        pd.read_csv("data/feature_engg/grammar_correct.csv")
    )
    # The number of distinct essays must match the training data.
    assert len(grammar_features.essay_id.unique()) == len(train_df.essay_id.unique())
    print(grammar_features.shape)
    grammar_features.to_csv(grammar_csv_path, index=False)

(17307, 29)


#### Combining Features

In [18]:
# Feature tables joined onto the DeBERTa predictions, in this order.
feature_df_list = [
    paragraph_features,
    sentence_features,
    word_features,
    grammar_features,
]

# Start from a deep copy so the merges never touch deberta_features itself.
all_features = deberta_features.copy(deep=True)

# Inner joins on essay_id. validate="one_to_one" makes pandas raise a MergeError if
# either side has repeated essay_id values, which would otherwise silently
# duplicate rows without tripping the unique-count assertion below.
for feature_df in feature_df_list:
    all_features = pd.merge(all_features, feature_df, on="essay_id", validate="one_to_one")

# Attach the score column from the training data.
all_features = pd.merge(
    all_features,
    train_df[["essay_id", "score"]],
    on="essay_id",
    validate="one_to_one",
)
# An inner join drops unmatched essays, so confirm that none were lost.
assert all_features.essay_id.unique().shape == train_df.essay_id.unique().shape
all_features.shape

(17307, 147)

In [19]:
# Persist the combined feature table (including score) without the DataFrame index.
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)

In [21]:
# Display every column name of the combined table as a plain Python list.
all_features.columns.to_list()

['essay_id',
 'deberta_m0_c0',
 'deberta_m0_c1',
 'deberta_m0_c2',
 'deberta_m0_c3',
 'deberta_m0_c4',
 'deberta_m0_c5',
 'deberta_m1_c0',
 'deberta_m1_c1',
 'deberta_m1_c2',
 'deberta_m1_c3',
 'deberta_m1_c4',
 'deberta_m1_c5',
 'deberta_m2_c0',
 'deberta_m2_c1',
 'deberta_m2_c2',
 'deberta_m2_c3',
 'deberta_m2_c4',
 'deberta_m2_c5',
 'deberta_m3_c0',
 'deberta_m3_c1',
 'deberta_m3_c2',
 'deberta_m3_c3',
 'deberta_m3_c4',
 'deberta_m3_c5',
 'deberta_m4_c0',
 'deberta_m4_c1',
 'deberta_m4_c2',
 'deberta_m4_c3',
 'deberta_m4_c4',
 'deberta_m4_c5',
 'deberta_m5_c0',
 'deberta_m5_c1',
 'deberta_m5_c2',
 'deberta_m5_c3',
 'deberta_m5_c4',
 'deberta_m5_c5',
 'deberta_m6_c0',
 'deberta_m6_c1',
 'deberta_m6_c2',
 'deberta_m6_c3',
 'deberta_m6_c4',
 'deberta_m6_c5',
 'paragraph_error_count_mean',
 'paragraph_error_count_max',
 'paragraph_error_count_sum',
 'paragraph_char_count_mean',
 'paragraph_char_count_max',
 'paragraph_char_count_sum',
 'paragraph_word_count_mean',
 'paragraph_word_count