# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import AutoTokenizer
import torch

In [2]:
# Move from the notebook's folder up to the project root so that `lib` is importable and the
# relative "output/..." paths below resolve. The guard keeps the cell idempotent: once `lib/`
# is visible in the working directory, re-running the cell no longer climbs two more levels.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)
from lib.model.inference import ensemble_inference
from lib.data_tools.data import sliding_window

In [4]:
# Seed the random number generators through the project helper (called with its defaults)
# before any `.sample()` call below. Presumably this covers numpy/torch - verify in lib.utils.utils.
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Load the competition training set from the path configured in `Paths`; the shape is shown
# as a quick sanity check on the number of essays and columns.
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
train_df.shape

(17307, 3)

### Feature Engineering

#### Paragraph Level

In [6]:
# Build the paragraph-level table from a deep copy of the training data, so `train_df` itself
# is not handed to the helper. The result has many rows per essay (see the shape below).
paragraph_features = process_paragraph(train_df.copy(deep=True))
paragraph_features.shape

(85934, 9)

In [7]:
# Spot-check three random paragraph rows and their per-paragraph counts.
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
12725,bbc0087,The development of driverless cars should be a...,4,"in just a few years, with manufacturers making...",in just a few years with manufacturers making ...,0,150,23,1
5194,4d8bb17,When you say that the face on mars is an alien...,3,when you say that the face on mars is an alien...,when you say that the face on mars is an alien...,1,244,55,5
13746,c9d4074,Driverless cars are quickly becoming a reality...,4,the focus should be on teaching people to driv...,the focus should be on teaching people to driv...,0,148,27,1


In [8]:
# Every essay must still be represented after splitting into paragraphs. Comparing the id
# sets (not just the number of distinct ids) also catches ids that were altered or swapped.
assert set(paragraph_features.essay_id.unique()) == set(train_df.essay_id.unique()), (
    "paragraph_features and train_df do not cover the same essay_ids"
)

In [9]:
# Aggregate the per-paragraph rows into essay-level features (one row per essay, per the shape
# below). NOTE: this rebinds `paragraph_features`, so the cell is not idempotent - re-run the
# `process_paragraph` cell first if this one has to be executed again.
paragraph_features = paragraph_feature_engineering(paragraph_features)
paragraph_features.shape

  lambda x: kurtosis(x),


(17307, 125)

In [10]:
# List the aggregated paragraph feature names (the stored output below is truncated).
paragraph_features.columns.to_list()

['essay_id',
 'paragraph_error_count_mean',
 'paragraph_error_count_min',
 'paragraph_error_count_max',
 'paragraph_error_count_sum',
 'paragraph_error_count_first',
 'paragraph_error_count_last',
 'paragraph_char_count_mean',
 'paragraph_char_count_min',
 'paragraph_char_count_max',
 'paragraph_char_count_sum',
 'paragraph_char_count_first',
 'paragraph_char_count_last',
 'paragraph_word_count_mean',
 'paragraph_word_count_min',
 'paragraph_word_count_max',
 'paragraph_word_count_sum',
 'paragraph_word_count_first',
 'paragraph_word_count_last',
 'paragraph_sentence_count_mean',
 'paragraph_sentence_count_min',
 'paragraph_sentence_count_max',
 'paragraph_sentence_count_sum',
 'paragraph_sentence_count_first',
 'paragraph_sentence_count_last',
 'paragraph_error_count_q1',
 'paragraph_error_count_q3',
 'paragraph_error_count_kurtosis',
 'paragraph_char_count_q1',
 'paragraph_char_count_q3',
 'paragraph_char_count_kurtosis',
 'paragraph_word_count_q1',
 'paragraph_word_count_q3',
 'para

In [11]:
# Spot-check three random essays from the aggregated paragraph feature table.
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_error_count_len_l_30,paragraph_error_count_len_goe_30,paragraph_error_count_len_l_35,paragraph_error_count_len_goe_35,paragraph_error_count_len_l_40,paragraph_error_count_len_goe_40,paragraph_error_count_len_l_45,paragraph_error_count_len_goe_45,paragraph_error_count_len_l_50,paragraph_error_count_len_goe_50
10716,9e219bb,1.625,0,7,13,0,0,215.5,17,422,...,True,False,True,False,True,False,True,False,True,False
16341,f114047,7.5,1,13,45,1,5,416.666667,128,699,...,True,False,True,False,True,False,True,False,True,False
9190,87c531e,2.833333,0,7,17,1,2,416.166667,44,1093,...,True,False,True,False,True,False,True,False,True,False


In [12]:
# After aggregation the table must hold exactly one row per essay and cover the same ids as
# the training data; comparing only the number of distinct ids would miss both problems.
assert paragraph_features.essay_id.is_unique, "paragraph_features has more than one row per essay_id"
assert set(paragraph_features.essay_id) == set(train_df.essay_id.unique()), (
    "paragraph_features and train_df do not cover the same essay_ids"
)

#### Sentence Level

In [13]:
# Build the sentence-level table from a deep copy of the training data; the result has many
# rows per essay (see the shape below).
sentence_features = process_sentence(train_df.copy(deep=True))
sentence_features.shape

(330422, 8)

In [14]:
# Every essay must still be represented after splitting into sentences. Comparing the id
# sets (not just the number of distinct ids) also catches ids that were altered or swapped.
assert set(sentence_features.essay_id.unique()) == set(train_df.essay_id.unique()), (
    "sentence_features and train_df do not cover the same essay_ids"
)

In [15]:
# Show the per-sentence columns that the aggregation step below receives.
sentence_features.columns.to_list()

['essay_id',
 'full_text',
 'score',
 'sentence',
 'sentence_no_punctuation',
 'sentence_error_count',
 'sentence_char_count',
 'sentence_word_count']

In [16]:
# Aggregate the per-sentence rows into essay-level features (one row per essay, per the shape
# below). NOTE: this rebinds `sentence_features`, so the cell is not idempotent - re-run the
# `process_sentence` cell first if this one has to be executed again.
sentence_features = sentence_feature_engineering(sentence_features)
sentence_features.shape

  lambda x: kurtosis(x),


(17307, 92)

In [17]:
# Spot-check three random essays from the aggregated sentence feature table.
sentence_features.sample(3)

Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_error_count_len_l_30,sentence_error_count_len_goe_30,sentence_error_count_len_l_35,sentence_error_count_len_goe_35,sentence_error_count_len_l_40,sentence_error_count_len_goe_40,sentence_error_count_len_l_45,sentence_error_count_len_goe_45,sentence_error_count_len_l_50,sentence_error_count_len_goe_50
10166,9612d64,0.214286,0,2,6,0,0,89.107143,18,146,...,True,False,True,False,True,False,True,False,True,False
10215,96c77d6,0.103448,0,1,3,0,0,96.482759,39,189,...,True,False,True,False,True,False,True,False,True,False
3377,3217d6b,0.444444,0,2,8,1,0,76.777778,18,152,...,True,False,True,False,True,False,True,False,True,False


In [18]:
# After aggregation the table must hold exactly one row per essay and cover the same ids as
# the training data; comparing only the number of distinct ids would miss both problems.
assert sentence_features.essay_id.is_unique, "sentence_features has more than one row per essay_id"
assert set(sentence_features.essay_id) == set(train_df.essay_id.unique()), (
    "sentence_features and train_df do not cover the same essay_ids"
)

In [19]:
# List the aggregated sentence feature names (the stored output below is truncated).
sentence_features.columns.to_list()

['essay_id',
 'sentence_error_count_mean',
 'sentence_error_count_min',
 'sentence_error_count_max',
 'sentence_error_count_sum',
 'sentence_error_count_first',
 'sentence_error_count_last',
 'sentence_char_count_mean',
 'sentence_char_count_min',
 'sentence_char_count_max',
 'sentence_char_count_sum',
 'sentence_char_count_first',
 'sentence_char_count_last',
 'sentence_word_count_mean',
 'sentence_word_count_min',
 'sentence_word_count_max',
 'sentence_word_count_sum',
 'sentence_word_count_first',
 'sentence_word_count_last',
 'sentence_error_count_q1',
 'sentence_error_count_q3',
 'sentence_error_count_kurtosis',
 'sentence_char_count_q1',
 'sentence_char_count_q3',
 'sentence_char_count_kurtosis',
 'sentence_word_count_q1',
 'sentence_word_count_q3',
 'sentence_word_count_kurtosis',
 'sentence_char_count_len_l_25',
 'sentence_char_count_len_goe_25',
 'sentence_char_count_len_l_50',
 'sentence_char_count_len_goe_50',
 'sentence_char_count_len_l_75',
 'sentence_char_count_len_goe_75

#### Word Level

In [20]:
# Build the word-level table from a deep copy of the training data. This is by far the largest
# intermediate frame in the notebook (several million rows, see the shape below).
word_features = process_word(train_df.copy(deep=True))
word_features.shape

(6350538, 5)

In [21]:
# Every essay must still be represented after splitting into words. Comparing the id sets
# (not just the number of distinct ids) also catches ids that were altered or swapped.
# `.unique()` is applied first so the set is not built from millions of word rows.
assert set(word_features.essay_id.unique()) == set(train_df.essay_id.unique()), (
    "word_features and train_df do not cover the same essay_ids"
)

In [22]:
# Show the per-word columns that the aggregation step below receives.
word_features.columns.to_list()

['essay_id', 'full_text', 'score', 'word', 'word_char_count']

In [23]:
# Aggregate the per-word rows into essay-level features (one row per essay, per the shape
# below). NOTE: this rebinds `word_features`, so the cell is not idempotent - re-run the
# `process_word` cell first if this one has to be executed again.
# NOTE(review): the resulting columns include 'word_char_count_<lambda_2>', which suggests an
# aggregation lambda in `word_feature_engineering` was never given a name (the paragraph and
# sentence tables have a '*_kurtosis' column in the same position) - confirm and rename there.
word_features = word_feature_engineering(word_features)
word_features.shape

(17307, 33)

In [24]:
# After aggregation the table must hold exactly one row per essay and cover the same ids as
# the training data; comparing only the number of distinct ids would miss both problems.
assert word_features.essay_id.is_unique, "word_features has more than one row per essay_id"
assert set(word_features.essay_id) == set(train_df.essay_id.unique()), (
    "word_features and train_df do not cover the same essay_ids"
)

In [25]:
# List the aggregated word feature names.
word_features.columns.to_list()

['essay_id',
 'word_char_count_mean',
 'word_char_count_min',
 'word_char_count_max',
 'word_char_count_q1',
 'word_char_count_q3',
 'word_char_count_<lambda_2>',
 'word_char_count_len_l_5',
 'word_char_count_len_goe_5',
 'word_char_count_len_l_7',
 'word_char_count_len_goe_7',
 'word_char_count_len_l_9',
 'word_char_count_len_goe_9',
 'word_char_count_len_l_11',
 'word_char_count_len_goe_11',
 'word_char_count_len_l_13',
 'word_char_count_len_goe_13',
 'word_char_count_len_l_15',
 'word_char_count_len_goe_15',
 'word_char_count_len_l_17',
 'word_char_count_len_goe_17',
 'word_char_count_len_l_19',
 'word_char_count_len_goe_19',
 'word_char_count_len_l_21',
 'word_char_count_len_goe_21',
 'word_char_count_len_l_23',
 'word_char_count_len_goe_23',
 'word_char_count_len_l_25',
 'word_char_count_len_goe_25',
 'word_char_count_len_l_27',
 'word_char_count_len_goe_27',
 'word_char_count_len_l_29',
 'word_char_count_len_goe_29']

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [26]:
# Produce the TF-IDF feature table together with the vectorizer object that is pickled below.
# The second argument is passed as None - presumably "no existing vectorizer, fit a new one";
# confirm against `generate_tfidf_features`.
vectorizer, tfidf_features = generate_tfidf_features(train_df.copy(deep=True), None)

In [27]:
# The TF-IDF table must cover exactly the essays in the training data. Comparing the id sets
# (not just the number of distinct ids) also catches ids that were altered or swapped.
assert set(tfidf_features.essay_id.unique()) == set(train_df.essay_id.unique()), (
    "tfidf_features and train_df do not cover the same essay_ids"
)

In [28]:
# Make sure the target folder exists; on a fresh checkout `open(..., "wb")` would otherwise
# raise FileNotFoundError after the features have already been computed.
os.makedirs("output/LGBM", exist_ok=True)

# Persist the vectorizer returned by `generate_tfidf_features` so it can be reloaded later.
with open("output/LGBM/vectorizer.pkl", "wb") as file:
    pkl.dump(vectorizer, file)

#### Count Vectorizer

A `CountVectorizer` is used to convert the essays into numerical features.

In [29]:
# Produce the count-based feature table together with the vectorizer object that is pickled
# below. The second argument is passed as None - presumably "no existing vectorizer, fit a new
# one"; confirm against `generate_count_features`.
vectorizer_cnt, count_features = generate_count_features(train_df.copy(deep=True), None)

In [30]:
# The count-feature table must cover exactly the essays in the training data. Comparing the
# id sets (not just the number of distinct ids) also catches ids that were altered or swapped.
assert set(count_features.essay_id.unique()) == set(train_df.essay_id.unique()), (
    "count_features and train_df do not cover the same essay_ids"
)

In [31]:
# Make sure the target folder exists; on a fresh checkout `open(..., "wb")` would otherwise
# raise FileNotFoundError after the features have already been computed.
os.makedirs("output/LGBM", exist_ok=True)

# Persist the vectorizer returned by `generate_count_features` so it can be reloaded later.
with open("output/LGBM/vectorizer_cnt.pkl", "wb") as file:
    pkl.dump(vectorizer_cnt, file)

#### DeBERTa Predictions

Predictions made by the fine-tuned DeBERTa models (one checkpoint per fold) are used as additional features.

In [32]:
# Load the tokenizer from the project-configured path; it is passed to both the sliding-window
# step and the ensemble inference below.
tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
# Map each fold's "best" checkpoint to its ensemble weight. Every fold gets the same weight,
# 1 / n_folds, so the weights sum to one.
root_model_path = "output/microsoft/deberta-v3-large"
fold_weight = 1 / config.n_folds

model_paths = {}
for fold in range(config.n_folds):
    checkpoint_path = os.path.join(root_model_path, f"microsoft_deberta-v3-large_fold_{fold}_best.pth")
    model_paths[checkpoint_path] = fold_weight

model_paths

{'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_0_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_1_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_2_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_3_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_4_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_5_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-large/microsoft_deberta-v3-large_fold_6_best.pth': 0.14285714285714285}

In [34]:
# Run inference on the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [35]:
# Prepare the inference input by running the project's `sliding_window` helper over a deep copy
# of the training data with the tokenizer. Presumably long essays are split into overlapping
# token windows - confirm in lib.data_tools.data.
# NOTE(review): `df` is a very generic name for a frame that is only consumed by the
# `ensemble_inference` call below; consider a more descriptive name.
df = sliding_window(train_df.copy(deep=True), tokenizer)

  0%|          | 0/17307 [00:00<?, ?it/s]

100%|██████████| 17307/17307 [00:35<00:00, 494.31it/s]


In [36]:
# Run every fold checkpoint in `model_paths` over the windowed essays and collect the outputs
# as features. With `model_wise_reduction=False` the result keeps separate columns per model
# (`deberta_m{model}_c{class}` in the sample below) plus `essay_id`, one row per essay.
# This is the slow step of the notebook: the stored logs show about 14 minutes per model.
# NOTE(review): in the sampled output each model's six values sum to about 0.1429 (= 1/7),
# so they look like class probabilities already scaled by the ensemble weight rather than
# raw logits - confirm what `logits=True` and `overall=True` mean in `ensemble_inference`.
deberta_features = ensemble_inference(
    df,
    tokenizer,
    model_paths,
    device,
    logits=True,
    overall=True,
    model_wise_reduction=False,
)
deberta_features.shape

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 0 Inference: 100%|██████████| 779/779 [14:19<00:00,  1.10s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 1 Inference: 100%|██████████| 779/779 [14:21<00:00,  1.11s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 2 Inference: 100%|██████████| 779/779 [14:20<00:00,  1.11s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 3 Inference: 100%|██████████| 779/779 [14:21<00:00,  1.11s/test_batch]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 4 Inference: 100%|██████████| 779/779 [14:21<00:00,  1.11s/test_batch]
Special tokens have been 

(17307, 43)

In [37]:
# Spot-check three random essays from the DeBERTa feature table.
deberta_features.sample(3)

Unnamed: 0,essay_id,deberta_m0_c0,deberta_m0_c1,deberta_m0_c2,deberta_m0_c3,deberta_m0_c4,deberta_m0_c5,deberta_m1_c0,deberta_m1_c1,deberta_m1_c2,...,deberta_m5_c2,deberta_m5_c3,deberta_m5_c4,deberta_m5_c5,deberta_m6_c0,deberta_m6_c1,deberta_m6_c2,deberta_m6_c3,deberta_m6_c4,deberta_m6_c5
1328,14c4255,8.4e-05,0.001594,0.073336,0.065685,0.002018,0.00014,0.000109,0.000935,0.027895,...,0.019551,0.099968,0.021904,0.000688,5.5e-05,0.000137,0.004116,0.086218,0.050099,0.002233
13892,cbe244c,0.088357,0.052156,0.002095,0.000125,7.2e-05,5.1e-05,0.060156,0.08124,0.001273,...,0.003556,0.000264,0.000101,0.000115,0.115403,0.026325,0.000871,9.3e-05,5.6e-05,0.000108
11404,a7e1ea2,0.003894,0.062318,0.07431,0.002211,5.8e-05,6.6e-05,0.013737,0.111881,0.016601,...,0.029885,0.002686,0.000244,0.000149,0.009005,0.051844,0.076255,0.005413,0.000248,9.2e-05


In [38]:
# The DeBERTa table must hold exactly one row per essay and cover the same ids as the training
# data; comparing only the number of distinct ids would miss both problems.
assert deberta_features.essay_id.is_unique, "deberta_features has more than one row per essay_id"
assert set(deberta_features.essay_id) == set(train_df.essay_id.unique()), (
    "deberta_features and train_df do not cover the same essay_ids"
)

#### Combining Features

In [40]:
# Start from the DeBERTa columns and attach every other feature table on the essay id.
all_features = deberta_features.copy(deep=True)

# Each table is expected to hold exactly one row per essay. `validate="one_to_one"` makes
# pandas raise a MergeError instead of silently multiplying rows if a table ever contains
# duplicate ids. `how="inner"` is the pandas default, spelled out here on purpose.
# NOTE(review): feature names that appear in more than one table would silently receive
# `_x` / `_y` suffixes - presumably the helpers use distinct prefixes; confirm.
for feature_df in [paragraph_features, sentence_features, word_features, tfidf_features, count_features]:
    all_features = pd.merge(all_features, feature_df, on="essay_id", how="inner", validate="one_to_one")

# Attach the target column last so it ends up at the end of the frame.
all_features = pd.merge(
    all_features,
    train_df[["essay_id", "score"]],
    on="essay_id",
    how="inner",
    validate="one_to_one",
)

# An inner join drops essays that are missing from any table; fail loudly instead of writing
# a silently shortened training file.
assert len(all_features) == len(train_df), "essays were lost while merging the feature tables"
all_features.shape

(17307, 22088)

In [41]:
# The merged table must hold exactly one row per essay and cover the same ids as the training
# data; comparing only the number of distinct ids would miss both problems.
assert all_features.essay_id.is_unique, "all_features has more than one row per essay_id"
assert set(all_features.essay_id) == set(train_df.essay_id.unique()), (
    "all_features and train_df do not cover the same essay_ids"
)

In [42]:
# Write the combined feature table (features plus `score`) to the project-configured CSV path,
# without the pandas index.
# NOTE(review): the frame is roughly 17k rows by 22k columns, so the CSV will be large and slow
# to re-read; a typed binary format such as parquet may be worth considering if the downstream
# consumers can be changed together with this cell.
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)