# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Switch to the project root (two levels up from this notebook) so that the `lib`
# package imported below and relative paths such as "output/LGBM/..." resolve.
# Guarded on the presence of `lib` so that re-running this cell does not climb
# two more directories each time.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)

In [4]:
# Seed via the project helper, called with its defaults. Presumably this fixes the
# Python/NumPy random seeds so the unseeded `.sample(3)` previews below are
# repeatable — TODO confirm against lib.utils.utils.seed_everything.
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Load the competition training set; the path comes from the project's Paths config.
# NOTE(review): the saved output shows only 10 rows, which looks like a sample/debug
# subset — confirm the path points at the full training file before using the
# exported features for training.
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
# Display (rows, columns) as a quick sanity check of what was loaded.
train_df.shape

(10, 3)

### Feature Engineering

#### Paragraph Level

In [6]:
# Expand the essays into paragraph-level rows with per-paragraph counts (judging by
# the saved output: one row per paragraph with error/char/word/sentence counts).
# A deep copy is passed so the helper cannot modify train_df in place.
paragraph_features = process_paragraph(train_df.copy(deep=True))
# Display (rows, columns) of the paragraph-level frame.
paragraph_features.shape

(69, 9)

In [7]:
# Random 3-row preview of the paragraph-level rows. No random_state is passed, so the
# rows shown depend on the global NumPy random state.
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
7,0033bf4,What is the Seagoing Cowboys progam?\n\nIt was...,3,"the article states, ""besides helping people, i...",the article states besides helping people i ha...,0,91,15,1
8,0036253,The challenge of exploring Venus\n\nThis stori...,2,the challenge of exploring venus,the challenge of exploring venus,0,32,5,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,"( ) ""a thick atmosphere of almost percent carb...",a thick atmosphere of almost percent carbon ...,4,805,129,7


In [8]:
# Every essay must survive paragraph splitting. Compare the actual id sets rather than
# only the number of distinct ids, so a frame with the right count but different ids
# is also caught.
assert set(paragraph_features["essay_id"]) == set(train_df["essay_id"]), (
    "paragraph-level rows and train_df cover different essay_ids"
)

In [9]:
# Aggregate the paragraph-level rows into per-essay features (the saved output shows
# one row per essay afterwards).
# NOTE: this rebinds `paragraph_features`, so the cell is not safe to re-run on its
# own — a second run would pass the already-aggregated frame back into the helper.
# Re-run from the process_paragraph cell instead.
paragraph_features = paragraph_feature_engineering(paragraph_features)
# Display (rows, columns) of the aggregated frame.
paragraph_features.shape

(10, 125)

In [10]:
# List the names of the aggregated paragraph features for inspection.
paragraph_features.columns.to_list()

['essay_id',
 'paragraph_error_count_mean',
 'paragraph_error_count_min',
 'paragraph_error_count_max',
 'paragraph_error_count_sum',
 'paragraph_error_count_first',
 'paragraph_error_count_last',
 'paragraph_char_count_mean',
 'paragraph_char_count_min',
 'paragraph_char_count_max',
 'paragraph_char_count_sum',
 'paragraph_char_count_first',
 'paragraph_char_count_last',
 'paragraph_word_count_mean',
 'paragraph_word_count_min',
 'paragraph_word_count_max',
 'paragraph_word_count_sum',
 'paragraph_word_count_first',
 'paragraph_word_count_last',
 'paragraph_sentence_count_mean',
 'paragraph_sentence_count_min',
 'paragraph_sentence_count_max',
 'paragraph_sentence_count_sum',
 'paragraph_sentence_count_first',
 'paragraph_sentence_count_last',
 'paragraph_error_count_q1',
 'paragraph_error_count_q3',
 'paragraph_error_count_kurtosis',
 'paragraph_char_count_q1',
 'paragraph_char_count_q3',
 'paragraph_char_count_kurtosis',
 'paragraph_word_count_q1',
 'paragraph_word_count_q3',
 'para

In [11]:
# Random 3-row preview of the per-essay paragraph features. No random_state is passed,
# so the rows shown depend on the global NumPy random state.
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_error_count_len_l_30,paragraph_error_count_len_goe_30,paragraph_error_count_len_l_35,paragraph_error_count_len_goe_35,paragraph_error_count_len_l_40,paragraph_error_count_len_goe_40,paragraph_error_count_len_l_45,paragraph_error_count_len_goe_45,paragraph_error_count_len_l_50,paragraph_error_count_len_goe_50
8,0036253,3.833333,0,11,23,0,1,308.666667,32,517,...,True,False,True,False,True,False,True,False,True,False
4,002ba53,2.333333,0,5,14,0,1,363.166667,17,689,...,True,False,True,False,True,False,True,False,True,False
0,000d118,27.0,27,27,27,27,27,2640.0,2640,2640,...,True,False,True,False,True,False,True,False,True,False


In [12]:
# Aggregation must keep exactly the essays present in train_df. Compare the id sets
# rather than only the number of distinct ids, so swapped or altered ids are caught.
assert set(paragraph_features["essay_id"]) == set(train_df["essay_id"]), (
    "paragraph_features and train_df cover different essay_ids"
)

#### Sentence Level

In [13]:
# Expand the essays into sentence-level rows with per-sentence counts (see the column
# list printed below). A deep copy is passed so the helper cannot modify train_df
# in place.
sentence_features = process_sentence(train_df.copy(deep=True))
# Display (rows, columns) of the sentence-level frame.
sentence_features.shape

(191, 8)

In [14]:
# Every essay must survive sentence splitting. Compare the actual id sets rather than
# only the number of distinct ids, so a frame with the right count but different ids
# is also caught.
assert set(sentence_features["essay_id"]) == set(train_df["essay_id"]), (
    "sentence-level rows and train_df cover different essay_ids"
)

In [15]:
# List the columns of the sentence-level frame before aggregation.
sentence_features.columns.to_list()

['essay_id',
 'full_text',
 'score',
 'sentence',
 'sentence_no_punctuation',
 'sentence_error_count',
 'sentence_char_count',
 'sentence_word_count']

In [16]:
# Aggregate the sentence-level rows into per-essay features (the saved output shows
# one row per essay afterwards).
# NOTE: this rebinds `sentence_features`, so the cell is not safe to re-run on its
# own — a second run would pass the already-aggregated frame back into the helper.
# Re-run from the process_sentence cell instead.
sentence_features = sentence_feature_engineering(sentence_features)
# Display (rows, columns) of the aggregated frame.
sentence_features.shape

(10, 92)

In [17]:
# Random 3-row preview of the per-essay sentence features. No random_state is passed,
# so the rows shown depend on the global NumPy random state.
sentence_features.sample(3)

Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_error_count_len_l_30,sentence_error_count_len_goe_30,sentence_error_count_len_l_35,sentence_error_count_len_goe_35,sentence_error_count_len_l_40,sentence_error_count_len_goe_40,sentence_error_count_len_l_45,sentence_error_count_len_goe_45,sentence_error_count_len_l_50,sentence_error_count_len_goe_50
2,001ab80,0.208333,0,1,5,0,1,126.75,58,237,...,True,False,True,False,True,False,True,False,True,False
1,000fe60,0.238095,0,1,5,0,0,78.238095,8,250,...,True,False,True,False,True,False,True,False,True,False
8,0036253,1.277778,0,6,23,2,0,102.222222,54,239,...,True,False,True,False,True,False,True,False,True,False


In [18]:
# Aggregation must keep exactly the essays present in train_df. Compare the id sets
# rather than only the number of distinct ids, so swapped or altered ids are caught.
assert set(sentence_features["essay_id"]) == set(train_df["essay_id"]), (
    "sentence_features and train_df cover different essay_ids"
)

In [19]:
# List the names of the aggregated sentence features for inspection.
sentence_features.columns.to_list()

['essay_id',
 'sentence_error_count_mean',
 'sentence_error_count_min',
 'sentence_error_count_max',
 'sentence_error_count_sum',
 'sentence_error_count_first',
 'sentence_error_count_last',
 'sentence_char_count_mean',
 'sentence_char_count_min',
 'sentence_char_count_max',
 'sentence_char_count_sum',
 'sentence_char_count_first',
 'sentence_char_count_last',
 'sentence_word_count_mean',
 'sentence_word_count_min',
 'sentence_word_count_max',
 'sentence_word_count_sum',
 'sentence_word_count_first',
 'sentence_word_count_last',
 'sentence_error_count_q1',
 'sentence_error_count_q3',
 'sentence_error_count_kurtosis',
 'sentence_char_count_q1',
 'sentence_char_count_q3',
 'sentence_char_count_kurtosis',
 'sentence_word_count_q1',
 'sentence_word_count_q3',
 'sentence_word_count_kurtosis',
 'sentence_char_count_len_l_25',
 'sentence_char_count_len_goe_25',
 'sentence_char_count_len_l_50',
 'sentence_char_count_len_goe_50',
 'sentence_char_count_len_l_75',
 'sentence_char_count_len_goe_75

#### Word Level

In [20]:
# Expand the essays into word-level rows (the column list printed below shows a `word`
# column and its character count). A deep copy is passed so the helper cannot modify
# train_df in place.
word_features = process_word(train_df.copy(deep=True))
# Display (rows, columns) of the word-level frame.
word_features.shape

(3708, 5)

In [21]:
# Every essay must survive word splitting. Compare the actual id sets rather than
# only the number of distinct ids, so a frame with the right count but different ids
# is also caught.
assert set(word_features["essay_id"]) == set(train_df["essay_id"]), (
    "word-level rows and train_df cover different essay_ids"
)

In [22]:
# List the columns of the word-level frame before aggregation.
word_features.columns.to_list()

['essay_id', 'full_text', 'score', 'word', 'word_char_count']

In [23]:
# Aggregate the word-level rows into per-essay features (the saved output shows one
# row per essay afterwards).
# NOTE: this rebinds `word_features`, so the cell is not safe to re-run on its own —
# a second run would pass the already-aggregated frame back into the helper.
# Re-run from the process_word cell instead.
word_features = word_feature_engineering(word_features)
# Display (rows, columns) of the aggregated frame.
word_features.shape

(10, 33)

In [24]:
# Aggregation must keep exactly the essays present in train_df. Compare the id sets
# rather than only the number of distinct ids, so swapped or altered ids are caught.
assert set(word_features["essay_id"]) == set(train_df["essay_id"]), (
    "word_features and train_df cover different essay_ids"
)

In [25]:
# List the names of the aggregated word features for inspection.
# NOTE(review): the saved output contains 'word_char_count_<lambda_2>', while the
# paragraph and sentence frames have no such lambda-named column (they list a
# '*_kurtosis' column after q1/q3). This suggests an unnamed lambda aggregation inside
# word_feature_engineering — confirm and give it an explicit name there.
word_features.columns.to_list()

['essay_id',
 'word_char_count_mean',
 'word_char_count_min',
 'word_char_count_max',
 'word_char_count_q1',
 'word_char_count_q3',
 'word_char_count_<lambda_2>',
 'word_char_count_len_l_5',
 'word_char_count_len_goe_5',
 'word_char_count_len_l_7',
 'word_char_count_len_goe_7',
 'word_char_count_len_l_9',
 'word_char_count_len_goe_9',
 'word_char_count_len_l_11',
 'word_char_count_len_goe_11',
 'word_char_count_len_l_13',
 'word_char_count_len_goe_13',
 'word_char_count_len_l_15',
 'word_char_count_len_goe_15',
 'word_char_count_len_l_17',
 'word_char_count_len_goe_17',
 'word_char_count_len_l_19',
 'word_char_count_len_goe_19',
 'word_char_count_len_l_21',
 'word_char_count_len_goe_21',
 'word_char_count_len_l_23',
 'word_char_count_len_goe_23',
 'word_char_count_len_l_25',
 'word_char_count_len_goe_25',
 'word_char_count_len_l_27',
 'word_char_count_len_goe_27',
 'word_char_count_len_l_29',
 'word_char_count_len_goe_29']

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [28]:
# Build TF-IDF features for the essays; the helper returns a (vectorizer, features)
# pair, and the features frame carries an `essay_id` column (checked in the next cell).
# `None` is passed as the second argument — presumably "no pre-fitted vectorizer, fit a
# new one"; TODO confirm against generate_tfidf_features.
# A deep copy is passed so the helper cannot modify train_df in place.
vectorizer, tfidf_features = generate_tfidf_features(train_df.copy(deep=True), None)

In [29]:
# The TF-IDF frame must cover exactly the essays present in train_df. Compare the id
# sets rather than only the number of distinct ids, so swapped or altered ids are caught.
assert set(tfidf_features["essay_id"]) == set(train_df["essay_id"]), (
    "tfidf_features and train_df cover different essay_ids"
)

In [30]:
# Persist the fitted TF-IDF vectorizer for later reuse.
# Create the output directory first: open(..., "wb") raises FileNotFoundError when
# "output/LGBM" does not exist yet (e.g. on a fresh checkout).
os.makedirs("output/LGBM", exist_ok=True)
with open("output/LGBM/vectorizer.pkl", "wb") as file:
    pkl.dump(vectorizer, file)

#### Count Vectorizer

A `CountVectorizer` is used to convert the essays into numerical features.

In [32]:
# Build count-vectorizer features for the essays; the helper returns a
# (vectorizer, features) pair, and the features frame carries an `essay_id` column
# (checked in the next cell).
# `None` is passed as the second argument — presumably "no pre-fitted vectorizer, fit a
# new one"; TODO confirm against generate_count_features.
# A deep copy is passed so the helper cannot modify train_df in place.
vectorizer_cnt, count_features = generate_count_features(train_df.copy(deep=True), None)

In [33]:
# The count-feature frame must cover exactly the essays present in train_df. Compare the
# id sets rather than only the number of distinct ids, so swapped or altered ids are caught.
assert set(count_features["essay_id"]) == set(train_df["essay_id"]), (
    "count_features and train_df cover different essay_ids"
)

In [34]:
# Persist the fitted count vectorizer for later reuse.
# Create the output directory first: open(..., "wb") raises FileNotFoundError when
# "output/LGBM" does not exist yet (e.g. on a fresh checkout).
os.makedirs("output/LGBM", exist_ok=True)
with open("output/LGBM/vectorizer_cnt.pkl", "wb") as file:
    pkl.dump(vectorizer_cnt, file)

#### DeBERTa Predictions

Predictions made using DeBERTa models. This step is currently disabled: the cells below are commented out.

In [35]:
# NOTE: disabled. The DeBERTa per-class probabilities are not loaded, and
# `deberta_features` is not in the list of frames merged under "Combining Features".
# To re-enable, uncomment the lines below and add `deberta_features` to that merge list.
# oov_path = "output/microsoft/deberta-v3-xsmall/oof_df.csv"
# deberta_features = pd.read_csv(oov_path, usecols=[f"score_prob_{i}" for i in range(config.num_classes)] + ["essay_id"])
# deberta_features.shape

In [36]:
# Disabled together with the DeBERTa loading cell (nothing to preview while it is off).
# deberta_features.sample(3)

#### Combining Features

In [37]:
# Start from the per-essay paragraph features and attach every other feature family.
all_features = paragraph_features.copy()

# Merge on essay_id (inner join, the pandas default). Each frame is expected to hold
# exactly one row per essay_id; validate="one_to_one" makes pandas raise MergeError on
# duplicated keys instead of silently multiplying rows, which a check on the distinct
# essay_ids alone would not notice.
for feature_df in [sentence_features, word_features, tfidf_features, count_features]:
    all_features = all_features.merge(feature_df, on="essay_id", validate="one_to_one")

# Attach the target column last.
all_features = all_features.merge(
    train_df[["essay_id", "score"]], on="essay_id", validate="one_to_one"
)
# Display (rows, columns) of the combined table.
all_features.shape

(10, 34807)

In [38]:
# The inner joins must not have dropped any essay. Compare the id sets rather than only
# the number of distinct ids, so swapped or altered ids are caught as well.
assert set(all_features["essay_id"]) == set(train_df["essay_id"]), (
    "all_features and train_df cover different essay_ids"
)

In [None]:
# Persist the combined table (features plus `score`) to the path configured in Paths,
# without writing the DataFrame index.
# NOTE(review): this cell has no execution count in the saved notebook, so the CSV on
# disk may not correspond to the outputs shown above — re-run to regenerate it.
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)