# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Move to the project root so that the `lib` package imported in the next cell
# and the relative "output/..." paths used further down resolve.
# Guarded on the presence of `lib` so that re-running this cell does not climb
# two more directories each time it is executed.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)

In [4]:
# Seed randomness through the project helper, called with its default
# arguments. Presumably this seeds Python/NumPy so that the `.sample(...)`
# previews below are repeatable — confirm in lib.utils.utils.
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Load the competition training set from the path configured in Paths.
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
# Last expression of the cell: displays (rows, columns).
train_df.shape

(17307, 3)

### Feature Engineering

#### Paragraph Level

In [6]:
# Build the paragraph-level table from the training essays. The recorded
# output has more rows than train_df, i.e. several rows per essay.
paragraph_features = process_paragraph(train_df)
paragraph_features.shape

(85934, 9)

In [7]:
# Preview three randomly chosen rows of the paragraph-level table.
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
12725,bbc0087,The development of driverless cars should be a...,4,"in just a few years, with manufacturers making...",in just a few years with manufacturers making ...,0,150,23,1
5194,4d8bb17,When you say that the face on mars is an alien...,3,when you say that the face on mars is an alien...,when you say that the face on mars is an alien...,1,244,55,5
13746,c9d4074,Driverless cars are quickly becoming a reality...,4,the focus should be on teaching people to driv...,the focus should be on teaching people to driv...,0,148,27,1


In [8]:
# The paragraph-level table must contain as many distinct essay ids as train_df.
assert len(paragraph_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [9]:
# Turn the paragraph-level rows into engineered features; the recorded output
# has one row per essay (same row count as train_df).
# NOTE(review): this rebinds `paragraph_features` to the helper's output, so
# re-running only this cell feeds the already-engineered table back in.
# Re-run the process_paragraph cell first when iterating.
paragraph_features = paragraph_feature_engineering(paragraph_features)
paragraph_features.shape

  lambda x: kurtosis(x),


(17307, 129)

In [10]:
# Preview three randomly chosen rows of the engineered paragraph features.
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_error_count_len<30,paragraph_error_count_len>=30,paragraph_error_count_len<35,paragraph_error_count_len>=35,paragraph_error_count_len<40,paragraph_error_count_len>=40,paragraph_error_count_len<45,paragraph_error_count_len>=45,paragraph_error_count_len<50,paragraph_error_count_len>=50
10716,9e219bb,1.625,0,7,13,0,0,215.5,17,422,...,True,False,True,False,True,False,True,False,True,False
16341,f114047,7.5,1,13,45,1,5,416.666667,128,699,...,True,False,True,False,True,False,True,False,True,False
9190,87c531e,2.833333,0,7,17,1,2,416.166667,44,1093,...,True,False,True,False,True,False,True,False,True,False


In [11]:
# After feature engineering the number of distinct essay ids must still match train_df.
assert len(paragraph_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

#### Sentence Level

In [12]:
# Build the sentence-level table from the training essays. The recorded
# output has many more rows than train_df, i.e. several rows per essay.
sentence_features = process_sentence(train_df)
sentence_features.shape

(330422, 9)

In [13]:
# The sentence-level table must contain as many distinct essay ids as train_df.
assert len(sentence_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [14]:
# Turn the sentence-level rows into engineered features, then preview three
# random rows of the result.
# NOTE(review): this rebinds `sentence_features` to the helper's output, so
# re-running only this cell feeds the already-engineered table back in.
# Re-run the process_sentence cell first when iterating.
sentence_features = sentence_feature_engineering(sentence_features)
sentence_features.sample(3)

  lambda x: kurtosis(x),


Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_error_count_len<2,sentence_error_count_len>=2,sentence_error_count_len<4,sentence_error_count_len>=4,sentence_error_count_len<6,sentence_error_count_len>=6,sentence_error_count_len<8,sentence_error_count_len>=8,sentence_error_count_len<10,sentence_error_count_len>=10
10166,9612d64,0.214286,0,2,6,0,0,89.107143,18,146,...,False,True,False,True,False,True,False,True,False,True
10215,96c77d6,0.103448,0,1,3,0,0,96.482759,39,189,...,False,True,False,True,False,True,False,True,False,True
3377,3217d6b,0.444444,0,2,8,1,0,76.777778,18,152,...,False,True,False,True,False,True,False,True,False,True


In [15]:
# Display (rows, columns) of the engineered sentence features.
sentence_features.shape

(17307, 85)

In [16]:
# After feature engineering the number of distinct essay ids must still match train_df.
assert len(sentence_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

#### Word Level

In [17]:
# Build the word-level table from the training essays. The recorded output
# has millions of rows, i.e. many rows per essay.
word_features = process_word(train_df)
word_features.shape

(6350538, 7)

In [18]:
# The word-level table must contain as many distinct essay ids as train_df.
assert len(word_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [19]:
# Turn the word-level rows into engineered features; the recorded output has
# one row per essay (same row count as train_df).
# NOTE(review): this rebinds `word_features` to the helper's output, so
# re-running only this cell feeds the already-engineered table back in.
# Re-run the process_word cell first when iterating.
word_features = word_feature_engineering(word_features)
word_features.shape

(17307, 33)

In [20]:
# After feature engineering the number of distinct essay ids must still match train_df.
assert len(word_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [21]:
# Build the TF-IDF features for the training essays; the helper returns the
# vectorizer together with the feature table.
# NOTE(review): the second argument is passed as None — presumably meaning
# "no existing vectorizer, fit a new one"; confirm against
# generate_tfidf_features in lib.data_tools.feature_engineering.
vectorizer, tfidf_features = generate_tfidf_features(train_df, None)

In [22]:
# The TF-IDF table must contain as many distinct essay ids as train_df.
assert len(tfidf_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [23]:
# Persist the vectorizer returned by generate_tfidf_features so it can be
# reloaded later without rebuilding it.
# The output directory is created first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout.
os.makedirs("output/LGBM", exist_ok=True)
with open("output/LGBM/vectorizer.pkl", "wb") as file:
    pkl.dump(vectorizer, file)

#### Count Vectorizer

A `CountVectorizer` is used to convert the essays into numerical features.

In [24]:
# Build the count-based features for the training essays; the helper returns
# the vectorizer together with the feature table.
vectorizer_cnt, count_features = generate_count_features(train_df)

In [25]:
# The count-feature table must contain as many distinct essay ids as train_df.
assert len(count_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [26]:
# Persist the vectorizer returned by generate_count_features so it can be
# reloaded later without rebuilding it.
# The output directory is created first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout.
os.makedirs("output/LGBM", exist_ok=True)
with open("output/LGBM/vectorizer_cnt.pkl", "wb") as file:
    pkl.dump(vectorizer_cnt, file)

#### DeBERTa Predictions

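Predictions made using DeBERTa models. This step is currently disabled: the cells below are commented out, and no DeBERTa features are merged into the final feature table.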

In [27]:
# Disabled: the DeBERTa prediction file is not loaded, and `deberta_features`
# is not among the tables merged in the "Combining Features" cell.
# Kept for reference only.
# oov_path = "output/microsoft/deberta-v3-xsmall/oof_df.csv"
# deberta_features = pd.read_csv(oov_path, usecols=[f"score_prob_{i}" for i in range(config.num_classes)] + ["essay_id"])
# deberta_features.shape

In [28]:
# deberta_features.sample(3)

#### Combining Features

In [29]:
# Combine every per-essay feature table into one frame keyed by essay_id.
# pd.merge never mutates its inputs and always returns a new frame, so no
# defensive copy of paragraph_features is needed.
all_features = paragraph_features

# Each table is expected to hold exactly one row per essay. `validate` makes
# pandas raise MergeError if essay_id is duplicated on either side, instead of
# silently multiplying rows. `how="inner"` is the pd.merge default, spelled
# out here because ids missing from any table are dropped from the result.
for feature_df in [sentence_features, word_features, tfidf_features, count_features]:
    all_features = pd.merge(
        all_features,
        feature_df,
        on="essay_id",
        how="inner",
        validate="one_to_one",
    )

# Attach the target column last.
all_features = pd.merge(
    all_features,
    train_df[["essay_id", "score"]],
    on="essay_id",
    how="inner",
    validate="one_to_one",
)
all_features.shape

(17307, 22043)

In [30]:
# The merged table must contain as many distinct essay ids as train_df.
assert len(all_features["essay_id"].unique()) == len(
    train_df["essay_id"].unique()
)

In [31]:
# Write the combined per-essay table (all features plus `score`) to the CSV
# path configured in Paths, omitting the DataFrame index.
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)