# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)

In [4]:
seed_everything()

## Data Preparation

### Data Loading

In [5]:
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)

### Feature Engineerings

#### Paragraph Level

In [6]:
paragraph_features = process_paragraph(train_df)
paragraph_features.shape

In [None]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
12725,bbc0087,The development of driverless cars should be a...,4,"in just a few years, with manufacturers making...",in just a few years with manufacturers making ...,0,150,23,1
5194,4d8bb17,When you say that the face on mars is an alien...,3,when you say that the face on mars is an alien...,when you say that the face on mars is an alien...,1,244,55,5
13746,c9d4074,Driverless cars are quickly becoming a reality...,4,the focus should be on teaching people to driv...,the focus should be on teaching people to driv...,0,148,27,1


In [None]:
paragraph_features = paragraph_feature_engineering(paragraph_features)
paragraph_features.shape

  lambda x: kurtosis(x),


TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

In [None]:
paragraph_features.columns

Index(['essay_id', 'paragraph_error_count_mean', 'paragraph_error_count_min',
       'paragraph_error_count_max', 'paragraph_error_count_sum',
       'paragraph_error_count_first', 'paragraph_error_count_last',
       'paragraph_char_count_mean', 'paragraph_char_count_min',
       'paragraph_char_count_max',
       ...
       'paragraph_sentence_count_len<650', 'paragraph_sentence_count_len>=650',
       'paragraph_error_count_len<700', 'paragraph_error_count_len>=700',
       'paragraph_char_count_len<700', 'paragraph_char_count_len>=700',
       'paragraph_word_count_len<700', 'paragraph_word_count_len>=700',
       'paragraph_sentence_count_len<700',
       'paragraph_sentence_count_len>=700'],
      dtype='object', length=149)

In [None]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_sentence_count_len<650,paragraph_sentence_count_len>=650,paragraph_error_count_len<700,paragraph_error_count_len>=700,paragraph_char_count_len<700,paragraph_char_count_len>=700,paragraph_word_count_len<700,paragraph_word_count_len>=700,paragraph_sentence_count_len<700,paragraph_sentence_count_len>=700
643,0a50c61,1.5,0.0,5.0,12.0,1.0,0.0,438.625,67.0,707.0,...,,,,,,,,,,
15284,e1554c4,6.0,5.0,7.0,18.0,5.0,7.0,748.0,360.0,1019.0,...,,,,,,,,,,
1464,16d8683,0.5,0.0,1.0,2.0,1.0,1.0,321.25,192.0,409.0,...,,,,,,,,,,


#### Sentence Level

In [None]:
sentence_features = process_sentence(train_df)
sentence_features.shape

In [None]:
sentence_features = sentence_feature_engineering(sentence_features)
sentence_features.sample(3)

  lambda x: kurtosis(x),


Unnamed: 0,index,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_char_count_len<275,sentence_char_count_len>=275,sentence_word_count_len<275,sentence_word_count_len>=275,sentence_error_count_len<300,sentence_error_count_len>=300,sentence_char_count_len<300,sentence_char_count_len>=300,sentence_word_count_len<300,sentence_word_count_len>=300
118,0226250,0.823529,0.0,3.0,14.0,0.0,0.0,129.235294,37.0,284.0,...,,,,,,,,,,
5312,4f57f96,0.6,0.0,2.0,6.0,0.0,1.0,147.2,76.0,237.0,...,,,,,,,,,,
3516,340e8e8,0.307692,0.0,2.0,4.0,0.0,0.0,106.076923,47.0,218.0,...,,,,,,,,,,


In [None]:
sentence_features.shape

(34614, 100)

#### Word Level

In [None]:
word_features = process_word(train_df)
word_features.shape

(6350538, 7)

In [None]:
word_features = word_feature_engineering(word_features)
word_features.shape

(34614, 33)

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [None]:
vectorizer, tfidf_features = generate_tfidf_features(train_df, None)

In [None]:
with open("output/LGBM/vectorizer.pkl", "wb") as file:
    pkl.dump(vectorizer, file)

#### Count Vectorizer

A countVectorizer is used to convert the essays into numerical features

In [None]:
vectorizer_cnt, count_features = generate_count_features(train_df)

In [None]:
with open("output/LGBM/vectorizer_cnt.pkl", "wb") as file:
    pkl.dump(vectorizer_cnt, file)

#### DeBERTA Predictions

Predictions made using DeBERTA models

In [None]:
oov_path = "output/microsoft/deberta-v3-xsmall/oof_df.csv"
deberta_features = pd.read_csv(oov_path, usecols=[f"score_prob_{i}" for i in range(config.num_classes)] + ["essay_id"])
deberta_features.shape

FileNotFoundError: [Errno 2] No such file or directory: 'output/microsoft/deberta-v3-xsmall/oof_df.csv'

In [None]:
deberta_features.sample(3)

Unnamed: 0,essay_id,score_prob_0,score_prob_1,score_prob_2,score_prob_3,score_prob_4,score_prob_5
9685,86cc1c1,0.030715,0.308556,0.614073,0.043541,0.002481,0.000634
5966,812bb51,0.02778,0.509559,0.440155,0.016868,0.003152,0.002486
23306,d7d09c0,0.089226,0.771728,0.124473,0.00989,0.002669,0.002014


#### Combining Features

In [None]:
all_features = paragraph_features.copy()

# Merge using essay_id column
for feature_df in [sentence_features, word_features, tfidf_features, count_features]:
    all_features = pd.merge(all_features, feature_df, on="essay_id")

all_features.shape

KeyError: 'essay_id'

In [None]:
"essay_id" in sentence_features.columns

False

In [None]:
all_features.drop_duplicates(subset="essay_id", inplace=True)

In [None]:
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)