# LGBM - Data Preparation

## Setup

In [24]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)

In [4]:
seed_everything()

## Data Preparation

### Data Loading

In [5]:
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)

### Feature Engineerings

#### Paragraph Level

In [6]:
paragraph_features = process_paragraph(train_df)
paragraph_features.shape

(85934, 9)

In [7]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
12725,bbc0087,The development of driverless cars should be a...,4,"in just a few years, with manufacturers making...",in just a few years with manufacturers making ...,0,150,23,1
5194,4d8bb17,When you say that the face on mars is an alien...,3,when you say that the face on mars is an alien...,when you say that the face on mars is an alien...,1,244,55,5
13746,c9d4074,Driverless cars are quickly becoming a reality...,4,the focus should be on teaching people to driv...,the focus should be on teaching people to driv...,0,148,27,1


In [8]:
paragraph_features = paragraph_feature_engineering(paragraph_features)
paragraph_features.shape

(17307, 33)

In [9]:
paragraph_features.columns

Index(['essay_id', 'paragraph_error_count_mean', 'paragraph_error_count_min',
       'paragraph_error_count_max', 'paragraph_error_count_sum',
       'paragraph_error_count_first', 'paragraph_error_count_last',
       'paragraph_char_count_mean', 'paragraph_char_count_min',
       'paragraph_char_count_max', 'paragraph_char_count_sum',
       'paragraph_char_count_first', 'paragraph_char_count_last',
       'paragraph_word_count_mean', 'paragraph_word_count_min',
       'paragraph_word_count_max', 'paragraph_word_count_sum',
       'paragraph_word_count_first', 'paragraph_word_count_last',
       'paragraph_sentence_count_mean', 'paragraph_sentence_count_min',
       'paragraph_sentence_count_max', 'paragraph_sentence_count_sum',
       'paragraph_sentence_count_first', 'paragraph_sentence_count_last',
       'paragraph_error_count_q1', 'paragraph_error_count_q3',
       'paragraph_char_count_q1', 'paragraph_char_count_q3',
       'paragraph_word_count_q1', 'paragraph_word_count_q3',
 

In [10]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_sentence_count_first,paragraph_sentence_count_last,paragraph_error_count_q1,paragraph_error_count_q3,paragraph_char_count_q1,paragraph_char_count_q3,paragraph_word_count_q1,paragraph_word_count_q3,paragraph_sentence_count_q1,paragraph_sentence_count_q3
10716,9e219bb,1.625,0,7,13,0,0,215.5,17,422,...,0,1,0.0,1.75,120.5,285.25,22.75,47.75,1.0,3.5
16341,f114047,7.5,1,13,45,1,5,416.666667,128,699,...,1,2,5.0,11.25,224.75,599.0,38.75,101.25,2.5,5.5
9190,87c531e,2.833333,0,7,17,1,2,416.166667,44,1093,...,2,1,1.0,5.0,222.25,522.25,41.25,95.25,1.25,3.5


#### Sentence Level

In [11]:
sentence_features = process_sentence(train_df)
sentence_features.shape

(330422, 9)

In [12]:
sentence_features = sentence_feature_engineering(sentence_features)
sentence_features.sample(3)

Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_word_count_max,sentence_word_count_sum,sentence_word_count_first,sentence_word_count_last,sentence_error_count_q1,sentence_error_count_q3,sentence_char_count_q1,sentence_char_count_q3,sentence_word_count_q1,sentence_word_count_q3
10166,9612d64,0.214286,0,2,6,0,0,89.107143,18,146,...,29,490,4,17,0.0,0.0,72.25,105.5,15.75,20.0
10215,96c77d6,0.103448,0,1,3,0,0,96.482759,39,189,...,34,514,19,16,0.0,0.0,78.0,113.0,16.0,19.0
3377,3217d6b,0.444444,0,2,8,1,0,76.777778,18,152,...,26,257,7,20,0.0,1.0,56.25,94.75,11.25,18.25


In [13]:
sentence_features.shape

(17307, 25)

#### Word Level

In [14]:
word_features = process_word(train_df)
word_features.shape

(6350538, 7)

In [15]:
word_features = word_feature_engineering(word_features)
word_features.shape

(17307, 8)

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [34]:
def func(x):
    return x

In [35]:
def generate_tfidf_features(df: pd.DataFrame) -> pd.DataFrame:
    vectorizer = TfidfVectorizer(
        tokenizer=func,
        preprocessor=func,
        token_pattern=None,
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(3, 6),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
    )

    tfidf_features = vectorizer.fit_transform([i for i in df["full_text"]])
    tfidf_features = pd.DataFrame(tfidf_features.toarray())
    tfidf_features.columns = [f"tfidf_{i}" for i in range(tfidf_features.shape[1])]
    tfidf_features["essay_id"] = df["essay_id"].copy()

    return vectorizer, tfidf_features

In [36]:
vectorizer, tfidf_features = generate_tfidf_features(train_df)

In [37]:
with open("output/LGBM/vectorizer.pkl", "wb") as file:
    pkl.dump(vectorizer, file)

#### Count Vectorizer

A countVectorizer is used to convert the essays into numerical features

In [39]:
vectorizer_cnt, count_features = generate_count_features(train_df)

In [41]:
with open("output/LGBM/vectorizer_cnt.pkl", "wb") as file:
    pkl.dump(vectorizer_cnt, file)

#### DeBERTA Predictions

Predictions made using DeBERTA models

In [42]:
oov_path = "output/microsoft/deberta-v3-xsmall/oof_df.csv"
deberta_features = pd.read_csv(oov_path, usecols=[f"score_prob_{i}" for i in range(config.num_classes)] + ["essay_id"])
deberta_features.shape

(23811, 7)

In [43]:
deberta_features.sample(3)

Unnamed: 0,essay_id,score_prob_0,score_prob_1,score_prob_2,score_prob_3,score_prob_4,score_prob_5
6270,43fb35c,-2.684635,-2.468811,-0.468599,1.975218,2.228659,0.406944
1648,1319f06,-2.932105,-1.537177,1.361386,3.072376,0.741203,-2.185421
9005,6203d5a,-1.352405,1.065698,3.577487,1.846844,-1.232293,-2.218516


#### Combining Features

In [44]:
all_features = deberta_features.copy()

# Merge using essay_id column
for feature_df in [paragraph_features, sentence_features, word_features, tfidf_features, count_features]:
    all_features = pd.merge(all_features, feature_df, on="essay_id")

all_features.shape

(23810, 21867)

In [45]:
all_features.drop_duplicates(subset="essay_id", inplace=True)

In [None]:
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)