# LGBM

Sources
1. [LGBM & Deberta Explained by ZULQARNAIN ALI](https://www.kaggle.com/code/zulqarnainalipk/lgbm-deberta-explained)

## Setup

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
os.chdir("../")

In [3]:
from lib.config import config
from lib.paths import Paths

## Data Preparation

### Data Loading

In [4]:
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
test_df = pd.read_csv(Paths.COMPETITION_TEST_CSE_PATH)

### Counting Spelling Errors

In [5]:
import spacy

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
with open(Paths.ENG_WORDS_HX, 'r') as file:
    english_vocab = set(word.strip().lower() for word in file)

In [8]:
def count_spelling_errors(text):
    """Uses `spacy` and list of correctly spelled english words
    located at `Paths.ENG_WORDS_HX` to count number of spelling
    errors.
    """
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]

    spelling_errors = sum(
        1 for token in lemmatized_tokens if token not in english_vocab
    )

    return spelling_errors


count_spelling_errors("There is one speling error here")

1

### Expanding Contractions

In [9]:
import re
import json

In [10]:
contraction_dict = json.load(open(Paths.CONTRACTION_FILE_PATH, "r"))
contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))

In [11]:
def expand_contractions(text: str, c_re=contraction_re) -> str:
    """Replaces contracted word/phrase with enlongated word/phrase."""

    def replace(match):
        return contraction_dict[match.group(0)]

    return c_re.sub(replace, text)

expand_contractions("You aren't working!")

'You are not working!'

### Data Cleaning

In [12]:
def remove_HTML_tags(text: str) -> str:
    """Remove HTML tags from a text string"""
    return re.sub(r"<[^>]*>", "", text)

In [13]:
def remove_URL(text: str) -> str:
    """Remove URLs from a text string"""
    return re.sub(r"http\S+", "", text)

In [14]:
def data_preprocessing(x: str) -> str:
    x = x.lower()
    x = remove_HTML_tags(x)
    x = re.sub("@\w+", "", x)
    x = re.sub("'\d+", "", x)
    x = re.sub("\d+", "", x)
    x = remove_URL(x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x


data_preprocessing("This is 1 example: <b>https://www.kaggle.com/</b> for user @shakleen")

'this is example: for user'

### Punctuation Removal

In [15]:
import string

In [16]:
def remove_punctuation(text: str) -> str:
    """A translator is created using str.maketrans('', '', string.punctuation), 
    which generates a translation table that maps each character in the 
    string string.punctuation to None. This effectively removes all punctuation characters."""
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)


remove_punctuation("This.Has!No-Punctuations!")

'ThisHasNoPunctuations'

### Feature Engineerings

#### Paragraph Level

In [17]:
def process_paragraph(df: pd.DataFrame) -> pd.DataFrame:
    # Construct paragraphs
    df["paragraph"] = df["full_text"].map(lambda x: x.split("\n\n"))

    # Have each paragraph be its own row
    df = df.explode("paragraph")

    # Process Paragraph text
    df["paragraph"] = df["paragraph"].map(data_preprocessing)
    df["paragraph_no_punctuation"] = df["paragraph"].map(remove_punctuation)

    # Calculate base stats
    df["paragraph_error_count"] = df["paragraph_no_punctuation"].map(count_spelling_errors)
    df["paragraph_char_count"] = df["paragraph"].map(lambda x: len(x))
    df["paragraph_word_count"] = df["paragraph"].map(lambda x: len(re.findall(r'\w+', x)))
    df["paragraph_sentence_count"] = df["paragraph"].map(lambda x: len(re.findall(r'[.!?]', x)))
    
    return df

In [18]:
paragraph_features = process_paragraph(train_df)
paragraph_features.shape

(85934, 9)

In [19]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
14463,d513e3a,"""What the picture actually shows is the Martia...",2,"""thousands of anixous web surfers were waiting...",thousands of anixous web surfers were waiting ...,2,131,21,4
12043,b0fc064,A good reason for other people to join the Sea...,3,luke joined the seagoing cowboys program becau...,luke joined the seagoing cowboys program becau...,1,301,55,3
2483,25588ad,The invention of driverless cars could be a re...,3,the cost of gas can be a major problem.,the cost of gas can be a major problem,0,39,9,1


In [20]:
def paragraph_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    feature_list = [
        "paragraph_error_count",
        "paragraph_char_count",
        "paragraph_word_count",
        "paragraph_sentence_count",
    ]

    feature_df = df.groupby("essay_id")[feature_list].agg(
        ["mean", "min", "max", "sum", "first", "last"]
    )

    feature_df = pd.concat(
        [
            feature_df,
            df.groupby("essay_id")[feature_list]
            .agg([lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)])
            .rename(columns={"<lambda_0>": "q1", "<lambda_1>": "q3"}),
        ],
        axis=1,
    )

    feature_df = feature_df.set_axis(feature_df.columns.map("_".join), axis=1)
    return feature_df.reset_index()

In [21]:
paragraph_features = paragraph_feature_engineering(paragraph_features)
paragraph_features.shape

(17307, 33)

In [22]:
paragraph_features.columns

Index(['essay_id', 'paragraph_error_count_mean', 'paragraph_error_count_min',
       'paragraph_error_count_max', 'paragraph_error_count_sum',
       'paragraph_error_count_first', 'paragraph_error_count_last',
       'paragraph_char_count_mean', 'paragraph_char_count_min',
       'paragraph_char_count_max', 'paragraph_char_count_sum',
       'paragraph_char_count_first', 'paragraph_char_count_last',
       'paragraph_word_count_mean', 'paragraph_word_count_min',
       'paragraph_word_count_max', 'paragraph_word_count_sum',
       'paragraph_word_count_first', 'paragraph_word_count_last',
       'paragraph_sentence_count_mean', 'paragraph_sentence_count_min',
       'paragraph_sentence_count_max', 'paragraph_sentence_count_sum',
       'paragraph_sentence_count_first', 'paragraph_sentence_count_last',
       'paragraph_error_count_q1', 'paragraph_error_count_q3',
       'paragraph_char_count_q1', 'paragraph_char_count_q3',
       'paragraph_word_count_q1', 'paragraph_word_count_q3',
 

In [23]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_sentence_count_first,paragraph_sentence_count_last,paragraph_error_count_q1,paragraph_error_count_q3,paragraph_char_count_q1,paragraph_char_count_q3,paragraph_word_count_q1,paragraph_word_count_q3,paragraph_sentence_count_q1,paragraph_sentence_count_q3
13643,c83d850,1.166667,0,3,7,2,0,341.166667,62,546,...,5,5,0.25,1.75,215.0,473.75,42.25,89.0,1.25,4.5
3610,357ce9b,2.4,0,6,12,0,1,403.8,55,1009,...,0,3,1.0,4.0,158.0,404.0,25.0,71.0,2.0,4.0
6088,5b0af71,3.2,1,7,16,2,1,601.2,416,883,...,4,6,2.0,3.0,443.0,657.0,85.0,113.0,6.0,8.0


#### Sentence Level

In [24]:
from nltk import sent_tokenize

In [25]:
def process_sentence(df: pd.DataFrame) -> pd.DataFrame:
    # Construct sentences
    df["sentence"] = df["full_text"].map(lambda x: sent_tokenize(x))

    # Have each paragraph be its own row
    df = df.explode("sentence")

    # Process Paragraph text
    df["sentence"] = df["sentence"].map(data_preprocessing)
    df["sentence_no_punctuation"] = df["sentence"].map(remove_punctuation)

    # Calculate base stats
    df["sentence_error_count"] = df["sentence_no_punctuation"].map(count_spelling_errors)
    df["sentence_char_count"] = df["sentence"].map(lambda x: len(x))
    df["sentence_word_count"] = df["sentence"].map(lambda x: len(re.findall(r'\w+', x)))
    
    return df

In [27]:
sentence_features = process_sentence(train_df)
sentence_features.shape

(330422, 9)

In [28]:
def sentence_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    feature_list = [
        "sentence_error_count",
        "sentence_char_count",
        "sentence_word_count",
    ]

    feature_df = df.groupby("essay_id")[feature_list].agg(
        ["mean", "min", "max", "sum", "first", "last"]
    )

    feature_df = pd.concat(
        [
            feature_df,
            df.groupby("essay_id")[feature_list]
            .agg([lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)])
            .rename(columns={"<lambda_0>": "q1", "<lambda_1>": "q3"}),
        ],
        axis=1,
    )

    feature_df = feature_df.set_axis(feature_df.columns.map("_".join), axis=1)
    return feature_df.reset_index()

In [29]:
sentence_features = sentence_feature_engineering(sentence_features)

In [30]:
sentence_features.sample(3)

Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_word_count_max,sentence_word_count_sum,sentence_word_count_first,sentence_word_count_last,sentence_error_count_q1,sentence_error_count_q3,sentence_char_count_q1,sentence_char_count_q3,sentence_word_count_q1,sentence_word_count_q3
1878,1cbab90,0.625,0,2,15,0,0,178.583333,59,264,...,45,723,26,29,0.0,1.0,154.5,218.0,25.75,38.0
15912,eadfbe2,0.769231,0,4,30,1,0,62.333333,15,189,...,38,477,19,9,0.0,1.0,33.0,85.5,6.0,16.5
3643,3605e2a,0.357143,0,1,5,0,1,87.0,35,147,...,31,245,14,22,0.0,1.0,71.5,99.5,14.0,20.5


In [31]:
sentence_features.shape

(17307, 25)

#### Word Level

In [32]:
def process_word(df: pd.DataFrame) -> pd.DataFrame:
    # Get words
    temp = df["full_text"].map(data_preprocessing)
    df["word"] = temp.map(lambda x: x.split(" "))

    # Have each paragraph be its own row
    df = df.explode("word")

    # Calculate base stats
    df["word_char_count"] = df["word"].map(lambda x: len(x))

    return df

In [33]:
word_features = process_word(train_df)
word_features.shape

(6350538, 7)

In [34]:
def word_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    feature_list = ["word_char_count"]

    feature_df = df.groupby("essay_id")[feature_list].agg(["mean", "min", "max"])

    feature_df = pd.concat(
        [
            feature_df,
            df.groupby("essay_id")[feature_list]
            .agg(
                [
                    lambda x: np.quantile(x, 0.25),
                    lambda x: np.quantile(x, 0.50),
                    lambda x: np.quantile(x, 0.75),
                ]
            )
            .rename(
                columns={
                    "<lambda_0>": "q1",
                    "<lambda_1>": "q2",
                    "<lambda_1>": "q3",
                }
            ),
        ],
        axis=1,
    )

    feature_df = feature_df.set_axis(feature_df.columns.map("_".join), axis=1)
    feature_df = pd.concat([feature_df, df.groupby("essay_id")["score"].mean()], axis=1)
    return feature_df.reset_index()

In [35]:
word_features = word_feature_engineering(word_features)
word_features.shape

(17307, 8)

#### TF-IDF Features

A TF-IDF vectorizer is used to convert the essays into numerical features.

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(3, 6),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
)

In [55]:
tfidf_features = vectorizer.fit_transform([i for i in train_df['full_text']])

In [56]:
tfidf_features = pd.DataFrame(tfidf_features.toarray())
tfidf_features.columns = [f"tfidf_{i}" for i in range(tfidf_features.shape[1])]
tfidf_features.shape

(17307, 19627)

In [57]:
tfidf_features["essay_id"] = train_df["essay_id"].copy()

#### Count Vectorizer

A countVectorizer is used to convert the essays into numerical features

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
vectorizer_cnt = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(2,3),
    min_df=0.10,
    max_df=0.85,
)

In [60]:
count_features = vectorizer_cnt.fit_transform([i for i in train_df['full_text']])

In [61]:
count_features = pd.DataFrame(count_features.toarray())
count_features.columns = [f"tfidf_count_{i}" for i in range(count_features.shape[1])]
count_features.shape

(17307, 2170)

In [62]:
count_features["essay_id"] = train_df["essay_id"].copy()

#### DeBERTA Predictions

Predictions made using DeBERTA models

In [63]:
oov_path = "output/microsoft/deberta-v3-xsmall/oof_df.csv"
deberta_features = pd.read_csv(oov_path, usecols=[f"score_prob_{i}" for i in range(config.num_classes)] + ["essay_id"])
deberta_features.shape

(23811, 7)

In [64]:
deberta_features.sample(3)

Unnamed: 0,essay_id,score_prob_0,score_prob_1,score_prob_2,score_prob_3,score_prob_4,score_prob_5
10141,6ed94f5,-3.045542,-3.26724,-1.664371,0.658285,2.745111,2.705156
15542,a698bb4,-1.512836,2.838059,3.235294,0.125551,-2.400192,-2.780882
21911,eafac49,-0.380366,1.164232,2.426566,1.046173,-1.295725,-2.493743


#### Combining Features

In [78]:
all_features = deberta_features.copy()

# Merge using essay_id column
for feature_df in [paragraph_features, sentence_features, word_features, tfidf_features, count_features]:
    all_features = pd.merge(all_features, feature_df, on="essay_id")

all_features.shape

(23810, 21867)

In [81]:
all_features.drop_duplicates(subset="essay_id", inplace=True)

In [82]:
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)