# LGBM

Sources
1. [LGBM & Deberta Explained by ZULQARNAIN ALI](https://www.kaggle.com/code/zulqarnainalipk/lgbm-deberta-explained)

## Setup

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Move the working directory one level up.
# NOTE(review): presumably this is what lets the `lib` package imported in the
# next cell be found (notebook stored one level below the project root) — confirm.
os.chdir("../")

In [3]:
from lib.config import config
from lib.paths import Paths

## Data Preparation

### Data Loading

In [4]:
# Read the competition train and test CSV files from the paths configured on `Paths`.
# NOTE(review): `COMPETITION_TEST_CSE_PATH` looks like a typo for `..._CSV_PATH`;
# the attribute is declared in `lib.paths`, so it is left untouched here — confirm there.
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH)
test_df = pd.read_csv(Paths.COMPETITION_TEST_CSE_PATH)

### Counting Spelling Errors

In [5]:
import spacy

In [6]:
# spaCy pipeline that `count_spelling_errors` uses to tokenise text and read token lemmas.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Reference vocabulary: one entry per line of the word-list file, stripped of
# surrounding whitespace and lower-cased, kept in a set for O(1) membership tests.
with open(Paths.ENG_WORDS_HX, 'r') as file:
    english_vocab = {line.strip().lower() for line in file}

In [8]:
def count_spelling_errors(text):
    """Count the tokens of `text` whose lemma is not a known English word.

    The text is run through the module-level `spacy` pipeline `nlp`. The
    lower-cased lemma of every token it produces is looked up in
    `english_vocab` (the word list read from `Paths.ENG_WORDS_HX`); each
    lemma missing from that set counts as one spelling error.
    """
    unknown_lemmas = 0
    for token in nlp(text):
        if token.lemma_.lower() not in english_vocab:
            unknown_lemmas += 1
    return unknown_lemmas


count_spelling_errors("There is one speling error here")

1

### Expanding Contractions

In [9]:
import re
import json

In [10]:
# Mapping of contracted form -> expanded form, parsed from the project JSON file.
# The context manager closes the file handle as soon as parsing is done.
with open(Paths.CONTRACTION_FILE_PATH, "r") as contraction_file:
    contraction_dict = json.load(contraction_file)

# Single alternation that matches any key of `contraction_dict`.
# - `re.escape` makes every key match literally, even if it contains regex
#   metacharacters such as "."; the replacement looks the matched text up in
#   `contraction_dict`, so only literal matches can ever be resolved.
# - Longest keys come first: alternation takes the first alternative that
#   matches at a position, so a key that is a prefix of a longer key would
#   otherwise shadow it and leave the remainder unexpanded.
contraction_re = re.compile(
    "(%s)"
    % "|".join(
        re.escape(key) for key in sorted(contraction_dict, key=len, reverse=True)
    )
)

In [11]:
def expand_contractions(text: str, c_re=contraction_re) -> str:
    """Return `text` with every contraction replaced by its expanded form.

    Args:
        text: String to expand.
        c_re: Compiled pattern whose matches are keys of `contraction_dict`.
            Defaults to the module-level `contraction_re`.

    Returns:
        `text` with each match of `c_re` substituted by
        `contraction_dict[<matched text>]`.
    """
    return c_re.sub(lambda found: contraction_dict[found.group(0)], text)

expand_contractions("You aren't working!")

'You are not working!'

### Data Cleaning

In [12]:
def remove_HTML_tags(text: str) -> str:
    """Strip every `<...>` span (a `<` up to the next `>`) from `text`."""
    tag_pattern = re.compile(r"<[^>]*>")
    return tag_pattern.sub("", text)

In [13]:
def remove_URL(text: str) -> str:
    """Delete every run of non-whitespace characters that starts with `http`."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

In [14]:
def data_preprocessing(x: str) -> str:
    """Normalise a raw text string before feature extraction.

    Steps, in order: lower-case, drop HTML tags, drop `@mentions`, drop
    apostrophe-prefixed numbers, drop remaining digit runs, drop URLs,
    collapse whitespace runs to one space, collapse repeated "." and ","
    to a single character, and strip leading/trailing whitespace.

    Args:
        x: Text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    x = x.lower()
    x = remove_HTML_tags(x)
    # All patterns are raw strings: in a normal string literal "\w" and "\d"
    # are invalid escape sequences, which Python deprecates and warns about.
    x = re.sub(r"@\w+", "", x)  # @mentions
    x = re.sub(r"'\d+", "", x)  # apostrophe followed by digits, e.g. '90
    x = re.sub(r"\d+", "", x)  # any remaining digit runs
    # Digits are already gone here; a URL is still one unbroken non-space run
    # starting with "http", so it is removed as a whole.
    x = remove_URL(x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    return x.strip()


data_preprocessing("This is 1 example: <b>https://www.kaggle.com/</b> for user @shakleen")

'this is example: for user'

### Punctuation Removal

In [15]:
import string

In [16]:
def remove_punctuation(text: str) -> str:
    """Return `text` with every character of `string.punctuation` deleted.

    A translation table mapping each punctuation code point to `None` is
    handed to `str.translate`, which drops those characters in one pass.
    """
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


remove_punctuation("This.Has!No-Punctuations!")

'ThisHasNoPunctuations'

### Feature Engineering

#### Paragraph Level

In [17]:
def process_paragraph(df: pd.DataFrame) -> pd.DataFrame:
    """Explode essays into one row per paragraph and compute per-paragraph statistics.

    Args:
        df: Frame with a string column "full_text". It is not modified.

    Returns:
        A new frame with one row per paragraph (the text split on blank
        lines, i.e. "\\n\\n") and these added columns:
        "paragraph" (cleaned by `data_preprocessing`),
        "paragraph_no_punctuation", "paragraph_error_count",
        "paragraph_char_count", "paragraph_word_count" and
        "paragraph_sentence_count".
    """
    # `assign` returns a new frame, so the caller's frame does not receive a
    # list-valued "paragraph" column as a side effect.
    df = df.assign(paragraph=df["full_text"].map(lambda text: text.split("\n\n")))

    # One row per paragraph; the other columns are repeated for each row.
    df = df.explode("paragraph")

    # Clean the paragraph text, and keep a punctuation-free copy for spell checking.
    df["paragraph"] = df["paragraph"].map(data_preprocessing)
    df["paragraph_no_punctuation"] = df["paragraph"].map(remove_punctuation)

    # Base statistics.
    df["paragraph_error_count"] = df["paragraph_no_punctuation"].map(count_spelling_errors)
    df["paragraph_char_count"] = df["paragraph"].map(len)
    df["paragraph_word_count"] = df["paragraph"].map(lambda x: len(re.findall(r'\w+', x)))
    # Counts sentence-ending characters, so "?!" contributes two.
    df["paragraph_sentence_count"] = df["paragraph"].map(lambda x: len(re.findall(r'[.!?]', x)))

    return df

In [18]:
# Rebinds `train_df` to the paragraph-level frame (one row per paragraph).
train_df = process_paragraph(train_df)
train_df.shape

(85934, 9)

In [19]:
train_df.sample(3)

Unnamed: 0,essay_id,full_text,score,paragraph,paragraph_no_punctuation,paragraph_error_count,paragraph_char_count,paragraph_word_count,paragraph_sentence_count
7632,7235637,Driverless cars might sound awesome but at the...,2,my overall opinion is to get these cars. like ...,my overall opinion is to get these cars like i...,1,453,97,5
1974,1e1bb4e,Electoral college should be changed to electio...,2,electoral college should be changed to electio...,electoral college should be changed to electio...,3,619,99,4
1444,16729f5,"Dear Senator,\n\nI believe that we should keep...",2,"dear senator,",dear senator,0,13,2,0


In [43]:
def paragraph_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate paragraph-level statistics into one feature row per essay.

    Args:
        df: Paragraph-level frame with "essay_id", "score" and the four
            "paragraph_*_count" columns.

    Returns:
        A frame with one row per "essay_id". Columns are named
        "<stat column>_<aggregate>": first mean/min/max/sum/first/last for
        every stat column, then q1/q3 (25% and 75% quantiles) for every
        stat column, and finally the per-essay mean of "score".
    """

    def q1(values):
        return np.quantile(values, 0.25)

    def q3(values):
        return np.quantile(values, 0.75)

    feature_list = [
        "paragraph_error_count",
        "paragraph_char_count",
        "paragraph_word_count",
        "paragraph_sentence_count",
    ]
    per_essay = df.groupby("essay_id")

    base_stats = per_essay[feature_list].agg(
        ["mean", "min", "max", "sum", "first", "last"]
    )
    # Named functions label their output columns "q1" / "q3" directly.
    quartiles = per_essay[feature_list].agg([q1, q3])

    combined = pd.concat([base_stats, quartiles], axis=1)
    # Flatten the (column, aggregate) MultiIndex into "column_aggregate".
    combined.columns = ["_".join(pair) for pair in combined.columns]

    combined = pd.concat([combined, per_essay["score"].mean()], axis=1)
    return combined.reset_index()

In [47]:
paragraph_features = paragraph_feature_engineering(train_df)
paragraph_features.shape

(17307, 34)

In [48]:
paragraph_features.columns

Index(['essay_id', 'paragraph_error_count_mean', 'paragraph_error_count_min',
       'paragraph_error_count_max', 'paragraph_error_count_sum',
       'paragraph_error_count_first', 'paragraph_error_count_last',
       'paragraph_char_count_mean', 'paragraph_char_count_min',
       'paragraph_char_count_max', 'paragraph_char_count_sum',
       'paragraph_char_count_first', 'paragraph_char_count_last',
       'paragraph_word_count_mean', 'paragraph_word_count_min',
       'paragraph_word_count_max', 'paragraph_word_count_sum',
       'paragraph_word_count_first', 'paragraph_word_count_last',
       'paragraph_sentence_count_mean', 'paragraph_sentence_count_min',
       'paragraph_sentence_count_max', 'paragraph_sentence_count_sum',
       'paragraph_sentence_count_first', 'paragraph_sentence_count_last',
       'paragraph_error_count_q1', 'paragraph_error_count_q3',
       'paragraph_char_count_q1', 'paragraph_char_count_q3',
       'paragraph_word_count_q1', 'paragraph_word_count_q3',
 

In [49]:
paragraph_features.sample(3)

Unnamed: 0,essay_id,paragraph_error_count_mean,paragraph_error_count_min,paragraph_error_count_max,paragraph_error_count_sum,paragraph_error_count_first,paragraph_error_count_last,paragraph_char_count_mean,paragraph_char_count_min,paragraph_char_count_max,...,paragraph_sentence_count_last,paragraph_error_count_q1,paragraph_error_count_q3,paragraph_char_count_q1,paragraph_char_count_q3,paragraph_word_count_q1,paragraph_word_count_q3,paragraph_sentence_count_q1,paragraph_sentence_count_q3,score
4347,408b7f2,2.0,0,5,6,1,5,514.666667,468,605,...,4,0.5,3.0,469.5,538.0,83.0,103.0,4.5,5.5,4.0
16749,f6ff58e,3.333333,0,7,20,0,3,386.0,85,693,...,1,2.25,4.0,265.5,542.5,46.5,102.75,1.25,3.75,4.0
16400,f1c54fc,4.0,1,7,12,1,4,506.0,233,841,...,3,2.5,5.5,338.5,642.5,63.0,121.5,2.5,3.5,3.0


In [54]:
paragraph_features.to_csv(Paths.PARAGRAPH_FEATURES_CSV_PATH, index=False)

#### Sentence Level

In [17]:
from nltk import sent_tokenize

In [21]:
def process_sentence(df: pd.DataFrame) -> pd.DataFrame:
    """Explode essays into one row per sentence and compute per-sentence statistics.

    Args:
        df: Frame with a string column "full_text". It is not modified.

    Returns:
        A new frame with one row per sentence found by `sent_tokenize` and
        these added columns: "sentence" (cleaned by `data_preprocessing`),
        "sentence_no_punctuation", "sentence_error_count",
        "sentence_char_count" and "sentence_word_count".
    """
    # `assign` returns a new frame, so the caller's frame does not receive a
    # list-valued "sentence" column as a side effect.
    df = df.assign(sentence=df["full_text"].map(sent_tokenize))

    # One row per sentence; the other columns are repeated for each row.
    df = df.explode("sentence")

    # Clean the sentence text, and keep a punctuation-free copy for spell checking.
    df["sentence"] = df["sentence"].map(data_preprocessing)
    df["sentence_no_punctuation"] = df["sentence"].map(remove_punctuation)

    # Base statistics.
    df["sentence_error_count"] = df["sentence_no_punctuation"].map(count_spelling_errors)
    df["sentence_char_count"] = df["sentence"].map(len)
    df["sentence_word_count"] = df["sentence"].map(lambda x: len(re.findall(r'\w+', x)))

    return df

In [22]:
# NOTE(review): `train_df` was rebound to the paragraph-level frame by the
# `process_paragraph` cell earlier in this notebook. The 8-column shape recorded
# below suggests this cell was actually run on a freshly loaded frame — reload
# `train_df` before this cell when running top to bottom (TODO confirm).
train_df = process_sentence(train_df)
train_df.shape

(330422, 8)

In [25]:
def sentence_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate sentence-level statistics into one feature row per essay.

    Args:
        df: Sentence-level frame with "essay_id", "score" and the three
            "sentence_*_count" columns.

    Returns:
        A frame with one row per "essay_id". Columns are named
        "<stat column>_<aggregate>": first mean/min/max/sum/first/last for
        every stat column, then q1/q3 (25% and 75% quantiles) for every
        stat column, and finally the per-essay mean of "score".
    """

    def q1(values):
        return np.quantile(values, 0.25)

    def q3(values):
        return np.quantile(values, 0.75)

    feature_list = [
        "sentence_error_count",
        "sentence_char_count",
        "sentence_word_count",
    ]
    per_essay = df.groupby("essay_id")

    base_stats = per_essay[feature_list].agg(
        ["mean", "min", "max", "sum", "first", "last"]
    )
    # Named functions label their output columns "q1" / "q3" directly.
    quartiles = per_essay[feature_list].agg([q1, q3])

    combined = pd.concat([base_stats, quartiles], axis=1)
    # Flatten the (column, aggregate) MultiIndex into "column_aggregate".
    combined.columns = ["_".join(pair) for pair in combined.columns]

    combined = pd.concat([combined, per_essay["score"].mean()], axis=1)
    return combined.reset_index()

In [27]:
sentence_features = sentence_feature_engineering(train_df)
sentence_features.to_csv(Paths.SENTENCE_FEATURES_CSV_PATH, index=False)

In [28]:
sentence_features.sample(3)

Unnamed: 0,essay_id,sentence_error_count_mean,sentence_error_count_min,sentence_error_count_max,sentence_error_count_sum,sentence_error_count_first,sentence_error_count_last,sentence_char_count_mean,sentence_char_count_min,sentence_char_count_max,...,sentence_word_count_sum,sentence_word_count_first,sentence_word_count_last,sentence_error_count_q1,sentence_error_count_q3,sentence_char_count_q1,sentence_char_count_q3,sentence_word_count_q1,sentence_word_count_q3,score
3446,3302250,1.2,0,2,12,1,2,152.2,57,243,...,253,15,23,1.0,2.0,107.5,201.75,16.75,34.5,1.0
15744,e86e8a9,0.473684,0,3,9,1,0,124.421053,51,258,...,446,32,21,0.0,1.0,83.5,152.5,15.0,29.5,4.0
3528,3432298,0.125,0,1,3,0,0,73.75,29,124,...,346,19,6,0.0,0.0,55.75,93.0,11.75,18.25,3.0


In [29]:
sentence_features.shape

(17307, 26)

#### Word Level

In [19]:
def process_word(df: pd.DataFrame) -> pd.DataFrame:
    """Explode essays into one row per word and compute the word length.

    Args:
        df: Frame with a string column "full_text". It is not modified.

    Returns:
        A new frame with one row per word, where words are the pieces of the
        text cleaned by `data_preprocessing` and split on single spaces, plus
        the added columns "word" and "word_char_count".
    """
    # Clean each essay, then split it into words.
    cleaned = df["full_text"].map(data_preprocessing)
    # `assign` returns a new frame, so the caller's frame does not receive a
    # list-valued "word" column as a side effect.
    df = df.assign(word=cleaned.map(lambda text: text.split(" ")))

    # One row per word; the other columns are repeated for each row.
    df = df.explode("word")

    # Base statistics.
    df["word_char_count"] = df["word"].map(len)

    return df

In [20]:
# NOTE(review): `train_df` is rebound by the earlier `process_paragraph` /
# `process_sentence` cells. The 5-column shape recorded below suggests this cell
# was actually run on a freshly loaded frame — reload `train_df` before this
# cell when running top to bottom (TODO confirm).
train_df = process_word(train_df)
train_df.shape

(6350538, 5)

In [21]:
def word_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate word-level statistics into one feature row per essay.

    Args:
        df: Word-level frame with "essay_id", "score" and "word_char_count".

    Returns:
        A frame with one row per "essay_id" and the columns
        "word_char_count_mean", "_min", "_max", "_q1", "_q2", "_q3"
        (25%, 50% and 75% quantiles) followed by the per-essay mean of "score".
    """

    # Named functions label their output columns directly. The previous
    # rename-by-lambda-name mapping listed "<lambda_1>" twice, which labelled
    # the median "q3" and left the 75% quantile named "<lambda_2>".
    def q1(values):
        return np.quantile(values, 0.25)

    def q2(values):
        return np.quantile(values, 0.50)

    def q3(values):
        return np.quantile(values, 0.75)

    feature_list = ["word_char_count"]
    per_essay = df.groupby("essay_id")

    feature_df = pd.concat(
        [
            per_essay[feature_list].agg(["mean", "min", "max"]),
            per_essay[feature_list].agg([q1, q2, q3]),
        ],
        axis=1,
    )

    # Flatten the (column, aggregate) MultiIndex into "column_aggregate".
    feature_df = feature_df.set_axis(feature_df.columns.map("_".join), axis=1)
    feature_df = pd.concat([feature_df, per_essay["score"].mean()], axis=1)
    return feature_df.reset_index()

In [22]:
word_features = word_feature_engineering(train_df)
word_features.shape

(17307, 8)

In [23]:
word_features.to_csv(Paths.WORD_FEATURES_CSV_PATH, index=False)