# LGBM

Sources
1. [LGBM & Deberta Explained by ZULQARNAIN ALI](https://www.kaggle.com/code/zulqarnainalipk/lgbm-deberta-explained)

## Setup

In [1]:
import os
import pandas as pd

In [2]:
# Move the working directory one level up. The path is relative, so
# re-running this cell moves up one more level each time.
# NOTE(review): presumably done so the `lib` package imported in the next
# cell resolves from the project root — confirm.
os.chdir(os.pardir)

In [3]:
from lib.config import config
from lib.paths import Paths

## Data Preparation

### Data Loading

In [4]:
# Read the two CSV files named by the project's `Paths` constants.
train_df = pd.read_csv(filepath_or_buffer=Paths.COMPETITION_TRAIN_CSV_PATH)
test_df = pd.read_csv(filepath_or_buffer=Paths.COMPETITION_TEST_CSE_PATH)

### Counting Spelling Errors

In [5]:
import spacy

In [6]:
# spaCy pipeline used by `count_spelling_errors` to tokenize and lemmatize.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [7]:
# Build the lookup set of known words: one entry per line of the word list,
# with surrounding whitespace removed and lowercased.
with open(Paths.ENG_WORDS_HX) as file:
    english_vocab = {line.strip().lower() for line in file}

In [8]:
def count_spelling_errors(text):
    """Count the tokens of `text` whose lemma is not in `english_vocab`.

    The text is run through the module-level spaCy pipeline `nlp`. For each
    resulting token, the lowercased lemma is looked up in `english_vocab`
    (the word list read from `Paths.ENG_WORDS_HX`). Every token the pipeline
    produces is checked; no token type is filtered out beforehand.

    Returns:
        The number of tokens whose lowercased lemma is missing from
        `english_vocab`.
    """
    error_total = 0
    for token in nlp(text):
        if token.lemma_.lower() not in english_vocab:
            error_total += 1
    return error_total


count_spelling_errors("There is one speling error here")

1

### Expanding Contractions

In [9]:
import re
import json

In [10]:
# Mapping of contraction -> expanded form, read from the project's JSON file.
# The context manager closes the file handle once parsing is done.
with open(Paths.CONTRACTION_FILE_PATH, "r", encoding="utf-8") as contraction_file:
    contraction_dict = json.load(contraction_file)

# Single alternation over all contractions, wrapped in one capture group.
# - Keys are escaped so regex metacharacters in a key match literally; the
#   replacement looks matches up in `contraction_dict`, so only literal
#   matches of a key are valid.
# - Longer keys come first: alternation takes the first alternative that
#   matches, so a key that is a prefix of a longer key must not precede it.
contraction_re = re.compile(
    "(%s)"
    % "|".join(
        re.escape(key)
        for key in sorted(contraction_dict, key=len, reverse=True)
    )
)

In [11]:
def expand_contractions(text: str, c_re=contraction_re) -> str:
    """Replace each contraction found in `text` with its expanded form.

    Args:
        text: Input string.
        c_re: Compiled pattern whose matches are contractions; defaults to
            the module-level `contraction_re`.

    Returns:
        `text` with every match of `c_re` substituted by the value stored
        for that match in `contraction_dict`.
    """
    return c_re.sub(lambda found: contraction_dict[found.group(0)], text)

expand_contractions("You aren't working!")

'You are not working!'

### Data Cleaning

In [12]:
def remove_HTML_tags(text: str) -> str:
    """Delete every `<...>` span from `text`, keeping the text in between."""
    tag_pattern = re.compile(r"<[^>]*>")
    return tag_pattern.sub("", text)

In [13]:
def remove_URL(text: str) -> str:
    """Delete each run of non-whitespace characters that begins with `http`."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

In [14]:
def data_preprocessing(x: str) -> str:
    """Normalize a piece of raw text.

    The steps run in this order:

    1. lowercase the text;
    2. remove HTML tags (`remove_HTML_tags`);
    3. remove `@` followed by word characters;
    4. remove digit runs, including an apostrophe directly before them;
    5. remove URLs (`remove_URL`);
    6. collapse each whitespace run into a single space;
    7. collapse repeated periods and repeated commas into one;
    8. strip leading and trailing whitespace.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text.
    """
    x = x.lower()
    x = remove_HTML_tags(x)
    # Raw strings keep `\w` / `\d` as regex escapes; in a plain string
    # literal they are invalid escape sequences that Python warns about.
    x = re.sub(r"@\w+", "", x)
    x = re.sub(r"'\d+", "", x)
    x = re.sub(r"\d+", "", x)
    x = remove_URL(x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x


data_preprocessing("This is 1 example: <b>https://www.kaggle.com/</b> for user @shakleen")

'this is example: for user'

### Punctuation Removal

In [15]:
import string

In [16]:
def remove_punctuation(text: str) -> str:
    """Return `text` with every character of `string.punctuation` deleted."""
    # A translation table that maps a code point to None deletes it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


remove_punctuation("This.Has!No-Punctuations!")

'ThisHasNoPunctuations'

### Feature Engineering

#### Paragraph Level

In [21]:
def process_paragraph(df: pd.DataFrame) -> pd.DataFrame:
    """Split each text into paragraphs and compute per-paragraph statistics.

    `full_text` is split on blank lines (`"\\n\\n"`) and the frame is exploded
    so every paragraph gets its own row; exploded rows keep the index value
    of the row they came from. The input frame itself is not modified.

    Args:
        df: Frame with a string column `full_text`.

    Returns:
        A new frame with the original columns plus:

        - `paragraph`: paragraph text after `data_preprocessing`;
        - `paragraph_no_punctuation`: `paragraph` after `remove_punctuation`;
        - `paragraph_error_count`: `count_spelling_errors` applied to the
          punctuation-free paragraph;
        - `paragraph_char_count`: length of `paragraph` in characters;
        - `paragraph_word_count`: number of `\\w+` runs in `paragraph`;
        - `paragraph_sentence_count`: number of `.`, `!` and `?` characters
          in `paragraph`.
    """
    # `assign` returns a new frame, so the caller's DataFrame does not gain
    # a list-valued "paragraph" column as a side effect.
    df = df.assign(paragraph=df["full_text"].map(lambda x: x.split("\n\n")))

    # Have each paragraph be its own row
    df = df.explode("paragraph")

    # Process paragraph text
    df["paragraph"] = df["paragraph"].map(data_preprocessing)
    df["paragraph_no_punctuation"] = df["paragraph"].map(remove_punctuation)

    # Calculate base stats
    df["paragraph_error_count"] = df["paragraph_no_punctuation"].map(count_spelling_errors)
    df["paragraph_char_count"] = df["paragraph"].map(len)
    df["paragraph_word_count"] = df["paragraph"].map(lambda x: len(re.findall(r'\w+', x)))
    # Counts sentence-ending characters, not parsed sentences.
    df["paragraph_sentence_count"] = df["paragraph"].map(lambda x: len(re.findall(r'[.!?]', x)))

    return df

In [22]:
train_df = process_paragraph(train_df)
train_df.shape

#### Sentence Level

#### Word Level