# LGBM - Data Preparation

## Setup

In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer
import torch

In [2]:
# Move two directories up so that the `lib` package imported in the next cell
# and the relative "output/..." and "data/..." paths used later resolve
# (presumably this is the project root -- confirm against the repo layout).
# The guard keeps the cell idempotent: a plain relative chdir would climb two
# more directories every time the cell is re-run in the same kernel.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.paragraph_feature_engineering import engineer_paragraph_features
from lib.data_tools.sentence_feature_engineering import engineer_sentence_features
from lib.data_tools.word_feature_engineering import engineer_word_features
from lib.data_tools.grammar_feature_engineering import engineer_grammar_feature
from lib.model.inference import ensemble_inference
from lib.data_tools.data import sliding_window

In [4]:
# Call the project's seeding helper (lib.utils.utils) with its default arguments.
# NOTE(review): presumably this fixes the Python/NumPy/torch RNG seeds -- confirm in lib.utils.utils.
seed_everything()

## Data Preparation

### Data Loading

In [5]:
# Load only the two columns this notebook reads from the competition training CSV.
train_columns = ["essay_id", "full_text"]
train_df = pd.read_csv(Paths.COMPETITION_TRAIN_CSV_PATH, usecols=train_columns)

# Show (rows, columns) as the cell output.
train_df.shape

(17307, 2)

### Feature Engineering

#### Paragraph Level

In [6]:
# Cache location for the paragraph-level features, under the shared feature directory.
paragraph_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "paragraphs.csv")

In [7]:
# Reuse the cached paragraph features when the CSV exists; otherwise compute
# them from a deep copy of the training frame and write the cache.
if os.path.exists(paragraph_csv_path):
    paragraph_features = pd.read_csv(paragraph_csv_path)
else:
    paragraph_features = engineer_paragraph_features(
        train_df.copy(deep=True),
        "output/sentence_transformer",
    )
    # Sanity check: same number of distinct essay ids as the training data.
    feature_id_shape = paragraph_features.essay_id.unique().shape
    train_id_shape = train_df.essay_id.unique().shape
    assert feature_id_shape == train_id_shape
    print(paragraph_features.shape)
    paragraph_features.to_csv(paragraph_csv_path, index=False)

(17307, 29)


In [8]:
# Display the column names of the paragraph-level feature table.
paragraph_features.columns

Index(['essay_id', 'paragraph_count', 'paragraph_sentenceCount_sum',
       'paragraph_sentenceCount_min', 'paragraph_sentenceCount_mean',
       'paragraph_sentenceCount_max', 'paragraph_wordCount_sum',
       'paragraph_wordCount_min', 'paragraph_wordCount_mean',
       'paragraph_wordCount_max', 'paragraph_lengths_sum',
       'paragraph_lengths_min', 'paragraph_lengths_mean',
       'paragraph_lengths_max', 'paragraph_introductionLength',
       'paragraph_conclusionLength', 'paragraph_bodyLength',
       'paragraph_pairwiseSimilarity_sum', 'paragraph_pairwiseSimilarity_min',
       'paragraph_pairwiseSimilarity_mean', 'paragraph_pairwiseSimilarity_max',
       'paragraph_dualSimilarity_sum', 'paragraph_dualSimilarity_min',
       'paragraph_dualSimilarity_mean', 'paragraph_dualSimilarity_max',
       'paragraph_tripleSimilarity_sum', 'paragraph_tripleSimilarity_min',
       'paragraph_tripleSimilarity_mean', 'paragraph_tripleSimilarity_max'],
      dtype='object')

#### Sentence Level

In [9]:
# Cache location for the sentence-level features, under the shared feature directory.
sentence_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "sentences.csv")

In [10]:
# Reuse the cached sentence features when the CSV exists; otherwise compute
# them from a deep copy of the training frame and write the cache.
if os.path.exists(sentence_csv_path):
    sentence_features = pd.read_csv(sentence_csv_path)
else:
    sentence_features = engineer_sentence_features(train_df.copy(deep=True))
    # Sanity check: same number of distinct essay ids as the training data.
    feature_id_shape = sentence_features.essay_id.unique().shape
    train_id_shape = train_df.essay_id.unique().shape
    assert feature_id_shape == train_id_shape
    print(sentence_features.shape)
    sentence_features.to_csv(sentence_csv_path, index=False)

In [11]:
# Display the column names of the sentence-level feature table.
sentence_features.columns

Index(['essay_id', 'sentence_count', 'sentence_wordMistakesPerSentence_sum',
       'sentence_wordMistakesPerSentence_min',
       'sentence_wordMistakesPerSentence_mean',
       'sentence_wordMistakesPerSentence_max', 'sentence_nounCount_sum',
       'sentence_nounCount_min', 'sentence_nounCount_mean',
       'sentence_nounCount_max', 'sentence_pronounCount_sum',
       'sentence_pronounCount_min', 'sentence_pronounCount_mean',
       'sentence_pronounCount_max', 'sentence_verbCount_sum',
       'sentence_verbCount_min', 'sentence_verbCount_mean',
       'sentence_verbCount_max', 'sentence_determinerCount_sum',
       'sentence_determinerCount_min', 'sentence_determinerCount_mean',
       'sentence_determinerCount_max', 'sentence_adjectiveCount_sum',
       'sentence_adjectiveCount_min', 'sentence_adjectiveCount_mean',
       'sentence_adjectiveCount_max', 'sentence_adverbCount_sum',
       'sentence_adverbCount_min', 'sentence_adverbCount_mean',
       'sentence_adverbCount_max', 'se

#### Word Level

In [12]:
# Cache location for the word-level features, under the shared feature directory.
word_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "word.csv")

In [13]:
# Compute the word-level features only when no cached CSV exists; otherwise
# load the cache.
if not os.path.exists(word_csv_path):
    word_features = engineer_word_features(train_df.copy(deep=True))
    # Sanity check: same number of distinct essay ids as the training data.
    # (Previously this compared word_features with itself and could never fail.)
    assert word_features.essay_id.unique().shape == train_df.essay_id.unique().shape
    print(word_features.shape)
    word_features.to_csv(word_csv_path, index=False)
else:
    word_features = pd.read_csv(word_csv_path)

In [14]:
# Display the column names of the word-level feature table.
word_features.columns

Index(['essay_id', 'word_count', 'word_variety', 'word_scrabbleScores_sum',
       'word_scrabbleScores_min', 'word_scrabbleScores_mean',
       'word_scrabbleScores_max', 'word_consonentScores_sum',
       'word_consonentScores_min', 'word_consonentScores_mean',
       'word_consonentScores_max', 'word_syllableScores_sum',
       'word_syllableScores_min', 'word_syllableScores_mean',
       'word_syllableScores_max', 'word_uniqueSyllableScores_sum',
       'word_uniqueSyllableScores_min', 'word_uniqueSyllableScores_mean',
       'word_uniqueSyllableScores_max', 'word_syllableRarityScores_sum',
       'word_syllableRarityScores_min', 'word_syllableRarityScores_mean',
       'word_syllableRarityScores_max'],
      dtype='object')

#### DeBERTa Predictions

Predictions made using the per-fold DeBERTa-v3-large models.

In [15]:
# Cache location for the DeBERTa prediction features, under the shared feature directory.
deberta_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "deberta.csv")

In [16]:
# Run the DeBERTa ensemble only when no cached CSV exists; otherwise load the
# cached predictions.
if not os.path.exists(deberta_csv_path):
    tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)
    root_model_path = "output/microsoft/deberta-v3-large"
    # One checkpoint per fold, each mapped to the same value 1 / n_folds
    # (presumably its ensemble weight -- confirm in lib.model.inference).
    model_paths = {
        os.path.join(root_model_path, f"microsoft_deberta-v3-large_fold_{f}_best.pth"): 1 / config.n_folds
        for f in range(config.n_folds)
    }
    # Use the GPU when one is available, else fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): sliding_window presumably splits each essay into
    # tokenizer-sized windows -- confirm in lib.data_tools.data.
    df = sliding_window(train_df.copy(deep=True), tokenizer)
    deberta_features = ensemble_inference(
        df,
        tokenizer,
        model_paths,
        device,
        logits=True,
        overall=True,
        model_wise_reduction=False,
    )
    # Sanity check: same number of distinct essay ids as the training data.
    assert deberta_features.essay_id.unique().shape == train_df.essay_id.unique().shape
    print(deberta_features.shape)
    deberta_features.to_csv(deberta_csv_path, index=False)
else:
    deberta_features = pd.read_csv(deberta_csv_path)

#### Grammar Features

In [17]:
# Cache location for the grammar features, under the shared feature directory.
grammar_csv_path = os.path.join(Paths.ROOT_FEATURE_PATH, "grammar_scores.csv")

In [18]:
# Reuse the cached grammar features when the CSV exists; otherwise derive them
# from the grammar-correction CSV and write the cache.
if os.path.exists(grammar_csv_path):
    grammar_features = pd.read_csv(grammar_csv_path)
else:
    grammar_features = engineer_grammar_feature(
        pd.read_csv("data/feature_engg/grammar_correct.csv")
    )
    # Sanity check: same number of distinct essay ids as the training data.
    feature_id_shape = grammar_features.essay_id.unique().shape
    train_id_shape = train_df.essay_id.unique().shape
    assert feature_id_shape == train_id_shape
    print(grammar_features.shape)
    grammar_features.to_csv(grammar_csv_path, index=False)

In [19]:
# Display the column names of the grammar feature table.
grammar_features.columns

Index(['essay_id', 'grammar_levenshtein_distance_max',
       'grammar_levenshtein_distance_sum', 'grammar_levenshtein_distance_min',
       'grammar_levenshtein_distance_mean'],
      dtype='object')

#### Combining Features

In [20]:
# Feature tables to join onto the paragraph features, one row per essay_id.
# deberta_features is currently left out of the combined table.
feature_df_list = [
    # deberta_features,
    sentence_features,
    word_features,
    grammar_features,
]

# Start from a copy so the paragraph table itself is never modified.
all_features = paragraph_features.copy(deep=True)

for feature_df in feature_df_list:
    # Inner join on essay_id. validate="one_to_one" makes pandas raise a
    # MergeError if essay_id is duplicated on either side; without it,
    # duplicate keys would silently multiply rows, which the unique-id
    # assertion below cannot detect.
    all_features = pd.merge(
        all_features,
        feature_df,
        on="essay_id",
        how="inner",
        validate="one_to_one",
    )

# Restrict the result to essays present in the training frame.
all_features = pd.merge(
    all_features,
    train_df[["essay_id"]],
    on="essay_id",
    how="inner",
    validate="one_to_one",
)
# Sanity check: same number of distinct essay ids as the training data.
assert all_features.essay_id.unique().shape == train_df.essay_id.unique().shape
all_features.shape

(17307, 104)

In [21]:
# Write the combined feature table to the project's feature CSV path, without the index column.
all_features.to_csv(Paths.FEATURE_ENGG_CSV_PATH, index=False)

In [22]:
# Display the column names of the combined feature table.
all_features.columns

Index(['essay_id', 'paragraph_count', 'paragraph_sentenceCount_sum',
       'paragraph_sentenceCount_min', 'paragraph_sentenceCount_mean',
       'paragraph_sentenceCount_max', 'paragraph_wordCount_sum',
       'paragraph_wordCount_min', 'paragraph_wordCount_mean',
       'paragraph_wordCount_max',
       ...
       'word_uniqueSyllableScores_mean', 'word_uniqueSyllableScores_max',
       'word_syllableRarityScores_sum', 'word_syllableRarityScores_min',
       'word_syllableRarityScores_mean', 'word_syllableRarityScores_max',
       'grammar_levenshtein_distance_max', 'grammar_levenshtein_distance_sum',
       'grammar_levenshtein_distance_min',
       'grammar_levenshtein_distance_mean'],
      dtype='object', length=104)