# 🔍 LGBM - Inference

## ⚙️ Setup 

### 📚 Importing Libraries

In [28]:
import os
import pandas as pd
from transformers import AutoTokenizer
import torch
import pickle as pkl
from sklearn.metrics import cohen_kappa_score
import numpy as np

In [2]:
# The `lib` package imported below and relative paths such as "output/LGBM/..."
# must be reachable from the working directory (presumably the repository root,
# two levels above this notebook). The guard keeps the cell idempotent: once
# `lib` is visible, re-running the cell no longer climbs two more levels.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)
from lib.paths import Paths
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything, get_model_path
from lib.data_tools.data import clean_text, sliding_window
from lib.model.inference import ensemble_inference

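# Required even though nothing below calls these names directly: without them in
# scope the pickled LGBM models can't be loaded (presumably the pickles reference
# the custom metric/objective). Note that this import also rebinds
# `cohen_kappa_score`, replacing the sklearn function imported at the top.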
from lib.model.utils import cohen_kappa_score, qwk_obj

In [4]:
# Fix random seeds through the project helper. It is called with no arguments,
# so the helper's own default seed applies.
# NOTE(review): confirm which RNGs (python / numpy / torch) the helper covers.
seed_everything()

## 📖 Definitions

### 🌎 Global Variables

In [5]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Display-only sanity check: show the configured model name with "/" replaced
# by "_". The result is not assigned to anything.
config.model.replace("/", "_")

'microsoft_deberta-v3-xsmall'

In [7]:
# Map every fold's checkpoint path to an equal ensemble weight (1 / n_folds).
deberat_model_paths = {}
for fold in range(config.n_folds):
    deberat_model_paths[get_model_path(fold)] = 1 / config.n_folds

deberat_model_paths

{'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_0_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_1_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_2_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_3_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_4_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_5_best.pth': 0.14285714285714285,
 'output/microsoft/deberta-v3-xsmall/microsoft_deberta-v3-xsmall_fold_6_best.pth': 0.14285714285714285}

In [21]:
# One pickled LGBM model per fold, stored as output/LGBM/<fold>.pkl.
lgbm_model_paths = []
for fold in range(config.lgbm_n_folds):
    lgbm_model_paths.append(f"output/LGBM/{fold}.pkl")

lgbm_model_paths

['output/LGBM/0.pkl',
 'output/LGBM/1.pkl',
 'output/LGBM/2.pkl',
 'output/LGBM/3.pkl',
 'output/LGBM/4.pkl',
 'output/LGBM/5.pkl',
 'output/LGBM/6.pkl',
 'output/LGBM/7.pkl',
 'output/LGBM/8.pkl',
 'output/LGBM/9.pkl',
 'output/LGBM/10.pkl',
 'output/LGBM/11.pkl',
 'output/LGBM/12.pkl',
 'output/LGBM/13.pkl',
 'output/LGBM/14.pkl']

## 💿 Loading from Disk

### 🪙 Tokenizer

In [8]:
# Load the tokenizer stored at the project's configured tokenizer path.
tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)

# Report how many entries the vocabulary holds, then dump the tokenizer config.
vocabulary = tokenizer.get_vocab()
total_tokens = len(vocabulary)
print(f"Total number of tokens in the tokenizer: {total_tokens}")
print(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Total number of tokens in the tokenizer: 128003
DebertaV2TokenizerFast(name_or_path='output/microsoft/deberta-v3-xsmall/tokenizer_v2', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized

### 🗃️ Dataset

In [9]:
# Read the test essays from the project's configured CSV path.
test_df = pd.read_csv(Paths.TEST_CSV_PATH)
# Show (rows, columns) as a quick check that the file loaded.
test_df.shape

(3, 2)

## ⌛ Data Processing

### Cleaning text

In [10]:
# Normalise every essay with the project's `clean_text` helper. The function is
# passed directly to `map`; wrapping it in a lambda adds nothing.
# Note: this overwrites `full_text` in place, so all later cells see cleaned text.
test_df["full_text"] = test_df["full_text"].map(clean_text)

### Sliding Window

In [11]:
# Build the DeBERTa input frame from the cleaned essays using the tokenizer.
# NOTE(review): presumably long essays are split into several windows — the
# recorded output shows more rows in `df` than in `test_df`; confirm in lib.data_tools.data.
df = sliding_window(test_df, tokenizer)
# Compare the row counts after and before windowing.
df.shape, test_df.shape

100%|██████████| 3/3 [00:00<00:00, 182.90it/s]


((5, 2), (3, 2))

## DeBERTa Predictions

In [12]:
# Run the DeBERTa fold ensemble over the windowed essays. The third argument is
# the {checkpoint path: weight} mapping built earlier in this notebook.
# The recorded output has one row per essay with columns p_0..p_5 and `score`.
# NOTE(review): the exact effect of `logits=True` is defined in
# lib.model.inference — confirm it matches how the LGBM models were trained.
deberta_predictions = ensemble_inference(
    df,
    tokenizer,
    deberat_model_paths,
    device,
    logits=True,
)
deberta_predictions

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 0 Inference: 100%|██████████| 1/1 [00:00<00:00,  1.27test_batch/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 1 Inference: 100%|██████████| 1/1 [00:00<00:00, 28.94test_batch/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 2 Inference: 100%|██████████| 1/1 [00:00<00:00, 30.18test_batch/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 3 Inference: 100%|██████████| 1/1 [00:00<00:00, 31.61test_batch/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model 4 Inference: 100%|██████████| 1/1 [00:00<00:00, 29.36test_batch/s]
Special tokens have been added in the vocabul

Unnamed: 0,essay_id,p_0,p_1,p_2,p_3,p_4,p_5,score
0,000d118,0.117078,0.370102,0.419184,0.081213,0.008851,0.003573,2
1,000fe60,0.016278,0.219967,0.675862,0.080437,0.005159,0.002297,2
2,001ab80,0.005337,0.007076,0.047527,0.407841,0.440049,0.092171,4


## LGBM Predictions

### Feature Engineering

In [13]:
# Paragraph-level features: preprocess the essays, then aggregate per essay.
paragraph_features = paragraph_feature_engineering(process_paragraph(test_df))
paragraph_features.shape

(3, 33)

In [14]:
# Sentence-level features: preprocess the essays, then aggregate per essay.
sentence_features = sentence_feature_engineering(process_sentence(test_df))
sentence_features.shape

(3, 25)

In [15]:
# Word-level features: preprocess the essays, then aggregate per essay.
word_features = word_feature_engineering(process_word(test_df))
word_features.shape

(3, 7)

In [16]:
# Restore the saved vectorizer (presumably fitted during LGBM training).
with open("output/LGBM/vectorizer.pkl", "rb") as file:
    vectorizer = pkl.load(file)

# Only the second return value (the feature frame) is used here.
_, tfidf_features = generate_tfidf_features(test_df, vectorizer)

tfidf_features.shape

(3, 19628)

In [17]:
# Restore the saved count vectorizer (presumably fitted during LGBM training).
with open("output/LGBM/vectorizer_cnt.pkl", "rb") as file:
    vectorizer = pkl.load(file)

# Only the second return value (the feature frame) is used here.
_, count_features = generate_count_features(test_df, vectorizer)

count_features.shape

(3, 2171)

In [18]:
# Start from the DeBERTa outputs (minus the predicted `score`) and attach every
# hand-crafted feature table, joining on `essay_id` each time.
feature_frames = (
    paragraph_features,
    sentence_features,
    word_features,
    tfidf_features,
    count_features,
)

all_features = deberta_predictions.drop(columns=["score"]).copy()
for frame in feature_frames:
    all_features = all_features.merge(frame, on="essay_id")

all_features.shape

(3, 21866)

### Inference

In [27]:
# The model inputs are identical for every fold, so drop the identifier column
# once instead of rebuilding the (very wide) frame on each iteration.
lgbm_inputs = all_features.drop(columns=["essay_id"])

# One array of raw scores per fold model. Despite the name these are not
# probabilities: later cells average, clip to 1..6 and round them.
probabilities = []

for model_path in lgbm_model_paths:
    # SECURITY: pickle.load can execute arbitrary code — only load model files
    # produced by this project's own training run.
    with open(model_path, "rb") as file:
        model = pkl.load(file)

    # The file handle is no longer needed once the model is in memory.
    # NOTE(review): `config.lgbm_a` presumably undoes an offset applied to the
    # targets during training — confirm against the training notebook.
    probabilities.append(model.predict(lgbm_inputs) + config.lgbm_a)

In [29]:
# Average the per-fold scores, clamp them to the 1..6 range, then round to the
# nearest whole score.
predictions = np.mean(probabilities, axis=0)
predictions = np.round(np.clip(predictions, 1, 6))
print(predictions)

[2. 2. 4.]


## Submission

In [31]:
# `predictions` is positionally aligned with `all_features` (the frame the LGBM
# models scored), so take the ids from that same frame. The row order of
# `all_features` comes out of the ensemble/merge steps and is not guaranteed to
# match `test_df`; pairing scores with `test_df["essay_id"]` could mislabel essays.
submission = pd.DataFrame(
    {
        "essay_id": all_features["essay_id"].to_numpy(),
        # Scores are already rounded and clipped upstream; store them as
        # integers so the CSV contains "2" rather than "2.0".
        "score": np.asarray(predictions).astype(int),
    }
)

# Every test essay must receive exactly one score.
assert len(submission) == len(test_df), (
    f"expected {len(test_df)} scored essays, got {len(submission)}"
)
print(f"Submission shape: {submission.shape}")

Submission shape: (3, 2)


In [None]:
# Write the submission file into the current working directory, without the
# DataFrame index so the CSV holds only the `essay_id` and `score` columns.
submission.to_csv("submission.csv", index=False)