# Prepare Data - LGBM

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold

In [2]:
# Step two directories up so that the `lib.*` imports in the next cell and the
# relative "data/..." paths used below resolve (presumably this lands on the
# project root -- confirm against the notebook's location).
# NOTE(review): the path is relative, so re-running this cell moves up two
# more levels each time; restart the kernel before re-running.
os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything

In [4]:
# Called without arguments, so whatever defaults the project helper defines
# apply. Presumably this seeds the random number generators for
# reproducibility -- confirm in lib.utils.utils.
seed_everything()

## Global Definitions

### Constants

In [5]:
# Base directory (relative to the working directory set above) under which the
# per-fold folders "fold_<k>" are looked up by the processing loop.
root_data_dir = "data/lgbm_deberta"

In [6]:
# Full feature table, loaded once and read as a module-level global by
# select_fold_rows(). It must contain an "essay_id" column (used for row
# selection) and a "score" column (used as the stratification target).
all_features = pd.read_csv(Paths.FEATURE_ENGG_CSV_PATH)

### Functions

In [7]:
def get_train_and_valid_df(fold, fold_dir):
    """Load the essay ids that belong to one fold's train and valid splits.

    Args:
        fold: Fold number; it is interpolated into the CSV file names.
        fold_dir: Directory holding ``train_<fold>.csv`` and
            ``valid_<fold>.csv``.

    Returns:
        A ``(train_df, valid_df)`` tuple of DataFrames, each containing only
        the ``essay_id`` column.
    """

    def _load_ids(file_name):
        # Only the id column is needed, so skip parsing everything else.
        return pd.read_csv(os.path.join(fold_dir, file_name), usecols=["essay_id"])

    return _load_ids(f"train_{fold}.csv"), _load_ids(f"valid_{fold}.csv")

In [8]:
def get_train_and_valid_oof(fold, fold_dir):
    """Load one fold's out-of-fold predictions, averaged per essay.

    Args:
        fold: Fold number; it is interpolated into the CSV file names.
        fold_dir: Directory holding ``oof_train_<fold>.csv`` and
            ``oof_valid_<fold>.csv``.

    Returns:
        A ``(train_oof, valid_oof)`` tuple of DataFrames indexed by
        ``essay_id`` with one ``score_prob_<i>`` column per class
        (``config.num_classes`` columns in total).
    """
    raw_frames = [
        pd.read_csv(os.path.join(fold_dir, file_name))
        for file_name in (f"oof_train_{fold}.csv", f"oof_valid_{fold}.csv")
    ]

    prob_columns = [f"score_prob_{i}" for i in range(config.num_classes)]

    # An essay may appear on several rows of the OOF file; collapse them to a
    # single row per essay by taking the mean of each probability column.
    train_oof, valid_oof = (
        frame.groupby("essay_id")[prob_columns].mean() for frame in raw_frames
    )
    return train_oof, valid_oof

In [9]:
def select_fold_rows(train_df, valid_df):
    """Pick the rows of the global feature table that belong to each split.

    Args:
        train_df: DataFrame with an ``essay_id`` column listing train essays.
        valid_df: DataFrame with an ``essay_id`` column listing valid essays.

    Returns:
        A ``(train_features, valid_features)`` tuple. Rows keep the order they
        have in ``all_features`` and each frame gets a fresh 0..n-1 index.
    """
    feature_ids = all_features["essay_id"]

    in_train = feature_ids.isin(train_df["essay_id"])
    train_features = all_features.loc[in_train].reset_index(drop=True)

    in_valid = feature_ids.isin(valid_df["essay_id"])
    valid_features = all_features.loc[in_valid].reset_index(drop=True)

    return train_features, valid_features

In [10]:
def add_oof_prediction_column(fold, fold_dir, train_features: pd.DataFrame, valid_features: pd.DataFrame):
    """Attach the per-essay out-of-fold class probabilities to the features.

    Args:
        fold: Fold number whose OOF files should be loaded.
        fold_dir: Directory containing that fold's OOF CSV files.
        train_features: Feature rows of the train split; needs ``essay_id``.
        valid_features: Feature rows of the valid split; needs ``essay_id``.

    Returns:
        A ``(train_features, valid_features)`` tuple with the
        ``score_prob_<i>`` columns appended. Each output has exactly the rows
        of the corresponding input, in the same order, with a 0..n-1 index.
        Essays that have no OOF prediction get NaN probabilities.
    """
    train_oof, valid_oof = get_train_and_valid_oof(fold, fold_dir)

    # A left join keeps the feature rows as they are. The previous outer join
    # sorted the result by essay_id (leaving a permuted index behind) and
    # could append rows that exist only in the OOF file, with every feature
    # and the score missing -- both break the positional indexing and the
    # stratified split performed by the caller.
    train_features = train_features.join(
        train_oof, on="essay_id", how="left"
    ).reset_index(drop=True)
    valid_features = valid_features.join(
        valid_oof, on="essay_id", how="left"
    ).reset_index(drop=True)

    return train_features, valid_features

## Feature Engineering

In [11]:
# Splitter used by the processing loop to cut every outer fold's training rows
# into stratified "parts". It reuses config.n_folds as the number of parts.
# The seed is offset by 20 from config.random_seed -- presumably so that these
# inner splits differ from the outer fold split; confirm against the notebook
# that created the folds.
skf = StratifiedKFold(
    n_splits=config.n_folds,
    shuffle=True,
    random_state=config.random_seed + 20,
)

In [12]:
# For every outer fold: gather the fold's feature rows, append the OOF class
# probabilities, split the training rows into stratified parts (one train/valid
# CSV pair per part) and write the fold's validation rows to their own CSV.
for fold in range(config.n_folds):
    fold_dir = os.path.join(root_data_dir, f"fold_{fold}")
    print(f"Processing: {fold_dir}")

    train_df, valid_df = get_train_and_valid_df(fold, fold_dir)
    train_features, valid_features = select_fold_rows(train_df, valid_df)
    train_features, valid_features = add_oof_prediction_column(
        fold, fold_dir, train_features, valid_features
    )

    y = train_features["score"]
    # StratifiedKFold.split() only uses X for its number of samples, so a
    # zeros placeholder avoids copying the very wide feature frame.
    split_placeholder = np.zeros(len(y))

    for part, (train_idx, valid_idx) in enumerate(skf.split(split_placeholder, y)):
        print("Processing part", part)
        # split() yields positional indices, so select rows by position
        # (.iloc) rather than by index label (.loc); the two only agree when
        # the frame happens to carry an ordered 0..n-1 index.
        train_part = train_features.iloc[train_idx].reset_index(drop=True)
        valid_part = train_features.iloc[valid_idx].reset_index(drop=True)

        part_dir = os.path.join(fold_dir, f"part_{part}")
        # to_csv() does not create missing directories.
        os.makedirs(part_dir, exist_ok=True)
        train_part.to_csv(
            os.path.join(part_dir, f"train_lgbm_{fold}_{part}.csv"),
            index=False,
        )
        valid_part.to_csv(
            os.path.join(part_dir, f"valid_lgbm_{fold}_{part}.csv"),
            index=False,
        )

    valid_features.to_csv(
        os.path.join(fold_dir, f"lgbm_valid_{fold}.csv"),
        index=False,
    )

Processing: data/lgbm_deberta/fold_0
add_oof_prediction_column (13845, 22052) (3462, 22052)
Processing part 0
Processing part 1
Processing part 2
Processing part 3
Processing part 4
Processing: data/lgbm_deberta/fold_1
add_oof_prediction_column (13845, 22052) (3462, 22052)
Processing part 0
Processing part 1
Processing part 2
Processing part 3
Processing part 4
Processing: data/lgbm_deberta/fold_2
add_oof_prediction_column (13846, 22052) (3461, 22052)
Processing part 0
Processing part 1
Processing part 2
Processing part 3
Processing part 4
Processing: data/lgbm_deberta/fold_3
add_oof_prediction_column (13846, 22052) (3461, 22052)
Processing part 0
Processing part 1
Processing part 2
Processing part 3
Processing part 4
Processing: data/lgbm_deberta/fold_4
add_oof_prediction_column (13846, 22052) (3461, 22052)
Processing part 0
Processing part 1
Processing part 2
Processing part 3
Processing part 4
