# Feature Engineering

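In this notebook, I prepare the engineered features for LGBM: the feature table is loaded from disk and split into per-fold train and validation feature files.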

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Switch to the project root so the `lib` package imported in the next cell
# and the relative data paths resolve.
# The guard makes this cell idempotent: a bare relative chdir would climb two
# more directories every time the cell is re-run in the same kernel.
# NOTE(review): assumes the project root is the directory that contains `lib/`
# and that the notebook's own directory does not — confirm.
if not os.path.isdir("lib"):
    os.chdir("../../")

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.data_tools.feature_engineering import (
    process_paragraph,
    paragraph_feature_engineering,
    process_sentence,
    sentence_feature_engineering,
    process_word,
    word_feature_engineering,
    generate_tfidf_features,
    generate_count_features,
)

In [4]:
# Call the project seeding helper with its default arguments.
# NOTE(review): presumably this seeds the RNGs for reproducibility; the helper
# lives in lib.utils.utils and is not visible here — confirm.
seed_everything()

## Global Definitions

### Constants

In [5]:
# Parent directory of the per-fold subdirectories (fold_0, fold_1, ...).
# Relative path, so it is resolved against the current working directory.
root_data_dir = "data/lgbm_deberta"

In [7]:
# Full feature table loaded once; it is filtered by `essay_id` for every fold.
all_features = pd.read_csv(Paths.FEATURE_ENGG_CSV_PATH)

### Functions

In [6]:
def get_train_and_valid_df(fold, fold_dir):
    """Load the train and validation CSVs of a single fold.

    Args:
        fold: Fold identifier used in the file names
            (``train_{fold}.csv`` / ``valid_{fold}.csv``).
        fold_dir: Directory that contains both CSV files.

    Returns:
        A ``(train_df, valid_df)`` tuple of DataFrames restricted to the
        ``essay_id`` and ``full_text`` columns.
    """
    wanted_columns = ["essay_id", "full_text"]
    split_files = (f"train_{fold}.csv", f"valid_{fold}.csv")

    # Read both splits with the same column selection, train first.
    train_df, valid_df = (
        pd.read_csv(os.path.join(fold_dir, file_name), usecols=wanted_columns)
        for file_name in split_files
    )

    return train_df, valid_df

## Feature Engineering

In [10]:
# For every fold, select the feature rows of the essays in its train and
# validation splits and write them next to the fold's CSVs.
for fold in range(config.n_folds):
    fold_dir = os.path.join(root_data_dir, f"fold_{fold}")
    print(f"Processing: {fold_dir}")

    # Only the essay ids of each split are used to pick rows from the table.
    train_df, valid_df = get_train_and_valid_df(fold, fold_dir)

    # Rows keep the order of `all_features`; the index is rebuilt from 0.
    train_features = all_features.loc[
        all_features["essay_id"].isin(train_df["essay_id"])
    ].reset_index(drop=True)
    valid_features = all_features.loc[
        all_features["essay_id"].isin(valid_df["essay_id"])
    ].reset_index(drop=True)

    # Persist both subsets without the positional index column.
    train_features.to_csv(os.path.join(fold_dir, f"train_features_{fold}.csv"), index=False)
    valid_features.to_csv(os.path.join(fold_dir, f"valid_features_{fold}.csv"), index=False)

Processing: data/lgbm_deberta/fold_0
Processing: data/lgbm_deberta/fold_1
Processing: data/lgbm_deberta/fold_2
Processing: data/lgbm_deberta/fold_3
Processing: data/lgbm_deberta/fold_4
