# Preprocessing
## Setup

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [5]:
load_dotenv()  # Load environment variables from .env file
# Parent of the current working directory; used as the base for configured paths.
PARENT = Path(os.getcwd()).parent

# Read the processed-data location from the environment and anchor it under PARENT.
_data_dir_processed = os.getenv("DATA_DIR_PROCESSED")
if _data_dir_processed is None:
    # Path(None) would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "Environment variable DATA_DIR_PROCESSED is not set; "
        "define it in the environment or in the .env file."
    )
DATA_DIR_PROCESSED = PARENT / _data_dir_processed

In [23]:
# Load the feature-engineered loan data from the processed-data directory.
feature_engineered_csv = DATA_DIR_PROCESSED / "2-loan_data_feature_engineered.csv"
df = pd.read_csv(feature_engineered_csv)

## Train-test split

In [35]:
# Separate the target column from the feature matrix.
y = df['default']
X = df.drop(columns=['default'])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [25]:
# Preview the first rows of the training features to sanity-check the split.
X_train.head()

Unnamed: 0,credit_policy,purpose,interest_rate,installment,log_annual_income,debt_income_ratio,fico,days_with_credit_line,revolve_balance,revolve_utilized,inquiries_last_6_mon,delinquent_2_yrs,public_recs,income_installment_ratio,debt_to_credit_ratio
6381,1,credit_card,0.1426,548.9,11.362103,12.71,687,5264.041667,16799,74.7,1,1,0,13.056416,0.195337
7589,1,credit_card,0.0751,155.55,11.041321,12.75,737,5160.0,11757,42.6,0,0,0,33.429765,0.188413
465,1,debt_consolidation,0.0983,136.0,10.778956,11.93,717,6329.041667,19708,37.1,0,1,0,29.411765,0.410583
5343,1,all_other,0.1357,339.69,11.608236,16.12,687,3330.0,1584,49.5,1,0,0,26.985389,0.0144
8016,0,debt_consolidation,0.1312,438.78,11.654425,10.33,662,660.0,0,0.0,4,0,0,21.878846,0.0


In [26]:
# Preview the matching training targets (same row index as X_train).
y_train.head()

6381    0
7589    0
465     0
5343    0
8016    0
Name: default, dtype: int64

In [36]:
# Persist the four splits. The row index is not written (index=False), so the
# X_* and y_* files line up by row position only.
split_outputs = [
    (X_train, "X_train.csv"),
    (X_test, "X_test.csv"),
    (y_train, "y_train.csv"),
    (y_test, "y_test.csv"),
]
for split_frame, split_filename in split_outputs:
    split_frame.to_csv(DATA_DIR_PROCESSED / split_filename, index=False)

## Preprocessing
### Encoding categorical features

In [28]:
class FrequencyEncoder:
    """
    Replace categorical values with their relative frequency in the data
    the encoder was fitted on.

    Values that were not seen during fitting are encoded as 0.
    """

    def __init__(self):
        # column name -> Series mapping each category to its share of the rows
        self.freq_maps = {}

    def fit(self, df, cols):
        """
        Record the relative frequency of every category in the given columns.

        Maps for previously fitted columns are kept; a column fitted again
        has its map replaced.

        Parameters:
            df (pd.DataFrame): training dataframe
            cols (list): list of categorical column names

        Returns:
            FrequencyEncoder: the encoder itself, to allow chaining
        """
        for column in cols:
            shares = df[column].value_counts(normalize=True)
            self.freq_maps[column] = shares
        return self

    def transform(self, df):
        """
        Return a copy of ``df`` with every fitted column replaced by its
        learned frequencies. The input dataframe is left untouched.

        Parameters:
            df (pd.DataFrame): dataframe to transform

        Returns:
            pd.DataFrame: encoded copy of ``df``
        """
        encoded = df.copy()
        for column, shares in self.freq_maps.items():
            mapped = encoded[column].map(shares)
            # Categories absent from the fitted data map to NaN; encode them as 0.
            encoded[column] = mapped.fillna(0)
        return encoded

    def fit_transform(self, df, cols):
        """
        Fit on ``df`` and return its encoded copy (intended for training data).
        """
        return self.fit(df, cols).transform(df)

In [37]:
# Learn category frequencies from the training split only, then apply the same
# mapping to both splits so no information from the test split is used.
# NOTE: this cell rebinds X_train/X_test, so it is not idempotent — re-run the
# train-test split cell before re-running this one.
freq_encoder = FrequencyEncoder().fit(X_train, cols=["purpose"])
X_train = freq_encoder.transform(X_train)
X_test = freq_encoder.transform(X_test)

### Scaling numerical features

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler
from sklearn.pipeline import Pipeline

In [38]:
def build_preprocessor(X_train, categorical_cols, skew_thresholds=(1.0, 3.0)):
    """
    Build an unfitted ColumnTransformer whose scaling depends on column skewness.

    Numeric columns that are not listed in ``categorical_cols`` are grouped by
    the absolute value of their skewness in ``X_train``:

    * ``|skew| <= mild``          -> StandardScaler
    * ``mild < |skew| <= heavy``  -> PowerTransformer followed by StandardScaler
    * ``|skew| > heavy``          -> PowerTransformer followed by RobustScaler

    All remaining columns (the categorical ones and any non-numeric column)
    are passed through unchanged (``remainder='passthrough'``).

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; skewness is measured on this data only.
    categorical_cols : list
        Column names to exclude from scaling. Names that are not present in
        ``X_train`` are simply ignored.
    skew_thresholds : tuple (mild, heavy)
        Thresholds for absolute skewness to define mild vs heavy skew.
        ``mild`` must not exceed ``heavy``.

    Returns
    -------
    preprocessor : ColumnTransformer
        Preprocessing pipeline (unfitted).
    col_groups : dict
        Dictionary with assigned column groups, keyed by ``"categorical"``,
        ``"normal"``, ``"mild_skewed"`` and ``"heavy_skewed"``.

    Raises
    ------
    ValueError
        If the mild threshold is greater than the heavy threshold.
    """
    mild_limit, heavy_limit = skew_thresholds[0], skew_thresholds[1]
    if mild_limit > heavy_limit:
        # With inverted thresholds a column could satisfy both the "normal" and
        # the "heavy" condition and would be emitted twice by the transformer.
        raise ValueError(
            "skew_thresholds must be ordered as (mild, heavy) with mild <= heavy; "
            f"got {skew_thresholds!r}"
        )

    # Numerical columns = all numeric except categorical ones
    numerical_cols = [col for col in X_train.select_dtypes(include=[np.number]).columns
                      if col not in categorical_cols]

    # Absolute skewness on the training set; undefined skewness (NaN) counts as 0
    abs_skew = X_train[numerical_cols].skew().fillna(0).abs()

    normal_cols = abs_skew[abs_skew <= mild_limit].index.tolist()
    mild_skewed_cols = abs_skew[(abs_skew > mild_limit) & (abs_skew <= heavy_limit)].index.tolist()
    heavy_skewed_cols = abs_skew[abs_skew > heavy_limit].index.tolist()

    # Define preprocessing
    preprocessor = ColumnTransformer([
        ('norm', StandardScaler(), normal_cols),
        ('mild', Pipeline([('pt_mild', PowerTransformer()), ('scaler', StandardScaler())]), mild_skewed_cols),
        ('heavy', Pipeline([('pt_heavy', PowerTransformer()), ('robust', RobustScaler())]), heavy_skewed_cols)
    ], remainder='passthrough')

    col_groups = {
        "categorical": categorical_cols,
        "normal": normal_cols,
        "mild_skewed": mild_skewed_cols,
        "heavy_skewed": heavy_skewed_cols
    }

    return preprocessor, col_groups

In [39]:
# Columns to keep out of the scaling groups; they reach the output via passthrough.
unscaled_cols = ["purpose", "credit_policy", "default"]
preprocessor, col_groups = build_preprocessor(X_train, categorical_cols=unscaled_cols)
col_groups

{'categorical': ['purpose', 'credit_policy', 'default'],
 'normal': ['interest_rate',
  'installment',
  'log_annual_income',
  'debt_income_ratio',
  'fico',
  'revolve_utilized'],
 'mild_skewed': ['days_with_credit_line'],
 'heavy_skewed': ['revolve_balance',
  'inquiries_last_6_mon',
  'delinquent_2_yrs',
  'public_recs',
  'income_installment_ratio',
  'debt_to_credit_ratio']}

In [40]:
# Fit the scalers/transformers on the training split only.
preprocessor.fit(X_train)

# Apply the fitted preprocessor to both splits.
X_train_preproc, X_test_preproc = (
    preprocessor.transform(features) for features in (X_train, X_test)
)

In [41]:
# Get feature names; ColumnTransformer prefixes each with "<transformer name>__"
feature_names = preprocessor.get_feature_names_out()
# Strip only that leading prefix. Splitting once keeps original column names
# intact even if they contain "__" themselves.
clean_names = [name.split("__", 1)[-1] for name in feature_names]

# Wrap the transformed arrays in DataFrames that keep the original row index.
X_train_df = pd.DataFrame(X_train_preproc, columns=clean_names, index=X_train.index)
X_test_df = pd.DataFrame(X_test_preproc, columns=clean_names, index=X_test.index)

X_train_df.head()

Unnamed: 0,interest_rate,installment,log_annual_income,debt_income_ratio,fico,revolve_utilized,days_with_credit_line,revolve_balance,inquiries_last_6_mon,delinquent_2_yrs,public_recs,income_installment_ratio,debt_to_credit_ratio,credit_policy,purpose
6381,0.755456,1.112193,0.717639,0.01479,-0.629324,0.953291,0.441663,0.41815,0.0,3.118873,0.0,-0.284398,0.112932,1.0,0.130253
7589,-1.793957,-0.796104,0.184816,0.020626,0.689102,-0.150348,0.402946,0.185216,-0.687179,0.0,0.0,0.654462,0.087067,1.0,0.130253
465,-0.917714,-0.890948,-0.250974,-0.099005,0.161732,-0.339445,0.813208,0.528819,-0.687179,3.118873,0.0,0.542418,0.682058,1.0,0.410337
5343,0.49485,0.097232,1.12647,0.512281,-0.629324,0.086883,-0.37664,-0.817662,0.0,0.0,0.0,0.464378,-0.835441,1.0,0.246933
8016,0.324889,0.577957,1.203191,-0.332432,-1.288536,-1.614991,-2.354656,-2.202819,0.633533,0.0,0.0,0.264898,-0.943676,0.0,0.410337


In [42]:
# Write the preprocessed splits to CSV. The row index is dropped (index=False),
# so these files match the saved target files by row position only.
train_preprocessed_csv = DATA_DIR_PROCESSED / "X_train_preprocessed.csv"
test_preprocessed_csv = DATA_DIR_PROCESSED / "X_test_preprocessed.csv"
X_train_df.to_csv(train_preprocessed_csv, index=False)
X_test_df.to_csv(test_preprocessed_csv, index=False)

In [45]:
# Read the model directory from the environment and anchor it under PARENT.
_model_dir = os.getenv("MODEL_DIR")
if _model_dir is None:
    # Path(None) would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "Environment variable MODEL_DIR is not set; "
        "define it in the environment or in the .env file."
    )
MODEL_DIR = PARENT / _model_dir

In [47]:
# Save the fitted preprocessor so it can be reloaded later
import joblib

# joblib.dump does not create missing directories, so make sure the target exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, MODEL_DIR / "preprocessor.pkl")

['c:\\Users\\shrey\\Desktop\\bootcamp_shreya_doodipala\\project\\models\\preprocessor.pkl']

## Test reusable functions

In [1]:
import os
import sys

# Make the directory above the working directory importable so `src` resolves.
# Guarded so that re-running the cell does not append duplicate entries.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)
from src import preprocessing as preproc

In [6]:
# Reload the feature-engineered data so the packaged helpers start from the raw frame.
source_csv = DATA_DIR_PROCESSED / "2-loan_data_feature_engineered.csv"
df = pd.read_csv(source_csv)

In [7]:
# Target vector and feature matrix.
y = df['default']
X = df.drop(columns=['default'])

# Same 80/20 stratified split and seed as used earlier in this notebook.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Repeat the frequency-encoding step with the FrequencyEncoder from src.preprocessing.
# NOTE(review): presumably the same implementation as the class defined in this
# notebook — confirm in src/preprocessing.py.
freq_encoder = preproc.FrequencyEncoder()
# fit_transform is called on the training split; the test split only goes through transform.
X_train = freq_encoder.fit_transform(X_train, cols=["purpose"])
X_test = freq_encoder.transform(X_test)

In [9]:
# Run the packaged preprocessing helper on the encoded splits.
# NOTE(review): presumably wraps the build/fit/transform steps shown earlier in
# this notebook — confirm in src/preprocessing.py.
packaged_categorical_cols = ["purpose", "credit_policy", "default"]
X_train_proc, X_test_proc, preprocessor, col_groups = preproc.preprocess_data(
    X_train,
    X_test,
    categorical_cols=packaged_categorical_cols,
)

In [11]:
# Show the column groups returned by preproc.preprocess_data for comparison
# with the groups produced by the in-notebook build_preprocessor.
col_groups

{'categorical': ['purpose', 'credit_policy', 'default'],
 'normal': ['interest_rate',
  'installment',
  'log_annual_income',
  'debt_income_ratio',
  'fico',
  'revolve_utilized'],
 'mild_skewed': ['days_with_credit_line'],
 'heavy_skewed': ['revolve_balance',
  'inquiries_last_6_mon',
  'delinquent_2_yrs',
  'public_recs',
  'income_installment_ratio',
  'debt_to_credit_ratio']}

In [10]:
# Preview the training split returned by preproc.preprocess_data.
X_train_proc.head()

Unnamed: 0,interest_rate,installment,log_annual_income,debt_income_ratio,fico,revolve_utilized,days_with_credit_line,revolve_balance,inquiries_last_6_mon,delinquent_2_yrs,public_recs,income_installment_ratio,debt_to_credit_ratio,credit_policy,purpose
6381,0.755456,1.112193,0.717639,0.01479,-0.629324,0.953291,0.441663,0.41815,0.0,3.118873,0.0,-0.284398,0.112932,1.0,0.130253
7589,-1.793957,-0.796104,0.184816,0.020626,0.689102,-0.150348,0.402946,0.185216,-0.687179,0.0,0.0,0.654462,0.087067,1.0,0.130253
465,-0.917714,-0.890948,-0.250974,-0.099005,0.161732,-0.339445,0.813208,0.528819,-0.687179,3.118873,0.0,0.542418,0.682058,1.0,0.410337
5343,0.49485,0.097232,1.12647,0.512281,-0.629324,0.086883,-0.37664,-0.817662,0.0,0.0,0.0,0.464378,-0.835441,1.0,0.246933
8016,0.324889,0.577957,1.203191,-0.332432,-1.288536,-1.614991,-2.354656,-2.202819,0.633533,0.0,0.0,0.264898,-0.943676,0.0,0.410337


In [12]:
# Preview the test split returned by preproc.preprocess_data.
X_test_proc.head()

Unnamed: 0,interest_rate,installment,log_annual_income,debt_income_ratio,fico,revolve_utilized,days_with_credit_line,revolve_balance,inquiries_last_6_mon,delinquent_2_yrs,public_recs,income_installment_ratio,debt_to_credit_ratio,credit_policy,purpose
8157,1.22757,0.481948,0.174134,0.404321,-1.684064,0.795137,-1.231483,0.155777,-0.687179,0.0,0.0,-0.350389,0.058169,0.0,0.410337
1928,-1.608889,-1.26567,-0.183169,0.941202,0.557259,-0.040328,0.254522,0.782599,-0.687179,0.0,0.0,1.222269,0.903867,1.0,0.410337
2779,0.577942,1.755536,1.515088,-0.800744,0.029889,-0.745145,0.380398,-0.072354,-0.687179,0.0,0.0,0.006709,-0.562947,1.0,0.246933
2520,0.219135,-1.236756,-2.182984,-0.310548,-0.233796,-1.48778,-0.927952,-1.132001,0.0,0.0,0.0,0.139913,-0.634959,1.0,0.246933
2601,1.291777,0.147347,0.034471,-0.727798,-0.761166,-0.439151,-0.623925,-0.763355,0.312821,0.0,0.0,-0.240373,-0.712596,1.0,0.064082
