In [8]:
import pandas as pd
import numpy as np
import warnings
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import os # Import os to join paths

In [9]:
# --- 0. Setup ---
# Suppress every warning category and let pandas show all columns of wide frames.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)

# --- 1a. Mount Google Drive (for Colab) ---
# Best-effort mount: a failure is reported on stdout and the cell keeps going,
# so the constants below are always defined.
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")

# --- 1b. Constants ---
DRIVE_MOUNT_POINT = '/content/drive/MyDrive/'

# Project folder on Drive; the raw CSV is read from it and all artifacts are
# written back to it.
PROJECT_DIR = os.path.join(DRIVE_MOUNT_POINT, 'shodhAI')
FILE_PATH = os.path.join(PROJECT_DIR, 'accepted_2007_to_2018Q4.csv.gz')
SAVE_DIR = PROJECT_DIR

# Number of CSV rows to read when loading the data.
SAMPLE_ROWS = 200_000

# loan_status values treated as a final outcome, grouped by outcome type.
DEFAULT_STATUSES = [
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
]
PAID_STATUSES = [
    'Fully Paid',
    'Does not meet the credit policy. Status:Fully Paid',
]
TERMINAL_STATUSES = [*DEFAULT_STATUSES, *PAID_STATUSES]

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.


In [10]:
# --- 2. Feature & Target Definitions ---
# Based on our EDA in task_1_eda.py
NUMERIC_FEATURES = [
    'loan_amnt', 'int_rate', 'annual_inc', 'dti',
    'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc'
]
CATEGORICAL_FEATURES = [
    'grade', 'emp_length', 'home_ownership', 'verification_status'
]
# All features our model will use
ALL_FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES
# Features needed for reward calculation
REWARD_FEATURES = ['loan_amnt', 'int_rate']

# --- 3. Load Data ---
# REWARD_FEATURES overlap with ALL_FEATURES, so build an order-preserving,
# de-duplicated column list instead of passing repeated names to `usecols`.
columns_to_load = list(dict.fromkeys(ALL_FEATURES + REWARD_FEATURES + ['loan_status']))

# NOTE: `nrows` reads the FIRST `SAMPLE_ROWS` rows of the file; it is not a
# random sample of the dataset.
print(f"Loading {SAMPLE_ROWS} rows from {FILE_PATH}...")
try:
    df = pd.read_csv(
        FILE_PATH,
        nrows=SAMPLE_ROWS,
        compression='gzip',
        usecols=columns_to_load,
        low_memory=False
    )
    print(f"Successfully loaded {df.shape[0]} rows and {df.shape[1]} columns.")
except FileNotFoundError:
    print(f"Error: File not found at {FILE_PATH}")
    print("Please double-check the FILE_PATH variable.")
    # Re-raise rather than calling exit(): in a notebook exit() requests a
    # kernel shutdown (losing the Drive mount and all state) instead of simply
    # stopping this cell with a traceback.
    raise
except Exception as e:
    print(f"An error occurred loading data: {e}")
    # Same reasoning as above: surface the original exception to the reader.
    raise

Loading 200000 rows from /content/drive/MyDrive/shodhAI/accepted_2007_to_2018Q4.csv.gz...
Successfully loaded 200000 rows and 14 columns.


In [11]:
# --- 4. Initial Cleaning & Filtering ---
# Keep only rows whose loan_status is one of TERMINAL_STATUSES; work on a copy
# so the loaded frame `df` is left untouched.
print("Filtering for terminal loan statuses...")
df_filtered = df[df['loan_status'].isin(TERMINAL_STATUSES)].copy()
print(f"Filtered size (terminal loans only): {df_filtered.shape[0]} rows")

if df_filtered.empty:
    print("\nWarning: No terminal-status loans found. Try increasing SAMPLE_ROWS.")
    # Raise instead of exit(): in a notebook exit() requests a kernel shutdown
    # rather than stopping the cell, so the code below could still run on an
    # empty frame before the kernel goes away.
    raise ValueError("No terminal-status loans found in the loaded sample.")

# --- 5. Target and Reward Engineering ---
print("Engineering Target (y) and Reward (r) variables...")

# 5.1. Target (y) for Task 2 (DL Model)
# {0: Fully Paid, 1: Defaulted}
# Vectorised membership test; equivalent to the per-row lambda it replaces.
df_filtered['target'] = df_filtered['loan_status'].isin(DEFAULT_STATUSES).astype('int64')

# 5.2. Reward (r) for Task 3 (RL Model)
# Clean int_rate for calculation (non-numeric entries become NaN)
df_filtered['int_rate'] = pd.to_numeric(df_filtered['int_rate'], errors='coerce')

# Median-impute the reward inputs into a SEPARATE frame. The reward features
# are also model features; writing values imputed from the full dataset back
# into df_filtered before the train/test split would leak test-set statistics
# into the features. Missing feature values are instead handled by the
# preprocessing pipeline, which is fitted on the training split only.
reward_imputer = SimpleImputer(strategy='median')
reward_inputs = pd.DataFrame(
    reward_imputer.fit_transform(df_filtered[REWARD_FEATURES]),
    columns=REWARD_FEATURES,
    index=df_filtered.index,
)

# Calculate profit (as a percentage of loan amount)
df_filtered['profit_pct'] = reward_inputs['int_rate'] / 100.0

# Define reward (r)
# r = 0 (Deny) is implied, we only care about the 'Approve' action
# If action == Approve and Fully Paid (target=0): reward = profit_pct
# If action == Approve and Defaulted (target=1): reward = -1 (100% loss of principal)
# np.where evaluates this for all rows at once instead of a row-wise apply.
df_filtered['reward'] = np.where(
    df_filtered['target'] == 0,
    df_filtered['profit_pct'],
    -1.0,
)

print(f"Target variable 'target' created. Distribution:\n{df_filtered['target'].value_counts(normalize=True)}")
print(f"Reward variable 'reward' created. Mean reward: {df_filtered['reward'].mean():.4f}")

Filtering for terminal loan statuses...
Filtered size (terminal loans only): 176083 rows
Engineering Target (y) and Reward (r) variables...
Target variable 'target' created. Distribution:
target
0    0.800713
1    0.199287
Name: proportion, dtype: float64
Reward variable 'reward' created. Mean reward: -0.1075


In [13]:
# --- 6. Build Preprocessing Pipeline ---
print("Building preprocessing pipeline...")

# Numeric columns: median-impute missing values, then apply StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories unseen during fit are
# ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each feature group to its sub-pipeline; any column not listed in
# either group is dropped.
column_transformers = [
    ('num', numeric_transformer, NUMERIC_FEATURES),
    ('cat', categorical_transformer, CATEGORICAL_FEATURES),
]
preprocessor = ColumnTransformer(column_transformers, remainder='drop')


Building preprocessing pipeline...


In [14]:
# --- 7. Split, Process, and Save Data ---
print("Splitting data into train and test sets...")
# Features, binary target and per-loan reward, all sharing df_filtered's index.
X, y, r = df_filtered[ALL_FEATURES], df_filtered['target'], df_filtered['reward']

# 80/20 split, stratified on the target, with a fixed seed so it is repeatable.
split_parts = train_test_split(X, y, r, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test, r_train, r_test = split_parts

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Learn imputation/scaling/encoding statistics from the training split only.
print("Fitting preprocessor on training data...")
preprocessor.fit(X_train)

# Output locations for the fitted pipeline and the two processed splits.
preprocessor_path, train_data_path, test_data_path = (
    os.path.join(SAVE_DIR, filename)
    for filename in (
        'preprocessor.joblib',
        'processed_data_train.npz',
        'processed_data_test.npz',
    )
)

# Persist the fitted preprocessor.
joblib.dump(preprocessor, preprocessor_path)
print(f"Fitted preprocessor saved to '{preprocessor_path}'")

# Apply the already-fitted preprocessor to each split (train first, then test).
print("Transforming train and test data...")
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

# Output column names after transformation, kept for debugging/inspection.
feature_names = preprocessor.get_feature_names_out()
print(f"Data transformed. Total features after processing: {len(feature_names)}")


Splitting data into train and test sets...
Training set size: 140866
Test set size: 35217
Fitting preprocessor on training data...
Fitted preprocessor saved to '/content/drive/MyDrive/shodhAI/preprocessor.joblib'
Transforming train and test data...
Data transformed. Total features after processing: 33


In [15]:
# --- 8. Save Processed Data for Models ---
print(f"Saving processed data to {SAVE_DIR}...")

# Write one compressed .npz archive per split, each holding the processed
# feature matrix (X), the target (y) and the reward (r). Train is written
# first, then test.
for archive_path, features, labels, rewards in (
    (train_data_path, X_train_processed, y_train, r_train),
    (test_data_path, X_test_processed, y_test, r_test),
):
    np.savez_compressed(
        archive_path,
        X=features,
        y=labels.values,
        r=rewards.values,
    )

# Summary of the artifacts produced by this notebook.
print("\n--- Preprocessing Task 1 Complete ---")
print("We now have the following files in your Google Drive /shodhAI/ folder:")
print(f"1. '{preprocessor_path}' (our fitted pipeline)")
print(f"2. '{train_data_path}' (for training models)")
print(f"3. '{test_data_path}' (for evaluating models)")



Saving processed data to /content/drive/MyDrive/shodhAI...

--- Preprocessing Task 1 Complete ---
We now have the following files in your Google Drive /shodhAI/ folder:
1. '/content/drive/MyDrive/shodhAI/preprocessor.joblib' (our fitted pipeline)
2. '/content/drive/MyDrive/shodhAI/processed_data_train.npz' (for training models)
3. '/content/drive/MyDrive/shodhAI/processed_data_test.npz' (for evaluating models)
