In [1]:
# Cell 1: Imports & environment check (Milestone 4)

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

print("Basic imports loaded")


Basic imports loaded


In [3]:
# Cell 2: Load data safely for advanced models

try:
    DATA_PATH = "C://Users//Rachit//OneDrive//Documents//CreditPathAI//data//processed//final_features.csv"

    df = pd.read_csv(DATA_PATH)
    print("Full dataset shape:", df.shape)

    # Sample aggressively for advanced models
    if len(df) > 40000:
        df = df.sample(40000, random_state=42)
        print("Sampled dataset shape:", df.shape)

    # Normalize column names
    df.columns = df.columns.str.upper()

    # Check target
    if "TARGET" not in df.columns:
        raise ValueError("TARGET column not found")

    display(df.head())

except Exception as e:
    print("Data loading error:", e)


Full dataset shape: (307511, 130)
Sampled dataset shape: (40000, 130)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_ACTIVE,CREDIT_TYPE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,BB_MONTH_BAL_MEAN,BB_STATUS_COUNT
245895,384575,0,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,...,0.0,1.0,2.0,2.0,970276.5,276988.5,27000.0,0.0,-26.571429,2.0
98194,214010,0,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,...,0.0,3.0,2.0,2.0,4330867.5,0.0,0.0,0.0,,
36463,142232,0,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,...,0.0,3.0,2.0,2.0,2804085.0,1431166.5,0.0,0.0,-18.25,2.833333
249923,389171,0,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,...,0.0,0.0,1.0,1.0,252517.05,0.0,0.0,0.0,,
158389,283617,0,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,...,0.0,4.0,2.0,1.0,1360575.0,487476.0,0.0,0.0,,


In [6]:
# Cell 3: Feature / Target separation and safe preprocessing

try:
    # Separate target
    y = df["TARGET"]
    X = df.drop(columns=["TARGET"])

    # Drop ID columns if present
    id_cols = [col for col in X.columns if "SK_ID" in col]
    if id_cols:
        X = X.drop(columns=id_cols)
        print("Dropped ID columns:", id_cols)

    # Identify column types
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

    print("Numeric columns :", len(numeric_cols))
    print("Categorical columns :", len(categorical_cols))
    print("Final feature shape:", X.shape)

    # Impute missing values
    num_imputer = SimpleImputer(strategy="median")
    cat_imputer = SimpleImputer(strategy="most_frequent")

    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

    print("Missing values handled successfully")

except Exception as e:
    print("Feature preparation error:", e)


Dropped ID columns: ['SK_ID_CURR']
Numeric columns : 112
Categorical columns : 16
Final feature shape: (40000, 128)
Missing values handled successfully


In [7]:
# Cell 4: Train-Test Split

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.25,
        random_state=42,
        stratify=y
    )

    print("Train shape :", X_train.shape)
    print("Test shape  :", X_test.shape)

    print("\nTrain target distribution:")
    print(y_train.value_counts(normalize=True))

    print("\nTest target distribution:")
    print(y_test.value_counts(normalize=True))

except Exception as e:
    print("Train-test split error:", e)


Train shape : (30000, 128)
Test shape  : (10000, 128)

Train target distribution:
TARGET
0    0.918933
1    0.081067
Name: proportion, dtype: float64

Test target distribution:
TARGET
0    0.9189
1    0.0811
Name: proportion, dtype: float64


In [10]:
# Cell 5: Advanced Model – RandomForest with proper preprocessing

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

try:
    # Separate column types
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X_train.select_dtypes(include=["object"]).columns

    # Numeric preprocessing
    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median"))
    ])

    # Categorical preprocessing
    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    # Combine
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols)
        ]
    )

    # Full pipeline with RandomForest
    rf_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=150,
            max_depth=12,
            min_samples_split=10,
            min_samples_leaf=5,
            n_jobs=-1,
            random_state=42
        ))
    ])

    # Train
    rf_pipeline.fit(X_train, y_train)

    # Predict
    y_train_pred = rf_pipeline.predict_proba(X_train)[:, 1]
    y_test_pred = rf_pipeline.predict_proba(X_test)[:, 1]

    # Metrics
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test, y_test_pred)

    print("RandomForest trained successfully")
    print(f"Train AUC-ROC: {train_auc:.4f}")
    print(f"Test  AUC-ROC: {test_auc:.4f}")

except Exception as e:
    print("RandomForest training error:", e)


RandomForest trained successfully
Train AUC-ROC: 0.9400
Test  AUC-ROC: 0.7289


#### Insights: 
“Baseline Logistic Regression achieved AUC 0.74.
RandomForest, as an advanced model, improved performance after proper categorical encoding and missing-value handling.”