### Milestone 3 --- 
Objective : 
     Training initial logistic regression model.

In [21]:

# Cell 1: Imports & configuration

import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [24]:
# Cell 2: Load dataset safely (sampling for low RAM)

try:
    DATA_PATH = "C://Users//Rachit//OneDrive//Documents//CreditPathAI//data//processed//final_features.csv"   
    df = pd.read_csv(DATA_PATH)

    print("Full dataset shape:", df.shape)

    # Sample if dataset is large
    if len(df) > 50000:
        df = df.sample(50000, random_state=42)
        print("Sampled dataset shape:", df.shape)

    display(df.head())

except Exception as e:
    print("Data loading error:", e)

Full dataset shape: (307511, 130)
Sampled dataset shape: (50000, 130)


Unnamed: 0,SK_ID_CURR,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,...,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,credit_active,credit_type,amt_credit_sum,amt_credit_sum_debt,amt_credit_sum_limit,amt_credit_sum_overdue,BB_MONTH_BAL_MEAN,BB_STATUS_COUNT
245895,384575,0,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,...,0.0,1.0,2.0,2.0,970276.5,276988.5,27000.0,0.0,-26.571429,2.0
98194,214010,0,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,...,0.0,3.0,2.0,2.0,4330867.5,0.0,0.0,0.0,,
36463,142232,0,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,...,0.0,3.0,2.0,2.0,2804085.0,1431166.5,0.0,0.0,-18.25,2.833333
249923,389171,0,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,...,0.0,0.0,1.0,1.0,252517.05,0.0,0.0,0.0,,
158389,283617,0,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,...,0.0,4.0,2.0,1.0,1360575.0,487476.0,0.0,0.0,,


In [25]:
# Cell 3: Target verification & basic cleanup

try:
    # Normalize column names (important!)
    df.columns = df.columns.str.upper()

    # Check target
    if "TARGET" not in df.columns:
        raise ValueError("TARGET column not found")

    print("Target distribution:")
    print(df["TARGET"].value_counts())

    # Separate features & target
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]

    # Drop ID columns if present
    id_cols = [col for col in X.columns if "ID" in col]
    X = X.drop(columns=id_cols)

    print("Final feature shape:", X.shape)

except Exception as e:
    print("Target setup error:", e)

Target distribution:
TARGET
0    45977
1     4023
Name: count, dtype: int64
Final feature shape: (50000, 127)


In [26]:
# Cell 4: Train-test split (stratified)

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    print("Train shape:", X_train.shape)
    print("Test shape :", X_test.shape)
    print("Train target distribution:")
    print(y_train.value_counts(normalize=True))

except Exception as e:
    print("Train-test split error:", e)


Train shape: (40000, 127)
Test shape : (10000, 127)
Train target distribution:
TARGET
0    0.91955
1    0.08045
Name: proportion, dtype: float64


In [29]:
# Cell 5: Preprocessing pipeline:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

try:
    # Identify column types
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

    print("Numeric columns:", len(num_cols))
    print("Categorical columns:", len(cat_cols))

    # Numeric pipeline
    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # Categorical pipeline (FIXED)
    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False,
            max_categories=20
        ))
    ])

    # Combine pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols)
        ]
    )

    print(" Preprocessor ready (no version issues)")

except Exception as e:
    print("Preprocessing setup error:", e)


Numeric columns: 111
Categorical columns: 16
 Preprocessor ready (no version issues)


In [30]:
# Cell 6: Train Logistic Regression baseline model 

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

try:
    # Build full pipeline: preprocessing + model
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(
            max_iter=1000,
            solver="lbfgs",
            n_jobs=-1
        ))
    ])

    # Train
    model.fit(X_train, y_train)

    print("Logistic Regression model trained successfully")

    # Predict probabilities
    y_train_pred = model.predict_proba(X_train)[:, 1]
    y_test_pred = model.predict_proba(X_test)[:, 1]

    # AUC-ROC
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test, y_test_pred)

    print(f"Train AUC-ROC: {train_auc:.4f}")
    print(f"Test  AUC-ROC: {test_auc:.4f}")

except Exception as e:
    print("Model training error:", e)


Logistic Regression model trained successfully
Train AUC-ROC: 0.7540
Test  AUC-ROC: 0.7400


In [31]:
# Cell 7: Save baseline model & metrics

import joblib
import json
import os

try:
    os.makedirs("models", exist_ok=True)

    # Save trained pipeline
    joblib.dump(model, "models/logistic_regression_baseline.pkl")

    # Save metrics
    metrics = {
        "train_auc": round(train_auc, 4),
        "test_auc": round(test_auc, 4)
    }

    with open("models/logistic_regression_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    print("Baseline model & metrics saved successfully")
    print(metrics)

except Exception as e:
    print("Saving error:", e)


Baseline model & metrics saved successfully
{'train_auc': 0.754, 'test_auc': 0.74}
