In [20]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
# loading the data
df = pd.read_csv("loan.csv")
df.shape
df_raw = df.copy()

  df = pd.read_csv("loan.csv")


In [6]:
# -------------------------------------------------------------
# Step 1: Create the 3-class target (common for both pipelines)
# -------------------------------------------------------------
def create_target(df):
    """
    Creates the 3-class target:
      - paid_on_time
      - paid_late
      - not_paid

    Also computes helper columns needed to classify paid_late correctly.
    """
    df = df.copy()

    df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%Y', errors='coerce')
    df['last_pymnt_d'] = pd.to_datetime(df['last_pymnt_d'], format='%b-%Y', errors='coerce')

    # Extract loan term in months
    df['term_months'] = df['term'].astype(str).str.extract(r'(\d+)').astype(float)

    # Approximate expected end date
    df['expected_end'] = df['issue_d'] + pd.to_timedelta(df['term_months'] * 30, unit='D')

    # Late fully paid flag
    df['paid_late_flag'] = (
        (df['loan_status'] == 'Fully Paid') &
        (df['last_pymnt_d'] > df['expected_end'])
    )

    # Build target variable
    df['target_3class'] = 'paid_on_time'
    df.loc[df['paid_late_flag'], 'target_3class'] = 'paid_late'
    df.loc[df['loan_status'].isin(['Charged Off', 'Default']), 'target_3class'] = 'not_paid'
    # Remove '(future leakage + breaks categorical encoding)
    df = df.drop(columns=['next_pymnt_d', 'paid_late_flag', 'last_pymnt_d'], errors='ignore')


    return df[df['target_3class'].notna()].reset_index(drop=True)


In [None]:
# =============================================================
# FULL CLEAN PIPELINE (30 variables clean)
# =============================================================
leakage_columns = [
    'hardship_flag','debt_settlement_flag','total_pymnt','total_rec_prncp',
    'total_rec_int','total_rec_late_fee','last_pymnt_d','last_pymnt_amnt',
    'recoveries','collection_recovery_fee','out_prncp','total_pymnt_inv',
    'out_prncp_inv','loan_status'
]

high_corr_drop_columns = [
    'funded_amnt','funded_amnt_inv','installment',
    'num_rev_tl_bal_gt_0','tot_hi_cred_lim',
    'total_il_high_credit_limit','num_sats'
]

redundant_columns = ['policy_code','disbursement_method','chargeoff_within_12_mths', 'initial_list_status']


def basic_clean1(df):
    """
    First stage of full clean:
    - Remove leakage, high correlation, and redundant columns
    - Drop columns with >90% missing
    - Clean term and emp_length
    - Remove rows with missing target
    """
    clean_df = df.copy()

    cols_to_drop = [c for c in (leakage_columns + high_corr_drop_columns + redundant_columns) 
                    if c in clean_df.columns]
    clean_df = clean_df.drop(columns=cols_to_drop)

    # Drop columns with excessive missingness
    missing_pct = clean_df.isnull().mean() * 100
    high_missing_cols = missing_pct[missing_pct > 90].index.tolist()
    clean_df = clean_df.drop(columns=high_missing_cols)

    # Remove "Not Verified"
    clean_df = clean_df[clean_df['verification_status'] != 'Not Verified']
    clean_df = clean_df.drop(columns=['verification_status'], errors='ignore')

    # Clean term (36/60)
    clean_df['term'] = clean_df['term'].astype(str).str.extract(r'(\d+)').astype(float)

    # Clean employee length
    emp = clean_df['emp_length'].astype(str)
    emp = emp.str.replace('< 1', '0', regex=False)
    clean_df['emp_length'] = emp.str.extract(r'(\d+)').astype(float)

    return clean_df[clean_df['target_3class'].notna()].reset_index(drop=True)


def basic_clean2(df):
    """
    Second stage of full clean:
    - Remove non-informative columns
    - Normalize text columns
    - Create engineered features (purpose_grouped, home_stability, credit_age_years, etc.)
    - Remove raw columns after feature engineering
    """
    clean_df = df.copy()

    # Non-informative
    for col in ['pymnt_plan','zip_code','collections_12_mths_ex_med']:
        clean_df = clean_df.drop(columns=col, errors='ignore')

    # Lowercase text columns
    for col in ['purpose','home_ownership','addr_state','application_type','emp_title']:
        if col in clean_df:
            clean_df[col] = clean_df[col].astype(str).str.lower().str.strip()

    # Grouped purpose variable
    clean_df['purpose_grouped'] = clean_df['purpose'].replace({
        'debt_consolidation':'debt','credit_card':'debt',
        'home_improvement':'housing','house':'housing',
        'small_business':'business',
        'car':'personal','medical':'personal','vacation':'personal','moving':'personal',
        'wedding':'personal','major_purchase':'personal',
        'renewable_energy':'other','educational':'other','other':'other'
    })

    # Home stability
    clean_df['home_stability'] = clean_df['home_ownership'].replace({
        'mortgage':'stable','own':'stable',
        'rent':'unstable','none':'unstable','other':'unstable'
    })

    # Credit age
    years = clean_df['earliest_cr_line'].astype(str).str.extract(r'(\d{4})')[0].astype(float)
    clean_df['credit_age_years'] = datetime.now().year - years

    # Create engineered count-like fields
    clean_df['bad_records_count'] = clean_df[['pub_rec','pub_rec_bankruptcies','tax_liens']].sum(axis=1, min_count=1)
    clean_df['recent_credit_activity'] = (
        clean_df['inq_last_6mths'] + clean_df['num_tl_op_past_12m'] - (clean_df['mths_since_recent_inq']/12)
    )

    clean_df['total_balance_all'] = clean_df['tot_cur_bal'] + clean_df['total_bal_il']
    clean_df['active_credit_accounts'] = clean_df['num_actv_bc_tl'] + clean_df['num_actv_rev_tl']

    # Binary delinquency flag
    clean_df['any_delinquency'] = (
        (clean_df['num_accts_ever_120_pd']>0) |
        (clean_df['num_tl_120dpd_2m']>0) |
        (clean_df['num_tl_90g_dpd_24m']>0) |
        (clean_df['num_tl_30dpd']>0) |
        (clean_df['delinq_2yrs']>0)
    ).astype(int)

    clean_df['is_joint_app'] = clean_df['application_type'].str.contains('joint').astype(int)
    clean_df['has_current_delinquency'] = (clean_df['acc_now_delinq']>0).astype(int)
    clean_df['has_collections'] = (clean_df['tot_coll_amt']>0).astype(int)

    # Drop raw columns after creating engineered features
    columns_to_remove = [
        'purpose','home_ownership','earliest_cr_line','application_type',
        'acc_now_delinq','tot_coll_amt','title','emp_title',
        'addr_state','issue_d','last_credit_pull_d','loan_status',
        'days_late','open_acc','revol_bal','pub_rec','pub_rec_bankruptcies',
        'tax_liens','inq_last_6mths','num_tl_op_past_12m','mths_since_recent_inq',
        'tot_cur_bal','total_bal_il','num_actv_bc_tl','num_actv_rev_tl',
        'num_accts_ever_120_pd','num_tl_120dpd_2m','num_tl_90g_dpd_24m',
        'delinq_2yrs','num_tl_30dpd'
    ]

    clean_df = clean_df.drop(columns=[c for c in columns_to_remove if c in clean_df.columns])

    return clean_df



def basic_clean3(df):
    """
    Final stage of full clean:
    Removes noisy, sparse, redundant, or low-importance features.
    """
    clean_df = df.copy()

    low_importance = [
        'open_act_il','open_il_12m','open_il_24m','open_rv_12m','open_rv_24m',
        'open_acc_6m','inq_fi','total_cu_tl','acc_open_past_24mths'
    ]

    redundant = [
        'num_bc_sats','num_bc_tl','num_il_tl','num_op_rev_tl',
        'num_rev_accts','total_acc'
    ]

    sparse = [
        'mths_since_last_record','mths_since_recent_bc_dlq',
        'mths_since_recent_revol_delinq','percent_bc_gt_75'
    ]

    noisy = [
        'mo_sin_old_il_acct','mo_sin_old_rev_tl_op',
        'mo_sin_rcnt_rev_tl_op','mo_sin_rcnt_tl',
        'mths_since_rcnt_il'
    ]

    optional = [
        'mths_since_last_delinq','avg_cur_bal','max_bal_bc','all_util','il_util',
        'inq_last_12m','pct_tl_nvr_dlq','mort_acc','total_bc_limit','total_acc'
    ]

    to_drop = low_importance + redundant + sparse + noisy + optional
    clean_df = clean_df.drop(columns=[c for c in to_drop if c in clean_df.columns])

    return clean_df



def full_clean(df):
    """
    Applies the full 3-stage cleaning process:
    basic_clean1 → basic_clean2 → basic_clean3
    """
    df1 = basic_clean1(df)
    df2 = basic_clean2(df1)
    df3 = basic_clean3(df2)
    return df3

In [27]:
# =============================================================
# MINIMAL CLEAN PIPELINE
# Extracted from: data minimal clean.py
# =============================================================
def minimal_clean1(df):
    """
    Minimal clean:
    - Remove leakage
    - Remove columns with >90% missing
    - Remove one feature from each high-correlation (>0.95) pair
    """
    clean_df = df.copy()

    leakage = [
        'hardship_flag','debt_settlement_flag','total_pymnt','total_rec_prncp',
        'total_rec_int','total_rec_late_fee','last_pymnt_d','last_pymnt_amnt',
        'recoveries','collection_recovery_fee','out_prncp','total_pymnt_inv',
        'out_prncp_inv','loan_status','paid_late_flag'
    ]
    clean_df = clean_df.drop(columns=[c for c in leakage if c in clean_df.columns])

    # Drop >90% missing
    missing_pct = clean_df.isnull().mean() * 100
    high_missing = missing_pct[missing_pct > 90].index.tolist()
    clean_df = clean_df.drop(columns=high_missing)

    # High-correlation removal
    num = clean_df.select_dtypes(include='number').columns
    if len(num) > 1:
        corr = clean_df[num].corr().abs()
        to_drop = set()

        missing = clean_df[num].isnull().mean()
        var = clean_df[num].var()

        for i, c1 in enumerate(num):
            for j, c2 in enumerate(num):
                if j <= i:
                    continue
                if corr.loc[c1, c2] > 0.95:
                    if missing[c1] > missing[c2]:
                        to_drop.add(c1)
                    elif missing[c2] > missing[c1]:
                        to_drop.add(c2)
                    else:
                        to_drop.add(c1 if var[c1] < var[c2] else c2)

        clean_df = clean_df.drop(columns=list(to_drop))

    return clean_df



def minimal_clean2(df):
    """
    Additional minimal cleaning:
    Convert term and emp_length to numeric, extract issue year.
    """
    clean_df = df.copy()

    clean_df['term'] = clean_df['term'].astype(str).str.extract(r'(\d+)').astype(float)

    emp = clean_df['emp_length'].astype(str)
    emp = emp.str.replace('< 1', '0', regex=False)
    clean_df['emp_length'] = emp.str.extract(r'(\d+)').astype(float)

    clean_df['issue_year'] = clean_df['issue_d'].astype(str).str.extract(r'(\d{4})')[0].astype(float)

    return clean_df



def minimal_clean(df):
    """
    Applies minimal_clean1 → minimal_clean2
    """
    df1 = minimal_clean1(df)
    df2 = minimal_clean2(df1)
    return df2[df2["target_3class"].notna()].reset_index(drop=True)

In [28]:
def make_preprocess(df):
    """
    Builds preprocessing:
      - Time since event columns (mths_since_*):
          * Impute missing with 0  (interpreted as 'no event')
          * Add missing indicator
          * Scale
      - Other numeric columns:
          * Impute missing with median
          * Add missing indicator
          * Scale
      - Categorical columns (including ordinal-like text):
          * Impute missing with string 'missing'
          * One-hot encode (missing becomes its own category)
    """

    # --- 1. Identify column groups ---
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    time_cols = [col for col in numeric_cols if col.startswith("mths_since_")]
    num_regular = list(set(numeric_cols) - set(time_cols))
    categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()

    # --- 2. Define pipelines for each group ---
    time_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)),
        ("scaler", StandardScaler())
    ])

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # --- 3. Combine all in ColumnTransformer ---

    transformer = ColumnTransformer(
        transformers=[
            ("time", time_pipeline, time_cols),
            ("num", numeric_pipeline, num_regular),
            ("cat", categorical_pipeline, categorical_cols),
        ],
        remainder="drop" 
    )

    return transformer


In [29]:
def build_model_pipeline(preprocess):
    """
    Combines preprocessing + RandomForest classifier into a single Pipeline.
    """
    model = RandomForestClassifier(n_estimators=300, random_state=42)

    pipe = Pipeline([
        ("preprocess", preprocess),
        ("clf", model)
    ])

    return pipe

In [30]:
def run_experiment(df, title, model_type):
    """
    Runs training + test split + preprocessing + model training.
    model_type: "random_forest", "logistic", "xgboost"
    """
    print(f"\n=== Running: {title} ({model_type}) ===")

    # Split into features and target
    X = df.drop(columns=["target_3class"])
    y = df["target_3class"]
    
        # Convert target to numeric for XGBoost
    if model_type == "xgboost":
        y = y.astype("category").cat.codes

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=42,
    )


    # Build preprocessing based only on the feature matrix
    preprocess = make_preprocess(X_train)

    # Choose model
    if model_type == "random_forest":
        model = RandomForestClassifier(
            n_estimators=200,
            max_depth=None,
            n_jobs=-1,
            random_state=42
        )

    elif model_type == "xgboost":
        model = XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            tree_method="hist"   # FAST + GPU compatible
        )

    elif model_type == "logistic":
        model = LogisticRegression(max_iter=500)

    # Create pipeline
    pipe = Pipeline([
        ("pre", preprocess),
        ("model", model)
    ])

    # Fit
    pipe.fit(X_train, y_train)

    # ---------- Evaluation ----------
    # תחזיות על קבוצת ה-test
    y_pred = pipe.predict(X_test)

    # 1. Accuracy – אחוז הדוגמאות שנחזו נכון
    acc = accuracy_score(y_test, y_pred)

    # 2. Macro F1 – ממוצע F1 לכל המחלקות
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print("Accuracy:", acc)
    print("Macro F1:", macro_f1)

    # 3. Confusion Matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # 4. Classification Report – Precision / Recall / F1 לכל מחלקה
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return acc, pipe


In [None]:
# =============================================================
# Run 3 models on both Full and Minimal datasets
# =============================================================
df_with_target = create_target(df_raw)

df_sample = df_with_target.sample(100000, random_state=42)

df_full_sample = full_clean(df_sample.copy())
df_minimal_sample = minimal_clean(df_sample.copy())


models = ["random_forest", "xgboost", "logistic"]

results = {}

for model_name in models:
    print("\n=============================================================")
    print(f"MODEL: {model_name.upper()}")
    print("=============================================================")

    # ---- FULL CLEAN ----
    acc_full, model_full = run_experiment(
        df_full_sample,
        f"Full Clean Sample - {model_name}",
        model_type=model_name
    )

    # ---- MINIMAL CLEAN ----
    acc_minimal, model_minimal = run_experiment(
        df_minimal_sample,
        f"Minimal Clean Sample - {model_name}",
        model_type=model_name
    )

    # Save results
    results[(model_name, "full")] = acc_full
    results[(model_name, "minimal")] = acc_minimal


# =============================================================
# Print Summary Table
# =============================================================
print("\n\n==================== SUMMARY ====================\n")
print("{:<20} {:<15} {:<15}".format("Model", "Full Clean", "Minimal Clean"))
print("-" * 50)

for model_name in models:
    print("{:<20} {:<15.4f} {:<15.4f}".format(
        model_name,
        results[(model_name, "full")],
        results[(model_name, "minimal")]
    ))


MODEL: RANDOM_FOREST

=== Running: Full Clean Sample - random_forest (random_forest) ===
Accuracy: 0.7725513545698125
Macro F1: 0.3399933140145713

Confusion Matrix:
[[   29    31  1752]
 [   11    84  1132]
 [   40    90 10267]]

Classification Report:
              precision    recall  f1-score   support

    not_paid       0.36      0.02      0.03      1812
   paid_late       0.41      0.07      0.12      1227
paid_on_time       0.78      0.99      0.87     10397

    accuracy                           0.77     13436
   macro avg       0.52      0.36      0.34     13436
weighted avg       0.69      0.77      0.69     13436


=== Running: Minimal Clean Sample - random_forest (random_forest) ===
Accuracy: 0.79405
Macro F1: 0.3172369946905172

Confusion Matrix:
[[   62     4  2240]
 [    2    14  1849]
 [   11    13 15805]]

Classification Report:
              precision    recall  f1-score   support

    not_paid       0.83      0.03      0.05      2306
   paid_late       0.45      0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.774188746650789
Macro F1: 0.35031498946163175

Confusion Matrix:
[[   61    26  1725]
 [   13    81  1133]
 [   76    61 10260]]

Classification Report:
              precision    recall  f1-score   support

    not_paid       0.41      0.03      0.06      1812
   paid_late       0.48      0.07      0.12      1227
paid_on_time       0.78      0.99      0.87     10397

    accuracy                           0.77     13436
   macro avg       0.56      0.36      0.35     13436
weighted avg       0.70      0.77      0.69     13436


=== Running: Minimal Clean Sample - logistic (logistic) ===


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.79915
Macro F1: 0.5374162434009911

Confusion Matrix:
[[  699    70  1537]
 [   71   503  1291]
 [  401   647 14781]]

Classification Report:
              precision    recall  f1-score   support

    not_paid       0.60      0.30      0.40      2306
   paid_late       0.41      0.27      0.33      1865
paid_on_time       0.84      0.93      0.88     15829

    accuracy                           0.80     20000
   macro avg       0.62      0.50      0.54     20000
weighted avg       0.77      0.80      0.78     20000




Model                Full Clean      Minimal Clean  
--------------------------------------------------
random_forest        0.7726          0.7941         
xgboost              0.7746          0.8155         
logistic             0.7742          0.7992         


In [None]:
df_full_sample["target_3class"].value_counts()
df_full_sample["target_3class"].value_counts(normalize=True)

target_3class
paid_on_time    0.77121
not_paid        0.13740
paid_late       0.09139
Name: proportion, dtype: float64

In [19]:
df_minimal_sample["target_3class"].value_counts()
df_minimal_sample["target_3class"].value_counts(normalize=True)

target_3class
paid_on_time    0.79142
not_paid        0.11531
paid_late       0.09327
Name: proportion, dtype: float64