In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shodhh/accepted_2007_to_2018Q4.csv


In [23]:
import pandas as pd
import numpy as np
import os

# --- Configuration ---
print("--- [New Pipeline] Part 1: Loading & Initial Setup ---")
DATA_FILE_PATH = '/kaggle/input/shodhh/accepted_2007_to_2018Q4.csv' # Make sure this path is correct

# --- 1. Load Data ---
try:
    df_new = pd.read_csv(DATA_FILE_PATH, low_memory=False)
    # Drop rows where loan_status is missing
    df_new = df_new.dropna(subset=["loan_status"])
    print(f"✅ Full data loaded. Shape: {df_new.shape}")
except Exception as e:
    print(f"❌ Error loading data: {e}")
    df_new = pd.DataFrame() # Create empty DataFrame on error

if not df_new.empty:
    # --- 2. Sample Data (Replicating Notebook Step) ---
    print("\nSampling 100,000 rows...")
    sampled_df_new = df_new.sample(n=100000, random_state=42)
    print(f"Sampled data shape: {sampled_df_new.shape}")

    # --- 3. Define Target Variable (Notebook's Definition) ---
    print("Defining target variable 'loan_condition_int' (notebook definition)...")
    bad_loan_statuses_nb = [
        "Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off",
        "In Grace Period", "Late (16-30 days)", "Late (31-120 days)"
    ]
    sampled_df_new['loan_condition_int'] = sampled_df_new['loan_status'].apply(
        lambda status: 1 if status in bad_loan_statuses_nb else 0
    ).astype(int)
    print("Target variable defined.")
    print("Target distribution in sample:")
    print(sampled_df_new['loan_condition_int'].value_counts(normalize=True))

    # --- 4. Map emp_length ---
    print("\nMapping 'emp_length' to 'emp_length_int'...")
    emp_length_mapping_nb = {
        '10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7, '6 years': 6,
        '5 years': 5, '4 years': 4, '3 years': 3, '2 years': 2, '1 year': 1,
        '< 1 year': 0.5, 'n/a': 0
    }
    sampled_df_new['emp_length_int'] = sampled_df_new['emp_length'].map(emp_length_mapping_nb)

    # --- 5. Map Region ---
    print("Mapping 'addr_state' to 'region'...")
    state_to_region_nb = {
        'CA': 'West', 'OR': 'West', 'UT': 'West', 'WA': 'West', 'CO': 'West', 'NV': 'West',
        'AK': 'West', 'MT': 'West', 'HI': 'West', 'WY': 'West', 'ID': 'West', 'AZ': 'SouthWest',
        'TX': 'SouthWest', 'NM': 'SouthWest', 'OK': 'SouthWest', 'GA': 'SouthEast', 'NC': 'SouthEast',
        'VA': 'SouthEast', 'FL': 'SouthEast', 'KY': 'SouthEast', 'SC': 'SouthEast', 'LA': 'SouthEast',
        'AL': 'SouthEast', 'WV': 'SouthEast', 'DC': 'SouthEast', 'AR': 'SouthEast', 'DE': 'SouthEast',
        'MS': 'SouthEast', 'TN': 'SouthEast', 'IL': 'MidWest', 'MO': 'MidWest', 'MN': 'MidWest',
        'OH': 'MidWest', 'WI': 'MidWest', 'KS': 'MidWest', 'MI': 'MidWest', 'SD': 'MidWest',
        'IA': 'MidWest', 'NE': 'MidWest', 'IN': 'MidWest', 'ND': 'MidWest', 'CT': 'NorthEast',
        'NY': 'NorthEast', 'PA': 'NorthEast', 'NJ': 'NorthEast', 'RI': 'NorthEast', 'MA': 'NorthEast',
        'MD': 'NorthEast', 'VT': 'NorthEast', 'NH': 'NorthEast', 'ME': 'NorthEast'
    }
    sampled_df_new['region'] = sampled_df_new['addr_state'].map(state_to_region_nb)

    # Store for next step
    newpipe_step1_df = sampled_df_new
    print("\n✅ Initial setup complete.")

else:
    print("❌ Cannot proceed, data loading failed.")

--- [New Pipeline] Part 1: Loading & Initial Setup ---
✅ Full data loaded. Shape: (2260668, 151)

Sampling 100,000 rows...
Sampled data shape: (100000, 151)
Defining target variable 'loan_condition_int' (notebook definition)...
Target variable defined.
Target distribution in sample:
loan_condition_int
0    0.86628
1    0.13372
Name: proportion, dtype: float64

Mapping 'emp_length' to 'emp_length_int'...
Mapping 'addr_state' to 'region'...

✅ Initial setup complete.


In [24]:
import pandas as pd
import numpy as np

# Assuming 'newpipe_step1_df' is the sampled DataFrame from the previous step

print("--- [New Pipeline] Part 2: Data Cleaning (Exclusions) --")
print("--- (MODIFIED: 'loan_status' column is now KEPT) ---")


if 'newpipe_step1_df' in locals() or 'newpipe_step1_df' in globals():
    df_cleaning_new = newpipe_step1_df.copy()
    original_shape = df_cleaning_new.shape
    print(f"Shape before cleaning: {original_shape}")

    # --- 1. Remove 'Current' and 'Issued' loan_status --
    print("\nRemoving 'Current' and 'Issued' loan statuses...")
    initial_rows = len(df_cleaning_new)
    df_cleaning_new = df_cleaning_new[~df_cleaning_new['loan_status'].isin(['Current', 'Issued'])]
    rows_removed = initial_rows - len(df_cleaning_new)
    print(f"Removed {rows_removed} rows. New shape: {df_cleaning_new.shape}")

    # --- 2. Drop columns with > 80% missing values --
    print("\nDropping columns with > 80% missing values...")
    initial_cols = df_cleaning_new.shape[1]
    # Keep columns with at least 20% non-missing data
    df_cleaning_new = df_cleaning_new.dropna(axis=1, thresh=int(0.20 * len(df_cleaning_new)))
    cols_dropped = initial_cols - df_cleaning_new.shape[1]
    print(f"Dropped {cols_dropped} columns. New shape: {df_cleaning_new.shape}")

    # --- 3. Drop direct indicator columns (as defined in notebook) --
    print("\nDropping direct indicator columns...")
    direct_indicators_nb = [
        'collection_recovery_fee', 'last_pymnt_amnt', 'out_prncp', 'out_prncp_inv',
        'recoveries', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int',
        'total_rec_late_fee', 'total_rec_prncp', 'next_pymnt_d'
    ]
    # Ensure columns exist before dropping
    direct_indicators_to_drop = [col for col in direct_indicators_nb if col in df_cleaning_new.columns]
    df_cleaning_new.drop(columns=direct_indicators_to_drop, inplace=True, errors='ignore')
    print(f"Dropped {len(direct_indicators_to_drop)} indicator columns. New shape: {df_cleaning_new.shape}")


    # --- 4. Drop repetitive/useless object columns (as defined in notebook) --
    print("\nDropping repetitive/useless object columns...")
    
    # --- [MODIFICATION] ---
    # 'loan_status' has been REMOVED from this list. We need it.
    misc_cols_to_drop_nb = [
        'emp_length', # Keeping emp_length_int
        'id', 'emp_title', 'url', 'title', 'zip_code',
        # 'loan_status', # <-- KEPT FOR RL REWARD
        'addr_state' # Keep region
    ]
     # Ensure columns exist before dropping
    misc_cols_to_drop = [col for col in misc_cols_to_drop_nb if col in df_cleaning_new.columns]
    df_cleaning_new.drop(columns=misc_cols_to_drop, inplace=True, errors='ignore')
    print(f"Dropped {len(misc_cols_to_drop)} misc columns. New shape: {df_cleaning_new.shape}")

    # Store for next step
    newpipe_step2_df = df_cleaning_new
    print("\n✅ Exclusion steps complete.")

else:
    print("❌ Error: 'newpipe_step1_df' not found. Please re-run Part 1.")

--- [New Pipeline] Part 2: Data Cleaning (Exclusions) --
--- (MODIFIED: 'loan_status' column is now KEPT) ---
Shape before cleaning: (100000, 154)

Removing 'Current' and 'Issued' loan statuses...
Removed 38822 rows. New shape: (61178, 154)

Dropping columns with > 80% missing values...
Dropped 40 columns. New shape: (61178, 114)

Dropping direct indicator columns...
Dropped 10 indicator columns. New shape: (61178, 104)

Dropping repetitive/useless object columns...
Dropped 7 misc columns. New shape: (61178, 97)

✅ Exclusion steps complete.


In [25]:
import pandas as pd
import numpy as np

# Assuming 'newpipe_step2_df' is the DataFrame after the exclusion steps

print("--- [New Pipeline] Part 3: Missing Value Imputation ---")

if 'newpipe_step2_df' in locals() or 'newpipe_step2_df' in globals():
    fillna_df_new = newpipe_step2_df.copy()
    print(f"Shape before imputation: {fillna_df_new.shape}")
    print(f"Total missing values before: {fillna_df_new.isnull().sum().sum()}")

    # --- Impute Object Columns (Mode by Region) ---
    print("\nImputing object columns by region mode...")
    object_cols_to_impute = ["last_pymnt_d", "last_credit_pull_d"]
    for column in object_cols_to_impute:
        if column in fillna_df_new.columns:
            # Calculate mode for each region
            mode_map = fillna_df_new.groupby("region")[column].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
            # Fill NaNs using the map
            fillna_df_new[column] = fillna_df_new.apply(lambda row: mode_map[row['region']] if pd.isnull(row[column]) else row[column], axis=1)
            # Fallback for any regions that might have had only NaNs (fill with overall mode)
            overall_mode = fillna_df_new[column].mode()[0] if not fillna_df_new[column].mode().empty else 'Unknown'
            fillna_df_new[column].fillna(overall_mode, inplace=True)


    # --- Impute Numerical Columns (Median by Region) ---
    print("Imputing specific numerical columns by region median...")
    median_cols_to_impute = ["pub_rec", "total_acc", "emp_length_int"]
    for column in median_cols_to_impute:
        if column in fillna_df_new.columns:
            fillna_df_new[column] = fillna_df_new.groupby("region")[column].transform(lambda x: x.fillna(x.median()))
            # Fallback for any remaining NaNs (e.g., if a whole region was NaN)
            fillna_df_new[column].fillna(fillna_df_new[column].median(), inplace=True)


    # --- Impute Numerical Columns (Mean by Region) ---
    print("Imputing specific numerical columns by region mean...")
    mean_cols_to_impute = ["annual_inc", "delinq_2yrs"]
    for column in mean_cols_to_impute:
         if column in fillna_df_new.columns:
            fillna_df_new[column] = fillna_df_new.groupby("region")[column].transform(lambda x: x.fillna(x.mean()))
            # Fallback for any remaining NaNs
            fillna_df_new[column].fillna(fillna_df_new[column].mean(), inplace=True)

    # --- Fill Remaining NaNs with Zero (as per notebook) ---
    print("Filling all remaining NaNs with 0...")
    initial_nan_count = fillna_df_new.isnull().sum().sum()
    fillna_df_new.fillna(0, inplace=True)
    final_nan_count = fillna_df_new.isnull().sum().sum()
    print(f"Filled {initial_nan_count - final_nan_count} remaining NaN values.")

    # Store for next step
    newpipe_step3_df = fillna_df_new
    print(f"\n✅ Imputation complete. Final shape: {newpipe_step3_df.shape}")
    print(f"Total missing values after: {newpipe_step3_df.isnull().sum().sum()}")

else:
    print("❌ Error: 'newpipe_step2_df' not found. Please re-run Part 2.")

--- [New Pipeline] Part 3: Missing Value Imputation ---
Shape before imputation: (61178, 97)
Total missing values before: 784318

Imputing object columns by region mode...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_df_new[column].fillna(overall_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_df_new[column].fillna(overall_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Imputing specific numerical columns by region median...
Imputing specific numerical columns by region mean...
Filling all remaining NaNs with 0...
Filled 780549 remaining NaN values.

✅ Imputation complete. Final shape: (61178, 97)
Total missing values after: 0


In [26]:
import pandas as pd
import numpy as np

# Assuming 'newpipe_step3_df' is the DataFrame after imputation

print("--- [New Pipeline] Part 4: Removing Outliers ---")

if 'newpipe_step3_df' in locals() or 'newpipe_step3_df' in globals():
    RemoveOutlier_df_new = newpipe_step3_df.copy()
    print(f"Shape before removing outliers: {RemoveOutlier_df_new.shape}")

    # Apply custom thresholds as used in the notebook
    initial_rows = len(RemoveOutlier_df_new)

    if 'annual_inc' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['annual_inc'] <= 250000]
    if 'dti' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['dti'] <= 50]
    if 'open_acc' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['open_acc'] <= 40]
    if 'total_acc' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['total_acc'] <= 80]
    if 'revol_util' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['revol_util'] <= 120]
    if 'revol_bal' in RemoveOutlier_df_new.columns:
        RemoveOutlier_df_new = RemoveOutlier_df_new[RemoveOutlier_df_new['revol_bal'] <= 250000]

    # Reset index after filtering
    RemoveOutlier_df_new.reset_index(drop=True, inplace=True)

    rows_removed = initial_rows - len(RemoveOutlier_df_new)
    print(f"Removed {rows_removed} rows due to outlier thresholds.")
    print(f"Shape after removing outliers: {RemoveOutlier_df_new.shape}")

    # Store for next step
    newpipe_step4_df = RemoveOutlier_df_new
    print("\n✅ Outlier removal complete.")

else:
    print("❌ Error: 'newpipe_step3_df' not found. Please re-run Part 3.")

--- [New Pipeline] Part 4: Removing Outliers ---
Shape before removing outliers: (61178, 97)
Removed 925 rows due to outlier thresholds.
Shape after removing outliers: (60253, 97)

✅ Outlier removal complete.


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
# Ensure category_encoders is installed
try:
    from category_encoders import TargetEncoder
    print("✅ category_encoders imported successfully.")
except ImportError:
    print("Warning: category_encoders not found. Attempting install...")
    try:
        import sys
        # Need to ensure scikit-learn is compatible first, as per previous errors
        !{sys.executable} -m pip install scikit-learn==1.5.2 --force-reinstall --quiet
        !{sys.executable} -m pip install category_encoders --quiet
        from category_encoders import TargetEncoder
        print("✅ Installation successful.")
    except Exception as e:
        print(f"❌ Error installing category_encoders: {e}")
        TargetEncoder = None
from sklearn.preprocessing import StandardScaler

# Assuming 'newpipe_step4_df' is the DataFrame after outlier removal
# Assuming 'loan_condition_int' is the target column name

print("--- [New Pipeline] Part 5: Feature Engineering ---")

if 'newpipe_step4_df' in locals() or 'newpipe_step4_df' in globals():
    if TargetEncoder is None:
         print("❌ Cannot proceed, TargetEncoder failed to import or install.")
    else:
        FE_df_new = newpipe_step4_df.copy()
        target_col_nb = 'loan_condition_int'

        # --- 1. Identify Feature Types ---
        original_cols_fe = FE_df_new.columns.tolist()
        cat_cols_fe = FE_df_new.select_dtypes(include=['object']).columns.tolist()
        # Exclude the target variable from numerical columns
        num_cols_fe = FE_df_new.select_dtypes(exclude=['object']).columns.drop(target_col_nb, errors='ignore').tolist()

        # Separate categorical into binary and multi-category
        dual_cat_cols_fe = [col for col in cat_cols_fe if FE_df_new[col].nunique() <= 2]
        multi_cat_cols_fe = [col for col in cat_cols_fe if FE_df_new[col].nunique() > 2]

        print(f"Numerical columns found: {len(num_cols_fe)}")
        print(f"Binary categorical columns: {dual_cat_cols_fe}")
        print(f"Multi-categorical columns: {multi_cat_cols_fe}")

        # --- 2. Binary Encoding (get_dummies) ---
        print("\nApplying Binary Encoding (get_dummies)...")
        FE_df_new = pd.get_dummies(FE_df_new, columns=dual_cat_cols_fe, drop_first=True)
        # Get names of newly created binary columns
        new_binary_cols = [col for col in FE_df_new.columns if col not in original_cols_fe and col != target_col_nb]
        print(f"Created {len(new_binary_cols)} new binary columns.")

        # --- 3. Train/Test Split (Stratified) ---
        print("\nSplitting data into training (80%) and test (20%) sets...")
        stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

        # Perform the split
        for train_idx, test_idx in stratified_split.split(FE_df_new, FE_df_new[target_col_nb]):
            train_df_nb = FE_df_new.loc[train_idx]
            test_df_nb = FE_df_new.loc[test_idx]

        # Separate features and target
        train_y_nb = train_df_nb[[target_col_nb]]
        test_y_nb = test_df_nb[[target_col_nb]]
        train_X_nb = train_df_nb.drop(target_col_nb, axis=1)
        test_X_nb = test_df_nb.drop(target_col_nb, axis=1)

        print(f"Training set shape: X={train_X_nb.shape}, y={train_y_nb.shape}")
        print(f"Test set shape:     X={test_X_nb.shape}, y={test_y_nb.shape}")

        # --- 4. Target Encoding ---
        print("\nApplying Target Encoding...")
        # Ensure only existing multi-cat columns are processed
        multi_cat_cols_to_encode = [col for col in multi_cat_cols_fe if col in train_X_nb.columns]
        target_encoder_nb = TargetEncoder(cols=multi_cat_cols_to_encode, smoothing=0.2) # Notebook used smoothing=0.2

        # Fit ONLY on training data
        target_encoder_nb.fit(train_X_nb, train_y_nb.values.ravel()) # .values.ravel() converts to 1D array

        # Transform both train and test data
        train_X_encoded = target_encoder_nb.transform(train_X_nb)
        test_X_encoded = target_encoder_nb.transform(test_X_nb)
        print("Target Encoding applied.")
        # Store list of newly numerical columns from target encoding
        target_encoded_numeric_cols = multi_cat_cols_to_encode

        # --- 5. Normalization (StandardScaler) ---
        print("\nApplying Normalization (StandardScaler)...")
        scaler_nb = StandardScaler()

        # Identify all numerical columns for scaling (original + target encoded + new binary)
        cols_to_scale = num_cols_fe + target_encoded_numeric_cols + new_binary_cols
        # Filter out any columns that might have been dropped or don't exist
        cols_to_scale = [col for col in cols_to_scale if col in train_X_encoded.columns]


        # Fit ONLY on training data
        print(f"Fitting scaler on {len(cols_to_scale)} numerical features...")
        scaler_nb.fit(train_X_encoded[cols_to_scale])

        # Transform both train and test data (in place)
        train_X_scaled = train_X_encoded.copy()
        test_X_scaled = test_X_encoded.copy()

        train_X_scaled[cols_to_scale] = scaler_nb.transform(train_X_encoded[cols_to_scale])
        test_X_scaled[cols_to_scale] = scaler_nb.transform(test_X_encoded[cols_to_scale])
        print("Normalization applied.")

        # Store final datasets for next step
        newpipe_step5_train_X = train_X_scaled
        newpipe_step5_train_y = train_y_nb
        newpipe_step5_test_X = test_X_scaled
        newpipe_step5_test_y = test_y_nb

        print(f"\n✅ Feature Engineering complete.")
        print(f"Final training X shape: {newpipe_step5_train_X.shape}")
        print(f"Final test X shape: {newpipe_step5_test_X.shape}")

else:
    print("❌ Error: 'newpipe_step4_df' not found. Please re-run Part 4.")

✅ category_encoders imported successfully.
--- [New Pipeline] Part 5: Feature Engineering ---
Numerical columns found: 78
Binary categorical columns: ['term', 'pymnt_plan', 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'debt_settlement_flag']
Multi-categorical columns: ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'purpose', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d', 'region']

Applying Binary Encoding (get_dummies)...
Created 7 new binary columns.

Splitting data into training (80%) and test (20%) sets...
Training set shape: X=(48202, 96), y=(48202, 1)
Test set shape:     X=(12051, 96), y=(12051, 1)

Applying Target Encoding...
Target Encoding applied.

Applying Normalization (StandardScaler)...
Fitting scaler on 96 numerical features...
Normalization applied.

✅ Feature Engineering complete.
Final training X shape: (48202, 96)
Final test X shape: (12051, 96)


In [28]:
# Install compatible scikit-learn
!pip install scikit-learn==1.5.2 --force-reinstall --quiet
# Install imbalanced-learn
!pip install imbalanced-learn --quiet

print("✅ Installation block complete. Please RESTART YOUR KERNEL now.")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.4 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.3 which is incompatible.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which 

In [29]:
import pandas as pd
import numpy as np
# We will try to import *after* the kernel restart
try:
    from imblearn.under_sampling import RandomUnderSampler
    print("✅ imblearn imported successfully.")
    _imblearn_installed = True
except ImportError:
    print("❌ Error: imbalanced-learn still not found or import failed.")
    print("Please ensure you RESTARTED THE KERNEL after running the install block.")
    _imblearn_installed = False
except Exception as e:
    print(f"❌ An unexpected error occurred during import: {e}")
    _imblearn_installed = False

# --- Setup ---
# Assuming newpipe_step5_train_X, newpipe_step5_train_y, etc. exist

print("--- [New Pipeline] Part 6: Applying Random Undersampling ---")

# Check if inputs exist and RandomUnderSampler is available
if _imblearn_installed:
    if 'newpipe_step5_train_X' not in locals() or 'newpipe_step5_train_y' not in locals():
        print("❌ Error: Input training data not found. Please ensure Steps 1-5 were rerun successfully after restarting.")
    else:
        # --- 1. Initialize Undersampler ---
        rus_nb = RandomUnderSampler(random_state=42, sampling_strategy='auto')

        # --- 2. Apply Undersampling ONLY to Training Data ---
        print(f"Original training data shape: X={newpipe_step5_train_X.shape}, y={newpipe_step5_train_y.shape}")
        print("Original training target distribution:")
        print(newpipe_step5_train_y['loan_condition_int'].value_counts()) # Access column in DataFrame

        try:
            # Pass DataFrame/Series directly
            X_train_undersampled_nb, y_train_undersampled_nb = rus_nb.fit_resample(
                newpipe_step5_train_X, newpipe_step5_train_y['loan_condition_int'] # Pass Series
            )

            print(f"\nUndersampled training data shape: X={X_train_undersampled_nb.shape}, y={y_train_undersampled_nb.shape}")
            print("Undersampled training target distribution:")
            print(y_train_undersampled_nb.value_counts()) # Now it's a Series

            # --- Store Final Datasets for Modeling ---
            newpipe_step6_train_X = X_train_undersampled_nb
            newpipe_step6_train_y = y_train_undersampled_nb # This is now a Series
            newpipe_step6_test_X = newpipe_step5_test_X
            newpipe_step6_test_y = newpipe_step5_test_y

            print("\n✅ Undersampling complete. Datasets ready for feature selection.")

        except Exception as e:
            print(f"❌ An error occurred during fit_resample: {e}")
else:
    print("❌ Cannot proceed because imblearn failed to import.")

✅ imblearn imported successfully.
--- [New Pipeline] Part 6: Applying Random Undersampling ---
Original training data shape: X=(48202, 96), y=(48202, 1)
Original training target distribution:
loan_condition_int
0    37664
1    10538
Name: count, dtype: int64

Undersampled training data shape: X=(21076, 96), y=(21076,)
Undersampled training target distribution:
loan_condition_int
0    10538
1    10538
Name: count, dtype: int64

✅ Undersampling complete. Datasets ready for feature selection.


In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# --- Setup ---
# Assuming newpipe_step6_train_X (undersampled, scaled, encoded training features) exists
# Assuming newpipe_step6_test_X (original scaled, encoded test features) exists

print("--- [New Pipeline] Part 7: Feature Selection ---")

if 'newpipe_step6_train_X' not in locals() or 'newpipe_step6_test_X' not in locals():
    print("❌ Error: Input data (newpipe_step6_train_X or newpipe_step6_test_X) not found.")
    print("Please ensure Step 6 (Undersampling) completed successfully.")
else:
    X_train_fs = newpipe_step6_train_X.copy()
    X_test_fs = newpipe_step6_test_X.copy() # Apply selection to test set too
    # The notebook implicitly uses the undersampled y_train for wrapper, but we only need X for VarianceThreshold
    # y_train_fs = newpipe_step6_train_y # Undersampled training target

    print(f"Shape before VarianceThreshold: {X_train_fs.shape}")

    # --- 1. Apply VarianceThreshold ---
    # The notebook used threshold=1 on the *scaled* data
    selector = VarianceThreshold(threshold=1)
    # Fit on the training data
    selector.fit(X_train_fs)

    # Get the names of the features kept by the threshold
    filtered_feature_names = X_train_fs.columns[selector.get_support()]
    # Apply filter to train X
    X_train_variance_filtered = X_train_fs[filtered_feature_names]
    # Apply the same filter to test X
    X_test_variance_filtered = X_test_fs[filtered_feature_names]

    cols_removed = X_train_fs.shape[1] - X_train_variance_filtered.shape[1]
    print(f"Applied VarianceThreshold(1). Removed {cols_removed} features.")
    print(f"Shape after VarianceThreshold: {X_train_variance_filtered.shape}")

    # --- 2. Select Final Features (Based on Notebook's Wrapper Result) ---
    # The notebook ran a time-consuming wrapper (SFS) and identified these 9 features.
    # We will directly select these for replication purposes.
    vars_final_nb = [
        'delinq_2yrs',
        'last_fico_range_high',
        'last_fico_range_low',
        'acc_now_delinq',
        'open_acc_6m',
        'total_bal_il',
        'il_util',
        'open_rv_12m',
        'all_util'
     ]
    print(f"\nSelecting the final {len(vars_final_nb)} features identified by the notebook's wrapper method...")

    # Ensure these final columns actually exist after variance thresholding
    final_cols_exist = [col for col in vars_final_nb if col in X_train_variance_filtered.columns]

    if len(final_cols_exist) != len(vars_final_nb):
        print(f"⚠️ Warning: Not all expected final features ({vars_final_nb}) were present after VarianceThreshold.")
        print(f"Features missing: {list(set(vars_final_nb) - set(final_cols_exist))}")
        print(f"Proceeding with the {len(final_cols_exist)} available features: {final_cols_exist}")
        final_selected_cols = final_cols_exist
    else:
        final_selected_cols = vars_final_nb
        print("All expected final features found.")


    # Apply the final selection to both train and test sets
    X_train_final_selected = X_train_variance_filtered[final_selected_cols]
    X_test_final_selected = X_test_variance_filtered[final_selected_cols]


    # Store final datasets for modeling
    newpipe_step7_train_X = X_train_final_selected
    newpipe_step7_train_y = newpipe_step6_train_y # Use the undersampled y from step 6
    newpipe_step7_test_X = X_test_final_selected
    newpipe_step7_test_y = newpipe_step6_test_y # Use the original test y from step 6

    print(f"\n✅ Feature Selection complete.")
    print(f"Final training X shape: {newpipe_step7_train_X.shape}")
    print(f"Final test X shape: {newpipe_step7_test_X.shape}")

--- [New Pipeline] Part 7: Feature Selection ---
Shape before VarianceThreshold: (21076, 96)
Applied VarianceThreshold(1). Removed 37 features.
Shape after VarianceThreshold: (21076, 59)

Selecting the final 9 features identified by the notebook's wrapper method...
All expected final features found.

✅ Feature Selection complete.
Final training X shape: (21076, 9)
Final test X shape: (12051, 9)


In [31]:
# --- [New Pipeline] Part 8: Task 2 - DL Predictive Model ---

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC, Precision, Recall
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import numpy as np

print("--- [New Pipeline] Part 8: Task 2 - DL Predictive Model ---")

# --- 1. Load Data (from your previous step) ---
X_train_dl = newpipe_step7_train_X
y_train_dl = newpipe_step7_train_y
X_test_dl = newpipe_step7_test_X
y_test_dl = newpipe_step7_test_y

print(f"DL Model Input Shapes: X_train={X_train_dl.shape}, y_train={y_train_dl.shape}")
print(f"DL Model Test Shapes:  X_test={X_test_dl.shape}, y_test={y_test_dl.shape}")

# --- 2. Define the DL Model (MLP) ---
# We have 9 input features
n_features = X_train_dl.shape[1]

dl_model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid') # Binary output
])

dl_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall')
    ]
)

dl_model.summary()

# --- 3. Train the Model ---
print("\nTraining the DL model...")
history = dl_model.fit(
    X_train_dl,
    y_train_dl,
    epochs=20, # You can tune this
    batch_size=128,
    validation_split=0.2, # Use part of training data for validation
    verbose=1
)

# --- 4. Evaluate the Model (as per assignment) ---
print("\nEvaluating model on the hold-out test set...")
# Get predicted probabilities
y_pred_proba_dl = dl_model.predict(X_test_dl).ravel()
# Get predicted classes (using 0.5 threshold)
y_pred_class_dl = (y_pred_proba_dl > 0.5).astype(int)

# Calculate required metrics [cite: 33]
auc_score = roc_auc_score(y_test_dl, y_pred_proba_dl)
f1 = f1_score(y_test_dl, y_pred_class_dl)

print("\n--- Task 2 Evaluation Metrics ---")
print(f"✅ AUC (Area Under the ROC Curve): {auc_score:.4f}")
print(f"✅ F1-Score: {f1:.4f}")

print("\nFull Classification Report on Test Set:")
print(classification_report(y_test_dl, y_pred_class_dl, target_names=['Fully Paid (0)', 'Defaulted (1)']))

--- [New Pipeline] Part 8: Task 2 - DL Predictive Model ---
DL Model Input Shapes: X_train=(21076, 9), y_train=(21076,)
DL Model Test Shapes:  X_test=(12051, 9), y_test=(12051, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training the DL model...
Epoch 1/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - auc: 0.7197 - loss: 0.5903 - precision: 0.6041 - recall: 0.5857 - val_auc: 0.0000e+00 - val_loss: 0.4622 - val_precision: 1.0000 - val_recall: 0.8067
Epoch 2/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9196 - loss: 0.3533 - precision: 0.8279 - recall: 0.8176 - val_auc: 0.0000e+00 - val_loss: 0.4475 - val_precision: 1.0000 - val_recall: 0.8430
Epoch 3/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9295 - loss: 0.3297 - precision: 0.8224 - recall: 0.8425 - val_auc: 0.0000e+00 - val_loss: 0.4922 - val_precision: 1.0000 - val_recall: 0.8283
Epoch 4/20
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc: 0.9242 - loss: 0.3386 - precision: 0.8130 - recall: 0.8411 - val_auc: 0.0000e+00 - val_loss: 0.4843 - val_precision: 1.0000 - val_recall: 0.8349
Epoch 5/20
[1m132/13