In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, SelectKBest

In [2]:
# --- 1. MOUNTING & PATH CONFIGURATION ---
def log_audit(message):
    print(f"[AUDIT LOG] {message}")

try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_PATH = '/content/drive/MyDrive/Capstone/'
    log_audit("Drive mounted successfully.")
except Exception as e:
    BASE_PATH = './'
    log_audit(f"Manual mount required or local run. Error: {e}")

INPUT_PATH = os.path.join(BASE_PATH, 'cleaned_data.csv')
OUTPUT_PATH = os.path.join(BASE_PATH, 'selected_features.csv')

Mounted at /content/drive
[AUDIT LOG] Drive mounted successfully.


In [4]:
# --- 2. DATA LOADING & PRE-PROCESSING ---
log_audit("Loading cleaned dataset...")
df = pd.read_csv(INPUT_PATH)

# Categorical columns from variables list
cat_cols = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
# Numerical columns excluding the ID and Target
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if 'PROSPECTID' in num_cols: num_cols.remove('PROSPECTID')

[AUDIT LOG] Loading cleaned dataset...


In [5]:
# --- 3. MULTICOLLINEARITY TREATMENT (VIF) ---
log_audit("Calculating Variance Inflation Factor (VIF) for numerical features...")

def calculate_vif(data_frame, features):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = features
    # Dropping NaNs for VIF calculation as it doesn't handle them
    temp_df = data_frame[features].dropna()
    vif_data["VIF"] = [variance_inflation_factor(temp_df.values, i) for i in range(len(features))]
    return vif_data

# Iteratively remove features with VIF > 10 (Standard Banking Threshold)
vif_list = num_cols.copy()
vif_df = calculate_vif(df, vif_list)

while vif_df['VIF'].max() > 10:
    drop_feat = vif_df.sort_values('VIF', ascending=False)['Feature'].iloc[0]
    log_audit(f"Dropping {drop_feat} due to high VIF: {vif_df['VIF'].max():.2f}")
    vif_list.remove(drop_feat)
    vif_df = calculate_vif(df, vif_list)

log_audit(f"Remaining numerical features after VIF: {len(vif_list)}")

[AUDIT LOG] Calculating Variance Inflation Factor (VIF) for numerical features...


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Total_TL due to high VIF: inf


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


[AUDIT LOG] Dropping Tot_Closed_TL due to high VIF: inf


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping pct_closed_tl due to high VIF: inf


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping pct_active_tl due to high VIF: inf


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


[AUDIT LOG] Dropping Secured_TL due to high VIF: inf


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping num_deliq_6_12mts due to high VIF: inf


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping num_std_12mts due to high VIF: 44.34


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Unsecured_TL due to high VIF: 41.68


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping enq_L12m due to high VIF: 37.58


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Total_TL_opened_L12M due to high VIF: 24.30


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Credit_Score due to high VIF: 22.54


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping max_deliq_12mts due to high VIF: 18.58


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Tot_TL_closed_L12M due to high VIF: 17.02


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping enq_L6m due to high VIF: 15.91


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping num_times_30p_dpd due to high VIF: 13.11


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


[AUDIT LOG] Dropping Tot_Active_TL due to high VIF: 10.98
[AUDIT LOG] Remaining numerical features after VIF: 65


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


In [6]:
# --- 4. CATEGORICAL SELECTION (CHI-SQUARE) ---
log_audit("Performing Chi-Square test for categorical importance...")

# Label Encode for Chi-Square requirement
le = LabelEncoder()
df_cat = df[cat_cols].copy()
for col in cat_cols:
    df_cat[col] = le.fit_transform(df_cat[col].astype(str))

# Chi-Square against Target (Approved_Flag)
target_le = le.fit_transform(df['Approved_Flag'])
chi_scores = chi2(df_cat, target_le)

p_values = pd.Series(chi_scores[1], index=cat_cols)
log_audit(f"Categorical P-Values (lower is better):\n{p_values.sort_values()}")

# Keep features with p-value < 0.05
selected_cats = p_values[p_values < 0.05].index.tolist()

[AUDIT LOG] Performing Chi-Square test for categorical importance...
[AUDIT LOG] Categorical P-Values (lower is better):
last_prod_enq2     8.519660e-209
MARITALSTATUS      4.396817e-189
first_prod_enq2     2.134860e-63
EDUCATION           1.328182e-02
GENDER              5.160549e-01
dtype: float64


In [7]:
# --- 5. DATASET CONSOLIDATION ---
final_features = vif_list + selected_cats + ['Approved_Flag']
df_final = df[final_features]

log_audit(f"Final Feature Count: {len(final_features)}")

[AUDIT LOG] Final Feature Count: 70


In [8]:
# --- 6. EXPORT ---
df_final.to_csv(OUTPUT_PATH, index=False)
log_audit(f"Feature Selection Complete. Saved to {OUTPUT_PATH}. Proceed to Notebook 4.")

[AUDIT LOG] Feature Selection Complete. Saved to /content/drive/MyDrive/Capstone/selected_features.csv. Proceed to Notebook 4.
