In [53]:
# Notebook initialization for consistent paths (repo-aware)
import os, sys, pathlib

# Resolve repo root by walking up until we find 'src'
CWD = pathlib.Path.cwd()
ROOT = CWD
for _ in range(6):
    if (ROOT / 'src').exists():
        break
    ROOT = ROOT.parent
# Fallback to current if not found
if not (ROOT / 'src').exists():
    ROOT = CWD

PROJECT_ROOT = ROOT.resolve()
PROJECT_SRC = PROJECT_ROOT / 'src'
DATA_DIR = PROJECT_SRC / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

if str(PROJECT_SRC) not in sys.path:
    sys.path.insert(0, str(PROJECT_SRC))

print(f'PROJECT_ROOT={PROJECT_ROOT}')
print(f'PROJECT_SRC={PROJECT_SRC}')
print(f'RAW_DATA_DIR={RAW_DATA_DIR}')
print(f'PROCESSED_DATA_DIR={PROCESSED_DATA_DIR}')


PROJECT_ROOT=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration
PROJECT_SRC=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src
RAW_DATA_DIR=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src/data/raw
PROCESSED_DATA_DIR=/Users/pierce.bucknerwolfso/Desktop/embeddings_paper/embeddings-service-exploration/src/data/processed


In [54]:
import pandas as pd
import numpy as np

In [55]:
# Download kaggle Lending Club dataset and place in src/data/raw
df_accepted = pd.read_csv(f"{RAW_DATA_DIR}/accepted_2007_to_2018Q4.csv")
df = df_accepted.dropna(subset=["loan_status"])
loan_counts = df['loan_status'].value_counts()
print(loan_counts)

  df_accepted = pd.read_csv(f"{RAW_DATA_DIR}/accepted_2007_to_2018Q4.csv")


loan_status
Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: count, dtype: int64


In [35]:
# Map Loan Outcome
target_map = {
    'Fully Paid': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,
    'Charged Off': 1,
    'Default': 1,
    'Late (31-120 days)': 1,
    'Does not meet the credit policy. Status:Charged Off': 1
}

# Apply the map to the 'loan_status' column
df['loan_outcome'] = df['loan_status'].map(target_map)

# Remove rows that were not in our map (i.e., 'Current', 'In Grace Period', etc.)
df.dropna(subset=['loan_outcome'], inplace=True)

# 4. Convert the new column to an integer type
df['loan_outcome'] = df['loan_outcome'].astype(int)

# Now, check the new value counts
print(df['loan_outcome'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loan_outcome'] = df['loan_status'].map(target_map)


loan_outcome
0    1078739
1     290827
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['loan_outcome'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loan_outcome'] = df['loan_outcome'].astype(int)


In [36]:
# Show all columns
pd.set_option('display.max_columns', None)

# Show all rows
pd.set_option('display.max_rows', None)

# Set the maximum column width to display full content
pd.set_option('display.max_colwidth', None)

# Set the maximum sequence items to display full list/array content
pd.set_option('display.max_seq_items', None)

# Calculate the percentage of missing values
missing_percentages = (df.isnull().sum() / len(df)) * 100

# Sort the results in descending order
print(missing_percentages.sort_values(ascending=False))

member_id                                     100.000000
orig_projected_additional_accrued_interest     99.619734
hardship_payoff_balance_amount                 99.461216
hardship_amount                                99.461216
hardship_start_date                            99.461216
hardship_reason                                99.461216
hardship_type                                  99.461216
hardship_end_date                              99.461216
payment_plan_start_date                        99.461216
hardship_length                                99.461216
hardship_dpd                                   99.461216
hardship_loan_status                           99.461216
deferral_term                                  99.461216
hardship_status                                99.461216
hardship_last_payment_amount                   99.461216
sec_app_mths_since_last_major_derog            99.454134
sec_app_revol_util                             98.519166
revol_bal_joint                

In [37]:
# We will drop all columns with more than 60% missing values
for col in df.columns:
    if df[col].isnull().sum() / len(df) > 0.60:
        print(f"Removed column: {col}")
        df.drop(col, axis=1, inplace=True)

Removed column: member_id


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: desc


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: mths_since_last_record


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: next_pymnt_d


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: mths_since_last_major_derog


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: annual_inc_joint


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: dti_joint


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: verification_status_joint


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: mths_since_rcnt_il


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: il_util


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: mths_since_recent_bc_dlq


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: mths_since_recent_revol_delinq


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: revol_bal_joint


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_fico_range_low


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_fico_range_high


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_earliest_cr_line


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_inq_last_6mths


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_mort_acc


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_open_acc


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_revol_util


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_open_act_il


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_num_rev_accts


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_chargeoff_within_12_mths


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_collections_12_mths_ex_med


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: sec_app_mths_since_last_major_derog


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_type


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_reason


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_status


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: deferral_term


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_amount


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_start_date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_end_date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: payment_plan_start_date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_length


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_dpd


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_loan_status


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: orig_projected_additional_accrued_interest


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_payoff_balance_amount


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: hardship_last_payment_amount


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: debt_settlement_flag_date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: settlement_status


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: settlement_date


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: settlement_amount


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: settlement_percentage


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


Removed column: settlement_term


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(col, axis=1, inplace=True)


In [38]:
# Check out columns again
missing_percentages = (df.isnull().sum() / len(df)) * 100
print(missing_percentages.sort_values(ascending=False))

all_util                      59.296376
open_acc_6m                   59.292360
inq_last_12m                  59.292360
total_cu_tl                   59.292360
open_il_12m                   59.292287
open_rv_12m                   59.292287
open_act_il                   59.292287
open_il_24m                   59.292287
total_bal_il                  59.292287
open_rv_24m                   59.292287
max_bal_bc                    59.292287
inq_fi                        59.292287
mths_since_last_delinq        50.390124
mths_since_recent_inq         13.038729
num_tl_120dpd_2m               8.872154
mo_sin_old_il_acct             7.966684
emp_title                      6.423130
emp_length                     5.870546
pct_tl_nvr_dlq                 5.142505
avg_cur_bal                    5.132940
mo_sin_old_rev_tl_op           5.131334
mo_sin_rcnt_rev_tl_op          5.131334
num_rev_accts                  5.131334
tot_cur_bal                    5.131261
mo_sin_rcnt_tl                 5.131261


In [39]:
# Remove low variance columns & unique identifier columns
variances = df.select_dtypes(include=np.number).var()

variance_threshold = 0.01
low_variance_cols = variances[variances < variance_threshold].index.tolist()

print(f"Columns with variance below {variance_threshold}: {low_variance_cols}")

nunique_counts = df.nunique()
single_value_cols = nunique_counts[nunique_counts <= 1].index.tolist()

print(f"Columns with only one unique value: {single_value_cols}")

# Find unique identifier columns
num_rows = len(df)
nunique_counts = df.nunique()
unique_id_cols = nunique_counts[nunique_counts == num_rows].index.tolist()

print(f"Unique identifier columns: {unique_id_cols}")

Columns with variance below 0.01: ['policy_code', 'acc_now_delinq', 'num_tl_120dpd_2m', 'num_tl_30dpd']
Columns with only one unique value: ['policy_code']
Unique identifier columns: ['id', 'url']


In [40]:
# Remove the low variance & unique identifer columns
remove_cols = list(set(low_variance_cols + unique_id_cols))
df.drop(columns=remove_cols, inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=remove_cols, inplace=True)


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,application_type,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,total_bal_il,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag,loan_outcome
0,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,Mar-2019,564.0,560.0,0.0,Individual,722.0,144904.0,2.0,2.0,0.0,1.0,4981.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,4.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,N,Cash,N,0
1,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,Mar-2019,699.0,695.0,0.0,Individual,0.0,204396.0,1.0,1.0,0.0,1.0,18005.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,0.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,N,Cash,N,0
2,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,Mar-2019,704.0,700.0,0.0,Joint App,0.0,189699.0,0.0,1.0,0.0,4.0,10827.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,10.0,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,N,Cash,N,0
4,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,Mar-2018,704.0,700.0,0.0,Individual,0.0,331730.0,1.0,3.0,0.0,3.0,73839.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,1.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,N,Cash,N,0
5,11950.0,11950.0,11950.0,36 months,13.44,405.18,C,C3,Veterinary Tecnician,4 years,RENT,34000.0,Source Verified,Dec-2015,Fully Paid,n,debt_consolidation,Debt consolidation,300xx,GA,10.2,0.0,Oct-1987,690.0,694.0,0.0,,5.0,0.0,8822.0,68.4,6.0,w,0.0,0.0,13708.94853,13708.95,11950.0,1758.95,0.0,0.0,0.0,May-2017,7653.56,May-2017,759.0,755.0,0.0,Individual,0.0,12798.0,0.0,1.0,0.0,0.0,3976.0,0.0,0.0,4522.0,76.0,12900.0,0.0,0.0,0.0,0.0,2560.0,844.0,91.0,0.0,0.0,338.0,54.0,32.0,32.0,0.0,36.0,,0.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,3.0,5.0,0.0,0.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,N,Cash,N,0


In [41]:
# Remove leading/trailing whitespaces from objects
df = df.apply(lambda col: col.str.strip() if col.dtypes == 'object' else col)

In [42]:
df_corr_prep = df.copy()
for col in df_corr_prep.select_dtypes(include='object').columns:
    df_corr_prep[col] = df_corr_prep[col].astype('category').cat.codes

In [43]:
def get_sorted_correlated_groups(df, target_col, threshold=0.85):
    """
    Finds groups of highly correlated features and, for each group,
    sorts the members by their correlation to the target variable.
    """
    
    # Calculate the correlation matrix
    corr_matrix = df.corr().abs()
    # Get NA counts from the original DataFrame
    na_counts = df.isnull().sum()
    
    # Get correlation with the target variable
    target_corr = corr_matrix.get(target_col, pd.Series())

    # Get the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find all pairs above the threshold
    high_corr_pairs = [
        (column, index) for column in upper.columns
        for index in upper.index if upper.loc[index, column] > threshold
    ]

    # Group overlapping pairs
    groups = []
    for pair in high_corr_pairs:
        found = False
        for group in groups:
            if pair[0] in group or pair[1] in group:
                group.update(pair)
                found = True
                break
        if not found:
            groups.append(set(pair))

    # Process and sort each group
    sorted_groups = []
    for group in groups:
        # Create a list of (feature, target_corr, na_count) tuples
        group_with_info = [
            (feature, target_corr.get(feature, 0), na_counts.get(feature, 0)) 
            for feature in group
        ]
        # Sort the group by the target correlation (ascending)
        group_with_info.sort(key=lambda x: x[1])
        sorted_groups.append(group_with_info)
        
    return sorted_groups


# Get the sorted groups of correlated features
correlated_groups = get_sorted_correlated_groups(df_corr_prep, 'loan_outcome')

# --- Display the Results ---
print("--- Correlated Feature Groups (Sorted by Lowest Target Correlation) ---")
for i, group in enumerate(correlated_groups):
    print(f"\nGroup {i+1}:")
    for feature, corr_val, na_count in group:
        print(f"  - {feature} (Target Corr: {corr_val:.3f}, NA Count: {na_count})")



--- Correlated Feature Groups (Sorted by Lowest Target Correlation) ---

Group 1:
  - installment (Target Corr: 0.057, NA Count: 0)
  - funded_amnt_inv (Target Corr: 0.071, NA Count: 0)
  - loan_amnt (Target Corr: 0.071, NA Count: 0)
  - funded_amnt (Target Corr: 0.072, NA Count: 0)

Group 2:
  - grade (Target Corr: 0.262, NA Count: 0)
  - int_rate (Target Corr: 0.263, NA Count: 0)
  - sub_grade (Target Corr: 0.268, NA Count: 0)

Group 3:
  - fico_range_high (Target Corr: 0.130, NA Count: 0)
  - fico_range_low (Target Corr: 0.130, NA Count: 0)

Group 4:
  - out_prncp_inv (Target Corr: 0.195, NA Count: 0)
  - out_prncp (Target Corr: 0.195, NA Count: 0)

Group 5:
  - total_pymnt_inv (Target Corr: 0.320, NA Count: 0)
  - total_pymnt (Target Corr: 0.321, NA Count: 0)
  - total_rec_prncp (Target Corr: 0.445, NA Count: 0)

Group 6:
  - collection_recovery_fee (Target Corr: 0.457, NA Count: 0)
  - recoveries (Target Corr: 0.482, NA Count: 0)

Group 7:
  - revol_util (Target Corr: 0.055, NA Co

In [44]:
features_to_drop = [
    'installment',
    #'loan_amnt',
    'funded_amnt',
    'funded_amnt_inv',
    'grade',
    # 'sub_grade',
    # 'int_rate',
    # 'fico_range_high',
    'out_prncp_inv',
    'total_pymnt_inv',
    'total_pymnt',
    'collection_recovery_fee',
    'bc_util',
    'num_rev_tl_bal_gt_0',
    'num_sats',
    'tot_cur_bal',
    'total_il_high_credit_limit',
    'pymnt_plan'
]

print(features_to_drop)

['installment', 'funded_amnt', 'funded_amnt_inv', 'grade', 'out_prncp_inv', 'total_pymnt_inv', 'total_pymnt', 'collection_recovery_fee', 'bc_util', 'num_rev_tl_bal_gt_0', 'num_sats', 'tot_cur_bal', 'total_il_high_credit_limit', 'pymnt_plan']


In [45]:
# Find and Remove Direct Indicators (Leaky Features)
print("\n--- Finding Leaky Features ---")
# Calculate correlation with the target variable
target_corr = df_corr_prep.corr()['loan_outcome'].abs().sort_values(ascending=False)
print("Correlation with Target:\n", target_corr)

# Identify features with correlation > 0.8 to the target
leaky_to_drop = target_corr[target_corr > 0.8].index.tolist()
# Remove the target column itself from this list if it's there
if 'loan_outcome' in leaky_to_drop:
    leaky_to_drop.remove('loan_outcome')
print(f"\nLeaky features to drop: {leaky_to_drop}")


--- Finding Leaky Features ---
Correlation with Target:
 loan_outcome                  1.000000
loan_status                   0.925868
last_fico_range_high          0.667449
last_fico_range_low           0.570190
recoveries                    0.481679
collection_recovery_fee       0.457210
total_rec_prncp               0.445317
last_pymnt_amnt               0.356457
total_pymnt                   0.321063
total_pymnt_inv               0.320491
debt_settlement_flag          0.308144
sub_grade                     0.268325
int_rate                      0.263009
grade                         0.262187
out_prncp                     0.194811
out_prncp_inv                 0.194801
term                          0.180987
total_rec_late_fee            0.153840
fico_range_low                0.129576
fico_range_high               0.129575
acc_open_past_24mths          0.099789
verification_status           0.087294
num_tl_op_past_12m            0.085194
dti                           0.084940
all_ut

In [46]:
# Combine and Drop All Identified Columns
features_to_drop = list(set(leaky_to_drop + features_to_drop))

print(f"\nTotal features to drop: {features_to_drop}")
df_cleaned = df.drop(columns=features_to_drop)
df_cleaned.head()


Total features to drop: ['tot_cur_bal', 'funded_amnt_inv', 'installment', 'total_pymnt_inv', 'total_il_high_credit_limit', 'pymnt_plan', 'num_rev_tl_bal_gt_0', 'loan_status', 'funded_amnt', 'collection_recovery_fee', 'bc_util', 'out_prncp_inv', 'num_sats', 'total_pymnt', 'grade']


Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,application_type,tot_coll_amt,open_acc_6m,open_act_il,open_il_12m,open_il_24m,total_bal_il,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,hardship_flag,disbursement_method,debt_settlement_flag,loan_outcome
0,3600.0,36 months,13.99,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,7.0,0.0,2765.0,29.7,13.0,w,0.0,3600.0,821.72,0.0,0.0,Jan-2019,122.67,Mar-2019,564.0,560.0,0.0,Individual,722.0,2.0,2.0,0.0,1.0,4981.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,4.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,N,Cash,N,0
1,24700.0,36 months,11.99,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,22.0,0.0,21470.0,19.2,38.0,w,0.0,24700.0,979.66,0.0,0.0,Jun-2016,926.35,Mar-2019,699.0,695.0,0.0,Individual,0.0,1.0,1.0,0.0,1.0,18005.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,0.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,N,Cash,N,0
2,20000.0,60 months,10.78,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,6.0,0.0,7869.0,56.2,18.0,w,0.0,20000.0,2705.92,0.0,0.0,Jun-2017,15813.3,Mar-2019,704.0,700.0,0.0,Joint App,0.0,0.0,1.0,0.0,4.0,10827.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,10.0,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,N,Cash,N,0
4,10400.0,60 months,22.45,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,12.0,0.0,21929.0,64.5,35.0,w,0.0,10400.0,1340.5,0.0,0.0,Jul-2016,10128.96,Mar-2018,704.0,700.0,0.0,Individual,0.0,1.0,3.0,0.0,3.0,73839.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,1.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,N,Cash,N,0
5,11950.0,36 months,13.44,C3,Veterinary Tecnician,4 years,RENT,34000.0,Source Verified,Dec-2015,debt_consolidation,Debt consolidation,300xx,GA,10.2,0.0,Oct-1987,690.0,694.0,0.0,,5.0,0.0,8822.0,68.4,6.0,w,0.0,11950.0,1758.95,0.0,0.0,May-2017,7653.56,May-2017,759.0,755.0,0.0,Individual,0.0,0.0,1.0,0.0,0.0,3976.0,0.0,0.0,4522.0,76.0,12900.0,0.0,0.0,0.0,0.0,2560.0,844.0,0.0,0.0,338.0,54.0,32.0,32.0,0.0,36.0,,0.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,0.0,0.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,N,Cash,N,0


In [47]:
# Leaky features (i.e. features not available at loan origination)
leaks = [
    'out_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'total_rec_prncp',
    'recoveries',
    'last_pymnt_amnt',
    'last_pymnt_d',
    'last_credit_pull_d',
    'last_fico_range_high',
    'last_fico_range_low',
    'debt_settlement_flag',
    'disbursement_method',
]

In [48]:
df_cleaned = df_cleaned.drop(columns = (leaks))

In [49]:
print(len(list(df_cleaned.columns)))
list(df_cleaned.columns)

74


['loan_amnt',
 'term',
 'int_rate',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'collections_12_mths_ex_med',
 'application_type',
 'tot_coll_amt',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'total_bal_il',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl

In [50]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1369566 entries, 0 to 2260697
Data columns (total 74 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   loan_amnt                   1369566 non-null  float64
 1   term                        1369566 non-null  object 
 2   int_rate                    1369566 non-null  float64
 3   sub_grade                   1369566 non-null  object 
 4   emp_title                   1281597 non-null  object 
 5   emp_length                  1289165 non-null  object 
 6   home_ownership              1369566 non-null  object 
 7   annual_inc                  1369562 non-null  float64
 8   verification_status         1369566 non-null  object 
 9   issue_d                     1369566 non-null  object 
 10  purpose                     1369566 non-null  object 
 11  title                       1352622 non-null  object 
 12  zip_code                    1369565 non-null  object 
 13  ad

In [51]:
df_cleaned.isnull().sum()

loan_amnt                          0
term                               0
int_rate                           0
sub_grade                          0
emp_title                      87969
emp_length                     80401
home_ownership                     0
annual_inc                         4
verification_status                0
issue_d                            0
purpose                            0
title                          16944
zip_code                           1
addr_state                         0
dti                              398
delinq_2yrs                       29
earliest_cr_line                  29
fico_range_low                     0
fico_range_high                    0
inq_last_6mths                    30
mths_since_last_delinq        690126
open_acc                          29
pub_rec                           29
revol_bal                          0
revol_util                       924
total_acc                         29
initial_list_status                0
c

In [52]:
# save df_cleaned
df_cleaned.to_csv(f'{PROCESSED_DATA_DIR}/base_loan_data_cleaned.csv', index=False)