***Credit Risk Prediction: A Machine Learning Analysis of Lending Club Loans***

A project by Anna Rakayev and Tamar Shemesh

In [1]:
import pandas as pd

In [None]:
# loading the data
df = pd.read_csv("loan.csv")
df.shape
df_clean = df.copy()

  df = pd.read_csv("loan.csv")


(2260668, 145)

In [3]:
df['last_pymnt_d'].isna().mean() * 100

0.1073134135574087

In [None]:
# Keep only relevant rows
df_clean = df[df['loan_status'].isin(['Fully Paid', 'Charged Off', 'Default'])].copy()

# Convert dates once (vectorized)
df_clean['issue_d'] = pd.to_datetime(df_clean['issue_d'], format='%b-%Y', errors='coerce')
df_clean['last_pymnt_d'] = pd.to_datetime(df_clean['last_pymnt_d'], format='%b-%Y', errors='coerce')

# Extract term in months
df_clean['term_months'] = df_clean['term'].str.extract(r'(\d+)').astype(float)

# Expected loan end date
df_clean['expected_end'] = df_clean['issue_d'] + pd.to_timedelta(df_clean['term_months'] * 30, unit='D')

# Flag late fully paid
df_clean['paid_late_flag'] = (
    (df_clean['loan_status'] == 'Fully Paid') &
    (df_clean['last_pymnt_d'] > df_clean['expected_end'])
)

# Build 3-class target
df_clean['target_3class'] = 'paid_on_time'
df_clean.loc[df_clean['paid_late_flag'], 'target_3class'] = 'paid_late'
df_clean.loc[df_clean['loan_status'].isin(['Charged Off', 'Default']), 'target_3class'] = 'not_paid'

# Check distribution
print(df_clean['target_3class'].value_counts())

target_3class
paid_on_time    831511
not_paid        261686
paid_late       210441
Name: count, dtype: int64


In [5]:
df_clean['days_late'] = (df_clean['last_pymnt_d'] - df_clean['expected_end']).dt.days

late_fully_paid = df_clean[
    (df_clean['loan_status'] == 'Fully Paid') &
    (df_clean['paid_late_flag'] == True) &
    (df_clean['target_3class'] == 'paid_late') &
    (df_clean['days_late'] > 100)
]

print(late_fully_paid.shape)
late_fully_paid[['issue_d', 'expected_end', 'last_pymnt_d', 'days_late', 'loan_status', 'target_3class']].head(10)


helper_cols_to_drop = [
    'term_months',
    'expected_end',
    'paid_late_flag',
    'last_pymnt_d',
]

df_clean = df_clean.drop(columns=[c for c in helper_cols_to_drop if c in df_clean.columns])

(950, 150)


***Handling Correlation***

In [6]:
# --- Correlation check (numeric columns only) ---

# Select numeric columns
numeric_cols = df_clean.select_dtypes(include=['number']).columns

# Compute absolute correlation matrix
corr = df_clean[numeric_cols].corr().abs()

# Find highly correlated pairs (r > 0.9)
high_corr_pairs = [
    (c1, c2, corr.loc[c1, c2])
    for c1 in corr.columns
    for c2 in corr.columns
    if c1 != c2 and corr.loc[c1, c2] > 0.9
]

# Sort by correlation strength and show if > 0.9
high_corr_pairs = sorted(list(set(tuple(sorted(p[:2])) for p in high_corr_pairs)), key=lambda x: corr.loc[x[0], x[1]], reverse=True)
print("Highly correlated numeric column pairs (corr > 0.9):")
for c1, c2 in high_corr_pairs[:30]:
    print(f"{c1:30} ↔ {c2:30}  corr = {corr.loc[c1, c2]:.2f}")


# Extract a flat list of columns that appear in high-correlation pairs
high_corr_cols = sorted({c for c1, c2 in high_corr_pairs for c in (c1, c2)})
print("\nColumns involved in high correlations (>0.9):")
print(high_corr_cols)



Highly correlated numeric column pairs (corr > 0.9):
out_prncp                      ↔ out_prncp_inv                   corr = 1.00
hardship_amount                ↔ orig_projected_additional_accrued_interest  corr = 1.00
funded_amnt                    ↔ loan_amnt                       corr = 1.00
total_pymnt                    ↔ total_pymnt_inv                 corr = 1.00
funded_amnt                    ↔ funded_amnt_inv                 corr = 1.00
num_sats                       ↔ open_acc                        corr = 1.00
funded_amnt_inv                ↔ loan_amnt                       corr = 1.00
num_actv_rev_tl                ↔ num_rev_tl_bal_gt_0             corr = 0.98
tot_cur_bal                    ↔ tot_hi_cred_lim                 corr = 0.97
collection_recovery_fee        ↔ recoveries                      corr = 0.97
total_pymnt                    ↔ total_rec_prncp                 corr = 0.97
total_pymnt_inv                ↔ total_rec_prncp                 corr = 0.97
total_bal_i

In [7]:
# אחוזי חוסרים לכל עמודה
missing_percent = df_clean.isnull().mean() * 100

# עמודות עם יותר מ-90% חוסרים
high_missing_90 = missing_percent[missing_percent > 90].sort_values(ascending=False)

print(high_missing_90)
high_missing_cols_90 = list(high_missing_90.index)

print(high_missing_cols_90)    


id                                            100.000000
url                                           100.000000
member_id                                     100.000000
next_pymnt_d                                   99.997622
orig_projected_additional_accrued_interest     99.736660
hardship_end_date                              99.590761
hardship_status                                99.590761
deferral_term                                  99.590761
hardship_amount                                99.590761
hardship_start_date                            99.590761
hardship_dpd                                   99.590761
payment_plan_start_date                        99.590761
hardship_length                                99.590761
hardship_loan_status                           99.590761
hardship_payoff_balance_amount                 99.590761
hardship_last_payment_amount                   99.590761
hardship_reason                                99.590761
hardship_type                  

In [8]:
# Columns that contain post-loan information 
leakage_columns = [
    'hardship_flag', 'debt_settlement_flag',
    'total_pymnt', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'last_pymnt_d', 'last_pymnt_amnt', 'recoveries',
    'collection_recovery_fee', 'out_prncp', 'total_pymnt_inv', 'out_prncp_inv'
]

#not important and highly correlated
high_corr_drop_columns = [
    'funded_amnt', 'funded_amnt_inv', 'installment',
    'num_rev_tl_bal_gt_0', 'tot_hi_cred_lim',
    'total_il_high_credit_limit','num_sats'
]

#zero variance
redundant_columns = ['policy_code','disbursement_method','chargeoff_within_12_mths']

#didnt remove 
not_leakage = ['grade', 'sub_grade', 'issue_d']
important_columns = ['total_rev_hi_lim', 'initial_list_status']
check_before_remove = ['revol_bal', 'open_acc', 'title', 'last_credit_pull_d']
high_corr_but_relevant = ['total_bal_ex_mort']

In [9]:
# ---------- Cleaning function used inside the Pipeline ----------
def basic_clean1(df: pd.DataFrame) -> pd.DataFrame:
    clean_df = df.copy()
    
    # 1. deleting leakage columns + high correlation columns + zero variance columns
    cols_to_drop_fixed = [c for c in (leakage_columns + high_corr_drop_columns + redundant_columns) if c in clean_df.columns]
    clean_df = clean_df.drop(columns=cols_to_drop_fixed)

    # 2. deleting columns with more than 90% missing values
    missing_percent = clean_df.isnull().mean() * 100
    high_missing_cols_90 = missing_percent[missing_percent > 90].index.tolist()
    clean_df = clean_df.drop(columns=[c for c in high_missing_cols_90 if c in clean_df.columns])

    # 3. removing 'Not Verified' from verification_status
    clean_df = clean_df[clean_df['verification_status'] != 'Not Verified'].reset_index(drop=True)
    clean_df = clean_df.drop(columns=['verification_status'])

    # 4. cleaning term – keep only the number (36, 60, etc.)
    clean_df['term'] = (clean_df['term'].astype(str).str.extract(r'(\d+)')[0].astype(float))

    # 5. cleaning emp_length
    emp = clean_df['emp_length'].astype(str)
    emp = emp.str.replace('< 1', '0', regex=False)
    emp = emp.str.extract(r'(\d+)')[0]
    clean_df['emp_length'] = emp.astype(float)

    # 6. removing rows with missing target_3class
    clean_df = clean_df[clean_df['target_3class'].notna()].reset_index(drop=True)
        
    return clean_df


df_clean = basic_clean1(df_clean)


In [10]:
#checking data before removing non informative columns
# pymnt_plan – check distribution (if almost all 'n' → not informative)
print(df['pymnt_plan'].value_counts(dropna=False, normalize=True) * 100)

# zip_code –  many unique values = high-cardinality
print("zip_code unique:", df['zip_code'].nunique())

# collections_12_mths_ex_med – check % of zeros (very high → not informative)
print("collections_12_mths_ex_med zeros(%):", (df['collections_12_mths_ex_med'] == 0).mean() * 100)

# application_type – check category balance ( 'INDIVIDUAL' 94.7% - leave for now)
print(df['application_type'].value_counts(dropna=False, normalize=True) * 100)

# tot_coll_amt – check missing and zeros 3% missing, 85% zeros - leave for now
print("tot_coll_amt missing(%):", df['tot_coll_amt'].isna().mean() * 100)
print("tot_coll_amt zeros(%):", (df['tot_coll_amt'].fillna(0) == 0).mean() * 100)

# num_sats – check correlation with total_acc (if r≈1.0 → redundant)
print(df[['num_sats', 'total_acc']].corr())

pymnt_plan
n    99.969832
y     0.030168
Name: proportion, dtype: float64
zip_code unique: 956
collections_12_mths_ex_med zeros(%): 98.33752678411868
application_type
Individual    94.660428
Joint App      5.339572
Name: proportion, dtype: float64
tot_coll_amt missing(%): 3.1086386855566586
tot_coll_amt zeros(%): 85.21397215336351
           num_sats  total_acc
num_sats   1.000000   0.715618
total_acc  0.715618   1.000000


In [11]:
region_map = {
        'me': 'northeast', 'nh': 'northeast', 'vt': 'northeast', 'ma': 'northeast',
        'ri': 'northeast', 'ct': 'northeast', 'ny': 'northeast', 'nj': 'northeast', 'pa': 'northeast',
        'oh': 'midwest', 'in': 'midwest', 'il': 'midwest', 'mi': 'midwest', 'wi': 'midwest',
        'mn': 'midwest', 'ia': 'midwest', 'mo': 'midwest', 'nd': 'midwest', 'sd': 'midwest',
        'ne': 'midwest', 'ks': 'midwest',
        'de': 'south', 'md': 'south', 'dc': 'south', 'va': 'south', 'wv': 'south',
        'nc': 'south', 'sc': 'south', 'ga': 'south', 'fl': 'south',
        'ky': 'south', 'tn': 'south', 'al': 'south', 'ms': 'south',
        'ar': 'south', 'la': 'south', 'tx': 'south', 'ok': 'south',
        'mt': 'west', 'id': 'west', 'wy': 'west', 'co': 'west', 'nm': 'west',
        'az': 'west', 'ut': 'west', 'nv': 'west', 'wa': 'west',
        'or': 'west', 'ca': 'west', 'hi': 'west', 'ak': 'west',
    }

df_clean['state_region'] = df_clean['addr_state'].map(region_map)

In [12]:
from datetime import datetime

# deleting non informative columns + simplifying columns
def basic_clean2(df: pd.DataFrame) -> pd.DataFrame:
    clean_df = df.copy()

    # remove non-informative
    non_informative = [
        'pymnt_plan',
        'zip_code',
        'collections_12_mths_ex_med',
    ]
    clean_df = clean_df.drop(columns=[c for c in non_informative if c in clean_df.columns])

    # clean text columns
    cols_to_clean = ['purpose', 'home_ownership', 'addr_state', 'application_type', 'emp_title']
    for c in cols_to_clean:
        clean_df[c] = clean_df[c].astype(str).str.lower().str.strip()

    # grouped purpose
    clean_df['purpose_grouped'] = clean_df['purpose'].replace({
        'debt_consolidation': 'debt',
        'credit_card': 'debt',
        'home_improvement': 'housing',
        'house': 'housing',
        'small_business': 'business',
        'car': 'personal',
        'medical': 'personal',
        'vacation': 'personal',
        'moving': 'personal',
        'wedding': 'personal',
        'major_purchase': 'personal',
        'renewable_energy': 'other',
        'educational': 'other',
        'other': 'other'
    })

    # home stability
    clean_df['home_stability'] = clean_df['home_ownership'].replace({
        'mortgage': 'stable',
        'own': 'stable',
        'rent': 'unstable',
        'none': 'unstable',
        'other': 'unstable'
    })

    # credit age
    years = (clean_df['earliest_cr_line'].astype(str).str.extract(r'(\d{4})')[0].astype(float))
    clean_df['credit_age_years'] = datetime.now().year - years

    # recorded public issues
    clean_df['bad_records_count'] = clean_df[['pub_rec', 'pub_rec_bankruptcies', 'tax_liens']].sum(axis=1, min_count=1)

    # recent credit activity
    clean_df['recent_credit_activity'] = (
        clean_df['inq_last_6mths']
        + clean_df['num_tl_op_past_12m']
        - (clean_df['mths_since_recent_inq'] / 12)
    )

    # combined balances
    clean_df['total_balance_all'] = clean_df['tot_cur_bal'] + clean_df['total_bal_il']

    # combined active accounts
    clean_df['active_credit_accounts'] = clean_df['num_actv_bc_tl'] + clean_df['num_actv_rev_tl']

    # delinquency severity (any delinquency indicator)
    clean_df['any_delinquency'] = (
        (clean_df['num_accts_ever_120_pd'] > 0) |
        (clean_df['num_tl_120dpd_2m'] > 0) |
        (clean_df['num_tl_90g_dpd_24m'] > 0) |
        (clean_df['num_tl_30dpd'] > 0) |
        (clean_df['delinq_2yrs'] > 0)
    ).astype(int)


    clean_df['is_joint_app'] = (clean_df['application_type'].str.contains('joint')).astype(int)
    clean_df['has_current_delinquency'] = (clean_df['acc_now_delinq'] > 0).astype(int)
    clean_df['has_collections'] = (clean_df['tot_coll_amt'] > 0).astype(int)


    # columns that have been used (remove raw columns after creating features)
    columns_to_remove = [
    # inputs for engineered features
    'pub_rec', 'pub_rec_bankruptcies', 'tax_liens',
    'inq_last_6mths', 'num_tl_op_past_12m', 'mths_since_recent_inq',
    'tot_cur_bal', 'total_bal_il',
    'num_actv_bc_tl', 'num_actv_rev_tl',
    'num_accts_ever_120_pd', 'num_tl_120dpd_2m',
    'num_tl_90g_dpd_24m', 'delinq_2yrs', 'num_tl_30dpd',

    # raw cols replaced by new engineered features
    'purpose',
    'home_ownership',
    'earliest_cr_line',
    'application_type',
    'acc_now_delinq',
    'tot_coll_amt',

    # additional raw / text / redundant columns
    'title',
    'emp_title',
    'addr_state',
    'issue_d',
    'last_credit_pull_d',
    'loan_status',
    'days_late',
    'open_acc',
    'revol_bal'
]
    
    
    clean_df = clean_df.drop(columns=columns_to_remove)
    return clean_df



df_clean = basic_clean2(df_clean)


In [None]:
# Low predictive power – features with minimal contribution to credit risk prediction
def basic_clean3(df: pd.DataFrame) -> pd.DataFrame:
    df_clean = df.copy()
    low_importance_features = [
        'open_act_il',             # rarely informative, few borrowers have active installment accounts
        'open_il_12m',             # very sparse – number of installment loans opened last 12 months
        'open_il_24m',             # similar to above; low predictive value
        'open_rv_12m',             # sparse revolving account activity – unstable feature
        'open_rv_24m',             # redundant and weak
        'open_acc_6m',             # number of accounts opened last 6 months – unstable, noisy
        'inq_fi',                  # financial inquiries – almost always zero
        'total_cu_tl',             # credit union trades – rare, low impact
        'acc_open_past_24mths'     # redundant with more useful credit activity indicators
    ]


    # Redundant or highly correlated features – information already captured by engineered features
    redundant_features = [
        'num_bc_sats',             # count of satisfactory bankcard accounts – correlated with num_bc_tl
        'num_bc_tl',               # total bankcard trade lines – highly correlated with other credit totals
        'num_il_tl',               # total installment accounts – redundant with derived totals
        'num_op_rev_tl',           # revolving accounts – captured by active_credit_accounts
        'num_rev_accts',           # total revolving accounts – redundant, high correlation
        'total_acc'                # total credit accounts – overly broad, correlated with totals
    ]

    # Sparse or missing-heavy features – too many NaN values or extremely rare events
    sparse_features = [
        'mths_since_last_record',          # many missing, very rare credit events
        'mths_since_recent_bc_dlq',        # very sparse delinquency timing
        'mths_since_recent_revol_delinq',  # rarely populated
        'percent_bc_gt_75'                 # unstable distribution, extreme sparsity
    ]

    # Noisy or unstable features – inconsistent behavior, low reliability
    noisy_features = [
        'mo_sin_old_il_acct',      # months since oldest installment account – noisy
        'mo_sin_old_rev_tl_op',    # months since oldest revolving account – correlated & unstable
        'mo_sin_rcnt_rev_tl_op',   # recent revolving age – too volatile
        'mo_sin_rcnt_tl',          # age of recent trade line – noisy, redundant
        'mths_since_rcnt_il'       # months since recent installment loan – sparse/unstable
    ]

    columns_optional = [
    'mths_since_last_delinq', 'avg_cur_bal', 'max_bal_bc', 'all_util',
    'il_util', 'inq_last_12m', 'pct_tl_nvr_dlq',
    'mort_acc', 'total_bc_limit', 'total_acc'
]

    # Combine all removable feature groups
    features_to_remove = (
        low_importance_features +
        redundant_features +
        sparse_features +
        noisy_features+
        columns_optional
    )

    df_clean = df_clean.drop(columns=[c for c in features_to_remove if c in df_clean.columns])

    if 'target_3class' in clean_df.columns:
        clean_df = clean_df[clean_df['target_3class'].notna()].reset_index(drop=True)

    return df_clean


df_clean = basic_clean3(df_clean)

In [14]:
print(len(df_clean.columns))
print(df_clean.columns)

30
Index(['loan_amnt', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_length',
       'annual_inc', 'dti', 'revol_util', 'initial_list_status',
       'mths_since_last_major_derog', 'total_rev_hi_lim', 'bc_open_to_buy',
       'bc_util', 'delinq_amnt', 'mths_since_recent_bc', 'total_bal_ex_mort',
       'target_3class', 'state_region', 'purpose_grouped', 'home_stability',
       'credit_age_years', 'bad_records_count', 'recent_credit_activity',
       'total_balance_all', 'active_credit_accounts', 'any_delinquency',
       'is_joint_app', 'has_current_delinquency', 'has_collections'],
      dtype='object')
