In [31]:
"""
Task 1: Exploratory Data Analysis (EDA) and Preprocessing

Loads the raw LendingClub data, cleans it, selects pre-loan features,
engineers new features, and performs a time-based train/test split.

Saves the following files for subsequent tasks:
- data/X_train_final.pkl
- data/y_train.pkl
- data/X_test_final.pkl
- data/y_test.pkl
- data/df_model_for_rewards.pkl
- data/train_indices.pkl
- data/test_indices.pkl

Also saves the fitted preprocessing transformers:
- data/scaler.pkl
- data/ohe.pkl
- data/num_imputer.pkl
- data/cat_imputer.pkl
"""

'\nTask 1: Exploratory Data Analysis (EDA) and Preprocessing\n\nLoads the raw LendingClub data, cleans it, selects pre-loan features,\nengineers new features, and performs a time-based train/test split.\n\nSaves the following files for subsequent tasks:\n- data/X_train_final.pkl\n- data/y_train.pkl\n- data/X_test_final.pkl\n- data/y_test.pkl\n- data/df_model_for_rewards.pkl \n- data/train_indices.pkl\n- data/test_indices.pkl\n'

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib
import os
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

# --- 0. Configuration ---
# Path of the raw CSV that is read with pd.read_csv.
DATA_FILE = 'accepted_2007_to_2018Q4.csv'
# Passed to pd.read_csv as `nrows`; set an integer to load only the first N rows.
SAMPLE_SIZE = None  # Use None for full dataset
# Seed applied to NumPy's global random number generator below.
RANDOM_SEED = 42
# Cutoff for the time-based split: loans issued before this date form the
# training set, loans issued on or after it form the test set.
TIME_SPLIT_DATE = '2015-01-01' # Split point for time-based validation (train/test cutoff)
OUTPUT_DIR = 'data' # Directory to save processed files

# Set Random Seed
np.random.seed(RANDOM_SEED)

In [33]:
# --- 1. Helper Functions ---
def clean_emp_length(emp_length_str):
    """Convert an 'emp_length' label to a number of years.

    Args:
        emp_length_str: Raw label such as '10+ years', '< 1 year' or
            '3 years'. Missing values (None / NaN) are accepted.

    Returns:
        10 for '10+ years', 0 for '< 1 year', the leading integer for any
        other '<N> year(s)' label, and np.nan for missing or unrecognised
        input.
    """
    if pd.isna(emp_length_str):
        return np.nan

    label = str(emp_length_str).strip()
    if label == '10+ years':
        return 10
    if label == '< 1 year':
        return 0
    if 'year' in label:
        try:
            return int(label.split(' ')[0])
        except ValueError:
            # The label mentions "year" but does not start with a plain
            # integer (e.g. 'many years'); treat it as missing instead of
            # aborting the whole column conversion.
            return np.nan
    return np.nan

def calculate_credit_history_months(issue_d, earliest_cr_line_str):
    """Return the number of whole months between the earliest credit line and loan issue.

    Args:
        issue_d: Loan issue date; its ``.year`` and ``.month`` attributes are read.
        earliest_cr_line_str: Earliest credit line as a '%b-%Y' string (e.g. 'Jan-2010').

    Returns:
        The month difference floored at 0, or np.nan when either input is
        missing or the credit-line string cannot be parsed.
    """
    if any(pd.isna(value) for value in (issue_d, earliest_cr_line_str)):
        return np.nan

    opened_on = pd.to_datetime(earliest_cr_line_str, format='%b-%Y', errors='coerce')
    if pd.isna(opened_on):
        # Unparseable credit-line string.
        return np.nan

    years_apart = issue_d.year - opened_on.year
    months_apart = issue_d.month - opened_on.month
    elapsed_months = years_apart * 12 + months_apart
    # A credit line dated after the issue date counts as zero history.
    return elapsed_months if elapsed_months > 0 else 0

In [34]:
# Make sure the directory for processed artifacts is present before any work starts.
output_dir_missing = not os.path.exists(OUTPUT_DIR)
if output_dir_missing:
    os.makedirs(OUTPUT_DIR)
    print(f"Created directory: {OUTPUT_DIR}")

# --- 2. Data Loading and Initial Cleaning ---
print(f"Loading data from {DATA_FILE}...")
df = pd.read_csv(
    DATA_FILE,
    nrows=SAMPLE_SIZE,  # None reads every row
    low_memory=False,
)
print(f"Loaded {len(df)} rows.")

# Parse the issue month ('%b-%Y'); values that do not match become NaT.
issue_dates = pd.to_datetime(df['issue_d'], format='%b-%Y', errors='coerce')
df['issue_d'] = issue_dates

Loading data from accepted_2007_to_2018Q4.csv...
Loaded 2260701 rows.


In [35]:
# Preview 20 random rows. Pinning random_state makes the preview identical
# every time this cell is re-run instead of depending on the global RNG state.
df.sample(20, random_state=RANDOM_SEED)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
392949,39651438,,32000.0,32000.0,32000.0,60 months,10.49,687.65,B,B3,...,,,Cash,N,,,,,,
1273506,16411620,,9600.0,9600.0,9600.0,36 months,12.99,323.42,C,C1,...,,,Cash,N,,,,,,
324024,45122316,,4000.0,4000.0,4000.0,36 months,6.68,122.93,A,A3,...,,,Cash,N,,,,,,
2066630,125356772,,6025.0,6025.0,6025.0,36 months,10.91,197.0,B,B4,...,,,Cash,N,,,,,,
477199,128490686,,25000.0,25000.0,25000.0,60 months,26.3,752.96,E,E5,...,,,Cash,N,,,,,,
1975547,88273234,,20000.0,20000.0,20000.0,36 months,9.49,640.57,B,B2,...,,,Cash,N,,,,,,
302723,46824999,,1000.0,1000.0,1000.0,36 months,8.18,31.42,B,B1,...,,,Cash,N,,,,,,
1972188,88725013,,24575.0,24575.0,24575.0,60 months,14.49,578.08,C,C4,...,,,Cash,N,,,,,,
2251549,88939475,,7000.0,7000.0,7000.0,36 months,6.99,216.11,A,A2,...,,,Cash,N,,,,,,
641990,109655666,,16000.0,16000.0,16000.0,36 months,7.07,494.55,A,A2,...,,,Cash,N,,,,,,


In [36]:
# Print a summary of the raw frame: index range, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260701 entries, 0 to 2260700
Columns: 151 entries, id to settlement_term
dtypes: datetime64[ns](1), float64(113), object(37)
memory usage: 2.5+ GB


In [37]:
# ---  Define Target Variable & Filter Data ---
print("Step 1: Defining target variable 'is_default' and filtering...")

# Loan statuses that are kept, mapped to the binary target (1 = default).
final_status_map = {'Fully Paid': 0, 'Charged Off': 1, 'Default': 1}

# Rows whose status is not one of the keys above are dropped.
has_final_status = df['loan_status'].isin(list(final_status_map))
df_model = df.loc[has_final_status].copy()
df_model['is_default'] = df_model['loan_status'].map(final_status_map)

print(f"Filtered for final statuses. Kept {len(df_model)} rows.")

# --- Feature Selection & Engineering (Pre-Loan Only) ---
print("Step 2: Selecting and engineering PRE-LOAN features...")

# Keep post-loan columns in df_model for reward calculation,
# but select only pre-loan features for df_clean
features_to_keep = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    'issue_d', 'purpose', 'addr_state', 'dti', 'earliest_cr_line',
    'fico_range_low', 'fico_range_high', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'initial_list_status', 'application_type',
    'is_default' # Target variable
]

# Only request columns that are actually present in the loaded data.
available_cols = [col for col in features_to_keep if col in df_model.columns]
df_clean = df_model[available_cols].copy()

# --- Feature Engineering ---
# 'emp_length' labels ('10+ years', '< 1 year', '3 years', ...) -> numeric years.
df_clean['emp_length'] = df_clean['emp_length'].apply(clean_emp_length)

# Credit history in whole months at loan issue, floored at 0. Computed on whole
# columns instead of row by row: a missing/unparseable date on either side
# yields NaN, exactly as the per-row calculation would.
earliest_cr_line = pd.to_datetime(
    df_clean['earliest_cr_line'], format='%b-%Y', errors='coerce'
)
issue_date = df_clean['issue_d']
df_clean['credit_history_months'] = (
    (issue_date.dt.year - earliest_cr_line.dt.year) * 12
    + (issue_date.dt.month - earliest_cr_line.dt.month)
).clip(lower=0)

# Pull the digits out of the term label. The pattern is a raw string so that
# '\d' reaches the regex engine instead of being an invalid string escape.
df_clean['term'] = df_clean['term'].astype(str).str.extract(r'(\d+)').astype(float)
df_clean['fico_score'] = df_clean['fico_range_low'] # Use low-bound as FICO

# Drop columns used only for engineering or splitting
df_clean = df_clean.drop(columns=['earliest_cr_line', 'fico_range_low', 'fico_range_high'])
print("Pre-loan feature engineering complete.")

# --- Time-Based Train-Test Split ---
print("Step 3: Splitting data using time-based split...")

# Rows without an issue date cannot be placed on either side of the cutoff.
df_clean = df_clean.dropna(subset=['issue_d'])

issued_before_cutoff = df_clean['issue_d'] < TIME_SPLIT_DATE
issued_from_cutoff = df_clean['issue_d'] >= TIME_SPLIT_DATE
train_df = df_clean[issued_before_cutoff]
test_df = df_clean[issued_from_cutoff]

# The target and the split column are not model inputs.
X_train, y_train = train_df.drop(columns=['is_default', 'issue_d']), train_df['is_default']
X_test, y_test = test_df.drop(columns=['is_default', 'issue_d']), test_df['is_default']

# Store indices for mapping back to df_model (for rewards)
train_indices, test_indices = X_train.index, X_test.index

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")
if min(len(X_train), len(X_test)) == 0:
    raise ValueError("Train or test set is empty. Check TIME_SPLIT_DATE and data range.")

# Column groups, decided on the training frame, that drive the preprocessing steps.
numeric_features = list(X_train.select_dtypes(include=np.number).columns)
categorical_features = list(X_train.select_dtypes(exclude=np.number).columns)

# --- Preprocessing (Imputation, Encoding, Scaling) ---
print("Step 4: Preprocessing features (Impute, Encode, Scale)...")

# Imputation: median for numeric columns, most frequent value for categorical
# ones. Each imputer is fitted on the training frame and then applied to test.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
for imputer, cols in (
    (num_imputer, numeric_features),
    (cat_imputer, categorical_features),
):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# One-hot encode the categorical columns; categories first seen in the test
# frame are ignored by the encoder instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat_encoded = ohe.fit_transform(X_train[categorical_features])
X_test_cat_encoded = ohe.transform(X_test[categorical_features])
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

# Standardise the numeric columns with statistics fitted on the training frame.
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_num_scaled = scaler.transform(X_test[numeric_features])

# Wrap the raw arrays back into frames that keep the original row index.
X_train_ohe_df = pd.DataFrame(
    X_train_cat_encoded, index=X_train.index, columns=ohe_feature_names
)
X_test_ohe_df = pd.DataFrame(
    X_test_cat_encoded, index=X_test.index, columns=ohe_feature_names
)
X_train_scaled_df = pd.DataFrame(
    X_train_num_scaled, index=X_train.index, columns=numeric_features
)
X_test_scaled_df = pd.DataFrame(
    X_test_num_scaled, index=X_test.index, columns=numeric_features
)

# Final matrices: scaled numeric columns first, then the one-hot columns.
X_train_final = pd.concat([X_train_scaled_df, X_train_ohe_df], axis=1)
X_test_final = pd.concat([X_test_scaled_df, X_test_ohe_df], axis=1)

print(f"Final training data shape: {X_train_final.shape}")
print(f"Final test data shape: {X_test_final.shape}")

# --- Save Processed Data ---
print(f"Saving processed data to '{OUTPUT_DIR}' directory...")

# File name -> object, written in this order. Driving the saves from one mapping
# keeps the list of artifacts in a single place, and the loop (a statement, not
# an expression) stops the last joblib.dump return value from being echoed as
# the cell's output.
artifacts = {
    # Model-ready matrices and targets
    'X_train_final.pkl': X_train_final,
    'y_train.pkl': y_train,
    'X_test_final.pkl': X_test_final,
    'y_test.pkl': y_test,
    # Data needed for reward calculation
    'df_model_for_rewards.pkl': df_model,
    'train_indices.pkl': train_indices,
    'test_indices.pkl': test_indices,
    # Fitted transformers
    'scaler.pkl': scaler,
    'ohe.pkl': ohe,
    'num_imputer.pkl': num_imputer,
    'cat_imputer.pkl': cat_imputer,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(OUTPUT_DIR, filename))


Step 1: Defining target variable 'is_default' and filtering...
Filtered for final statuses. Kept 1345350 rows.
Step 2: Selecting and engineering PRE-LOAN features...
Pre-loan feature engineering complete.
Step 3: Splitting data using time-based split...
Train set: (451060, 22), Test set: (894290, 22)
Step 4: Preprocessing features (Impute, Encode, Scale)...
Final training data shape: (451060, 132)
Final test data shape: (894290, 132)
Saving processed data to 'data' directory...


['data/cat_imputer.pkl']