# Credit Risk Modelling – Data Preprocessing

## 1. Load Data
Load loan dataset and create backup copy.

## 2. Sanity Checks
Inspect shape, missing values, and data types.

## 3. Feature Cleaning
- Employment length (`emp_length`)
- Earliest credit line (`earliest_cr_line`)
- Loan status / default variable

## 4. Train-Test Split
Prepare features and labels for modelling.

In [1]:
import pandas as pd

# Location of the raw loan export; the single read below is the only place
# that uses it.
file_path = r"loan_data.csv"

# Read the file exactly once. low_memory=False lets pandas infer every
# column's dtype from the whole file rather than chunk by chunk (the chunked
# default is what produces mixed-type dtype warnings on wide files).
df = pd.read_csv(file_path, low_memory=False)

# Remove rows in which every field is missing.
df = df.dropna(how="all")

# Remove columns whose name starts with "Unnamed" (pandas gives that name to
# header-less columns, typically a previously saved index).
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

# Independent snapshot of the cleaned frame, taken before any feature work.
loan_backup = df.copy()

df.head()

  df = pd.read_csv(file_path)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501.0,1296599.0,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430.0,1314167.0,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175.0,1313524.0,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863.0,1277178.0,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358.0,1311748.0,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [None]:
# Preview the last rows of the loaded frame (pandas shows five by default).
df.tail()

In [None]:
# Lift the column display limit so wide frames are shown in full instead of
# being truncated with "..." in the middle.
pd.set_option("display.max_columns", None)

In [None]:
# Show the first rows again, now that the column display limit has been lifted.
df.head()

In [None]:
# Show the last rows again, now that the column display limit has been lifted.
df.tail()

In [None]:
# List every column name as an array.
df.columns.values

In [None]:
# Print the frame summary (per-column dtype and non-null count, memory usage).
df.info()

In [None]:
# List the distinct raw emp_length labels to see which formats need cleaning.
df['emp_length'].unique()

In [None]:
# Reduce the raw employment-length labels to bare digit strings so they can be
# converted to numbers afterwards.
#
# NOTE(review): assumes labels of the form "10+ years", "< 1 year", "1 year",
# "3 years" - confirm against df['emp_length'].unique().
#
# Every step works on the result of the previous step; restarting from the raw
# column each time would keep only the last replacement. regex=False makes the
# patterns literal text: '+' is a regex metacharacter, and whether str.replace
# treats the pattern as a regex by default depends on the pandas version.
emp_length_clean = df['emp_length'].str.replace('+ years', '', regex=False)       # "10+ years" -> "10"
emp_length_clean = emp_length_clean.str.replace('< 1 year', str(0), regex=False)  # "< 1 year"  -> "0"
emp_length_clean = emp_length_clean.str.replace(' years', '', regex=False)        # "3 years"   -> "3"
emp_length_clean = emp_length_clean.str.replace(' year', '', regex=False)         # "1 year"    -> "1"

# Missing labels stay missing (NaN) here.
df['emp_length_int'] = emp_length_clean

In [None]:
# Check the Python type of the first cleaned value. .iloc[0] is positional:
# rows were dropped earlier without resetting the index, so a label lookup
# with [0] raises KeyError whenever the row labelled 0 is gone.
type(df['emp_length_int'].iloc[0])

In [None]:
# Turn the cleaned employment-length strings into numbers. errors='coerce'
# maps anything that cannot be parsed to NaN instead of raising, so the
# resulting column is numeric but not necessarily integer-typed.
emp_length_numeric = pd.to_numeric(df['emp_length_int'], errors='coerce')
df['emp_length_int'] = emp_length_numeric

In [None]:
# Confirm the first value is numeric after the conversion. .iloc[0] is
# positional: rows were dropped earlier without resetting the index, so a
# label lookup with [0] raises KeyError whenever the row labelled 0 is gone.
type(df['emp_length_int'].iloc[0])

In [None]:
# Inspect the raw earliest_cr_line values before parsing them into dates.
df['earliest_cr_line']

In [None]:
# Parse the "Mon-YY" strings (format '%b-%y') into timestamps; values that do
# not match the format become NaT because of errors='coerce'.
df['earliest_cr_line_date'] = pd.to_datetime(
    df['earliest_cr_line'],
    format='%b-%y',
    errors='coerce',
)

# A two-digit year can be placed in the wrong century. Any parsed date whose
# year is AFTER 2025 cannot be a real credit line yet, so it is moved back by
# 100 years into the 1900s.
century_shift = pd.DateOffset(years=100)
parsed_in_future = df['earliest_cr_line_date'].dt.year > 2025
df.loc[parsed_in_future, 'earliest_cr_line_date'] -= century_shift

In [None]:
# Check the type of the first parsed date. .iloc[0] is positional: rows were
# dropped earlier without resetting the index, so a label lookup with [0]
# raises KeyError whenever the row labelled 0 is gone.
type(df['earliest_cr_line_date'].iloc[0])

In [None]:
# Preview the time elapsed between the fixed reference date 2025-09-14 and each
# earliest credit line. The result is only displayed here, not stored.
pd.to_datetime('2025-09-14') - df['earliest_cr_line_date']

In [None]:
# Whole months elapsed between the fixed reference date and each earliest
# credit line.
reference_date = pd.to_datetime('2025-09-14')
days_since_first_line = (reference_date - df['earliest_cr_line_date']).dt.days

# 30.44 is the average month length in days (365.25 / 12). The rounding is
# applied to the month count itself; wrapping the Timedelta series in round()
# before taking .dt.days leaves the months unrounded. Rows with an unparsed
# date (NaT) yield NaN.
df['earliest_cr_line_date_month'] = (days_since_first_line / 30.44).round()

In [None]:
# Summary statistics for the months-since-earliest-credit-line feature; a
# negative minimum would point at dates that were parsed into the future.
df['earliest_cr_line_date_month'].describe()

In [None]:

# Binary target flag: 1 when loan_status is one of the listed risky statuses,
# 0 for every other status (a missing status also ends up as 0).
df['default'] = (
    df['loan_status']
    .isin(['Charged Off', 'Default', 'Late (31-120 days)', 'Late (16-30 days)'])
    .astype('int64')
)

# How many loans are flagged as default (1) versus not (0).
print("Default variable distribution:")
print(df['default'].value_counts())

# Row counts per original loan_status category, for cross-checking the flag.
print("\nLoan status distribution:")
print(df['loan_status'].value_counts())

In [None]:
from sklearn.model_selection import train_test_split

# Drop rows where 'loan_status' is missing: their outcome is unknown, so they
# cannot be used for training or testing.
df = df.dropna(subset=['loan_status'])

# Target: the binary 'default' flag derived from 'loan_status'.
# NOTE(review): the original cell used the raw multi-class 'loan_status' as y
# and left 'default' inside X - confirm the binary flag is the intended target.
y = df['default']

# Features: everything except the target and the status column it was derived
# from; keeping either of them in X would leak the answer into the model.
X = df.drop(columns=['loan_status', 'default'])

# Split into training (80%) and test (20%) sets. stratify=y keeps the class
# proportions equal in both parts; random_state fixes the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=16)

# Report the feature shapes and the matching label shapes.
print("Training features shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Test features shape:", X_test.shape)
print("Test labels shape:", y_test.shape)