# Data

| Feature Name               | Description                           |
|----------------------------|-------------------------------------|
| person_age                 | Age                                 |
| person_income              | Annual Income                       |
| person_home_ownership      | Home ownership                      |
| person_emp_length          | Employment length (in years)        |
| loan_intent                | Loan intent                        |
| loan_grade                 | Loan grade                        |
| loan_amnt                 | Loan amount                       |
| loan_int_rate              | Interest rate                     |
| loan_status                | Loan status (0 is non default, 1 is default) |
| loan_percent_income        | Percent income                    |
| cb_person_default_on_file  | Historical default                 |
| cb_preson_cred_hist_length | Credit history length              |


# Imports


In [None]:
!pip install xgboost 

In [76]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestRegressor 
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [21]:
original_data=pd.read_csv('credit_risk_dataset.csv')

In [22]:
print(original_data.shape)
32581*0.04

(32581, 12)


1303.24

# Transformations

## removing outliers

In [23]:
mask = (
    (original_data['person_age'] > 85) |
    (original_data['person_income'] > 200000) |
    (original_data['person_emp_length'] > 18) |
    (original_data['loan_amnt'] > 28000)
)

In [24]:
print("Number of rows to be removed:", mask.sum())
print("Original shape:", original_data.shape)
print("Filtered shape:", original_data[~mask].shape)
data=original_data[~mask]

Number of rows to be removed: 941
Original shape: (32581, 12)
Filtered shape: (31640, 12)


## normalizing 

In [25]:
data['loan_amnt']=np.log(data['loan_amnt'])
data['person_income']=np.log(data['person_income'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['loan_amnt']=np.log(data['loan_amnt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['person_income']=np.log(data['person_income'])


## one hot encoding

In [26]:
# Create one-hot encoded columns for person_home_ownership
home_ownership_encoded = pd.get_dummies(data['person_home_ownership'], prefix='home_ownership',  dtype=int)

# Create one-hot encoded columns for loan_intent  
loan_intent_encoded = pd.get_dummies(data['loan_intent'], prefix='loan_intent',  dtype=int)

# Concatenate the encoded columns with the original data
data_encoded = pd.concat([data, home_ownership_encoded, loan_intent_encoded], axis=1)

# Drop the original categorical columns
data_encoded = data_encoded.drop(['person_home_ownership','home_ownership_OTHER', 'loan_intent', 'loan_intent_HOMEIMPROVEMENT'], axis=1)

data = data_encoded


## ordinal and binary transformation

In [None]:
# Binary transformation for cb_person_default_on_file
data['cb_person_default_on_file'] = (data['cb_person_default_on_file'] == 'Y').astype(int)

# Ordinal transformation for loan_grade (A=1, B=2, C=3, D=4, E=5, F=6, G=7)
grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
data['loan_grade'] = data['loan_grade'].map(grade_mapping)


# Missing values

In [34]:
# Create a dataset without missing values(train dataset)
data_no_missing = data.dropna()

# Create a dataset with records that contain missing values(evaluate dataset)
data_with_missing = data[data.isnull().any(axis=1)]

# Print the shapes of both datasets
print("Dataset without missing values shape:", data_no_missing.shape)
print("Dataset with missing values shape:", data_with_missing.shape)

Dataset without missing values shape: (27794, 18)
Dataset with missing values shape: (3846, 18)


In [36]:
targets = data_no_missing[['person_emp_length', 'loan_int_rate']]

In [38]:
scoring_function = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

## Person_emp_lenght

In [99]:
X = data_no_missing.drop('person_emp_length', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, data_no_missing['person_emp_length'], test_size=0.2, random_state=22)

random forest

In [101]:
model = RandomForestRegressor(random_state=42, n_estimators=300, max_depth=5)
scores = cross_validate(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, return_train_score=True)

print("Cross-validation scores:", -scores['test_score'])


Cross-validation scores: [2.85826353 2.88019276 2.82612494 2.84445746 2.85346545]


lgbmr 

In [102]:
model = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    random_state=42
)

# Cross-validation
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=-1
)

mae_scores = -cv_results['test_score']
print("Cross-validation MAEs:", mae_scores)

Cross-validation MAEs: [2.83002769 2.85933514 2.79789742 2.81143921 2.7982004 ]


XGBR

In [None]:
model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    verbosity=0
)

# Cross-validation
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    return_train_score=True
)

# Convert to positive MAE
mae_scores = -cv_results['test_score']
print("Cross-validation MAEs:", mae_scores)


Cross-validation MAEs: [2.83263668 2.85904282 2.7876798  2.81434441 2.81092383]
Average MAE: 2.8209255090973095


In [96]:
print(data['person_emp_length'].mean())
2.8209/4.6*100

4.631039864733043


61.323913043478264

## loan_int_rate

In [67]:
X = data_no_missing.drop('loan_int_rate', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, data_no_missing['loan_int_rate'], test_size=0.2, random_state=22)

random forest

In [68]:
model = RandomForestRegressor(random_state=42, n_estimators=300, max_depth=5)
scores = cross_validate(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, return_train_score=True)

print("Cross-validation scores:", -scores['test_score'])

Cross-validation scores: [0.78918031 0.78084566 0.79441703 0.77463867 0.78113804]


lgmr

In [None]:
model = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    random_state=22
)

# Cross-validation
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=-1
)

mae_scores = -cv_results['test_score']
print("Cross-validation MAEs:", mae_scores)

Cross-validation MAEs: [0.79432965 0.78461909 0.79363067 0.77717533 0.78502012]


XGBR

In [None]:
model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.01,
    random_state=22,
    verbosity=0
)

# Cross-validation
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    return_train_score=True
)

# Convert to positive MAE
mae_scores = -cv_results['test_score']
print("Cross-validation MAEs:", mae_scores)
print("Training scores:", cv_results['train_score'])

Cross-validation MAEs: [0.79073387 0.78179577 0.79554029 0.78172074 0.78182814]


In [77]:
# Fit on full training set
model.fit(X_train, y_train)

# Predict and evaluate on test set
y_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print("Test MAE:", test_mae)

Test MAE: 0.7889745032236203


In [95]:
print(data['loan_int_rate'].mean())
0.78/10.990000 *100

10.981918358787963


7.097361237488626

Our model’s average error is about 7% of the typical value

# Imputing Missing Values

In [None]:
missing_loan_int_rate = data[data['loan_int_rate'].isnull()]

X_missing = missing_loan_int_rate.drop('loan_int_rate', axis=1)
predicted_loan_int_rate = model.predict(X_missing)
    

data_imputed = data.copy()   
# Fill missing values with predictions
data_imputed.loc[data['loan_int_rate'].isnull(), 'loan_int_rate'] = predicted_loan_int_rate
