## Training a Loan Approval Model
### Objective
The objective of this notebook is to train a binary classification model that can predict fulfilment of loans and hence, automate loan approvals.

### 1. Exploratory Data Analysis

In [1]:
conda install -n base ipykernel --update-deps --force-reinstall

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... Solving environment: ...working... done
done

## Package Plan ##

  environment location: D:\miniconda

  added / updated specs:
    - asttokens
    - backcall
    - bzip2
    - ca-certificates
    - colorama
    - comm
    - debugpy
    - decorator
    - executing
    - importlib-metadata
    - importlib_metadata
    - ipykernel
    - ipython
    - jedi
    - jupyter_client
    - jupyter_core
    - libffi
    - libsodium
    - libsqlite
    - libzlib
    - matplotlib-inline
    - nest-asyncio
    - openssl
    - packaging
    - parso
    - pickleshare
    - pip
    - platformdirs
    - prompt-toolkit
    - prompt_toolkit
    - psutil
    - pure_eval
    - pygments
    - python-dateutil
    - python=3.9
    - python_abi
    - pywin32
    - pyzmq
    - setuptools
    - six
    - stack_data
    - tk
    - tornado
    - traitlets
    - typing_extensions
    - tzdata
    - ucrt
    - 



  current version: 4.12.0
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c defaults conda



EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: D:\miniconda




In [3]:
pip install seaborn


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
     -------------------------------------- 294.9/294.9 kB 4.5 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump, load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

sns.set_theme(style="ticks", palette="pastel")

def remove_outliers(df, col: str):
    q3 = np.quantile(df[col], 0.75)
    q1 = np.quantile(df[col], 0.25)
    l_lmt = q1 - 1.5 * (q3 - q1)
    u_lmt = q3 + 1.5 * (q3 - q1)
    return df[(df[col] >= l_lmt) & (df[col] <= u_lmt)].copy()
    
#Read Sample Dataset
raw_df = pd.read_csv('../data/loan_dataset.csv')
raw_df.head()

Unnamed: 0,int_rate,emp_length,annual_inc,delinq_2yrs,fico_range_high,revol_bal,open_acc,loan_amnt,purpose,fully_paid
0,12.74,4 years,14000.0,0.0,689.0,9360.0,4.0,6150.0,credit_card,Yes
1,9.17,9 years,75000.0,0.0,679.0,16140.0,11.0,2000.0,debt_consolidation,Yes
2,10.99,10+ years,78000.0,0.0,729.0,23356.0,14.0,32000.0,credit_card,Yes
3,13.98,4 years,60000.0,0.0,679.0,20956.0,25.0,18000.0,debt_consolidation,No
4,19.99,< 1 year,35000.0,0.0,699.0,6848.0,5.0,4600.0,debt_consolidation,Yes


In [None]:
#Check number of records and columns
print(raw_df.shape)

: 

In [None]:
#Check of missing values
raw_df.isnull().sum()

: 

In [None]:
raw_df.describe(include='all')

: 

In [None]:
def log_txf(df, cols: list):
    for col in cols:
        df['log_'+col] = np.log(df[col]+1)
    return df

raw_df = log_txf(raw_df, ['annual_inc','revol_bal'])
raw_df.describe(include='all')

: 

In [None]:
plt.figure(figsize=(8,5))
g = sns.countplot(raw_df, x="emp_length", hue="fully_paid")
g = g.set_xticklabels(g.get_xticklabels(), rotation=30)

: 

In [None]:
plt_df = raw_df.sample(frac=0.05)
g = sns.pairplot(plt_df[['int_rate','fico_range_high','open_acc','log_annual_inc','log_revol_bal','loan_amnt','fully_paid']], 
                 hue='fully_paid')

: 

In [None]:
plt.figure(figsize=(5,3))
g = raw_df['fully_paid'].value_counts().plot.bar(title='Check Class Distribution', rot=0)

: 

### 2. Feature Engineering

In [None]:
def remap_emp_length(x):
    if x in ['< 1 year','1 year','2 years']:
        return 'less_than_3yr'
    if x in ['3 years','4 years','5 years']:
        return '3_to_5yr'
    if x in ['6 years','7 years','8 years','9 years']:
        return '6_to_9yr'
    return 'more_than_9yr'

raw_df['emp_len'] = raw_df['emp_length'].map(remap_emp_length)
g = sns.countplot(raw_df, x="emp_len", hue="fully_paid")

: 

In [None]:
plt.figure(figsize=(8,4))
g = sns.boxplot(x="emp_len", y="log_annual_inc", hue="fully_paid", data=raw_df, showfliers = False)

: 

In [None]:
numeric_cols = ['int_rate','log_annual_inc','fico_range_high','loan_amnt']
g = sns.heatmap(raw_df[numeric_cols].corr(), vmin=-1, vmax=1, annot=True)

: 

### 3. Data Preprocessing

In [None]:
#Select features that are discriminatory and uncorrelated
features = ['emp_len','int_rate','log_annual_inc','fico_range_high','loan_amnt']
numeric_features = ['int_rate','log_annual_inc','fico_range_high','loan_amnt']
categorical_features = ['emp_len']
label = 'fully_paid'

numeric_transformer = MinMaxScaler()

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

dat = preprocessor.fit_transform(raw_df)
new_cols = numeric_features + \
list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

trf_df = pd.DataFrame(dat, columns=new_cols)
trf_df.head()

: 

### 4. Modeling
#### 4.1 Train-Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(raw_df[features], \
                                                    raw_df[label], \
                                                    test_size=0.2, \
                                                    random_state=0)
X_train.head()

: 

#### 4.2 Train Classifier

In [None]:
random_forest = RandomForestClassifier(n_estimators=100,
                                       max_depth=4, 
                                       class_weight = "balanced",
                                       n_jobs=2)

clf = Pipeline(steps=[("preprocessor", preprocessor),\
                      ("binary_classifier", random_forest)]
              )

clf.fit(X_train, y_train)

: 

### 5. Evaluation

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

: 