In [84]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler

| Variable             | Description                                                                                                             |
|----------------------|-------------------------------------------------------------------------------------------------------------------------|
| credit.policy        | 1 if the customer meets the credit underwriting criteria of LendingClub.com, and 0 otherwise.                            |
| purpose              | The purpose of the loan (takes values "credit_card", "debt_consolidation", "educational", "major_purchase", "small_business", and "all_other"). |
| int.rate             | The interest rate of the loan, as a proportion (a rate of 11% would be stored as 0.11). Borrowers judged by LendingClub.com to be more risky are assigned higher interest rates. |
| installment          | The monthly installments owed by the borrower if the loan is funded.                                                    |
| log.annual.inc       | The natural log of the self-reported annual income of the borrower.                                                      |
| dti                  | The debt-to-income ratio of the borrower (amount of debt divided by annual income).                                       |
| fico                 | The FICO credit score of the borrower.                                                                                  |
| days.with.cr.line    | The number of days the borrower has had a credit line.                                                                   |
| revol.bal            | The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle).                           |
| revol.util           | The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available). |
| inq.last.6mths       | The borrower's number of inquiries by creditors in the last 6 months.                                                    |
| delinq.2yrs          | The number of times the borrower had been 30+ days past due on a payment in the past 2 years.                            |
| pub.rec              | The borrower's number of derogatory public records (bankruptcy filings, tax liens, or judgments).                       |

<a href='https://www.kaggle.com/datasets/itssuru/loan-data' target='_blank'>Dataset Source</a>

In [85]:
# All the data is clean and ready to be used for analysis
df = pd.read_csv('loan_data.csv')

# A brief look on the data
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [86]:
len(df[df['not.fully.paid'] == 0]), len(df[df['not.fully.paid'] == 1])

(8045, 1533)

We have more data for loans that were not fully paid than for loans that were fully paid.<br>
To better represent the data, we will use histnorm='probability' in the px.histogram function.<br>
This will normalize the histogram such that the sum of the bar's heights equals to 1,<br>
and each bar height represents the proportion of data points in that bin.

In [87]:
for col in df.columns[:-1]:
    fig = px.histogram(df, x=col, color='not.fully.paid', nbins=50, barmode='overlay',
                       opacity=0.8, histnorm='probability', title=col,
                       labels={'not.fully.paid': 'Loan Status'}
                       )
    
    fig.update_layout(xaxis_title=col, yaxis_title='Probability')
    
    
    newnames = {'0':'Fully paid' , '1':'NOT Fully paid'}
    fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))
    
    fig.show()

In [88]:
# All data is numrical except for the purpose column
df['purpose'].value_counts()

# Use pandas' get_dummies function to one-hot encode the purpose column
one_hot = pd.get_dummies(df['purpose'])

# Add the one-hot encoded columns back to the original dataframe
df = pd.concat([df, one_hot], axis=1)

# Drop the original purpose column, since it's no longer needed
df = df.drop('purpose', axis=1)

# We will also drop the 'credit.policy' column, since we are building an app
# and we don't expect the user to know whether or not person meets the credit criteria on a lendingclub.com
df = df.drop('credit.policy', axis=1)

# I trained the model with the 'credit.policy' column and without it, and the results without are slightly worse, by 0.23%.
# This is insignificant, so I will drop the column to make the app more user friendly.

In [89]:
# This one is actually doesn't have real-world interpretation, but as we didn't introduce new information
# we only transformed the existing info.
# It helped improve the model accuracy without overfitting
df['dti_pct'] = df['dti'] * 100 / df['log.annual.inc']

# Create column for total credit utilization
df['credit_utilization'] = df['revol.bal'] * df['revol.util']

# Adding these two columns helped increase the accuracy of the model by 2~3% without overfitting.

In [90]:
print(df.columns)

columns = ['int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec',
       'all_other', 'credit_card', 'debt_consolidation', 'educational',
       'home_improvement', 'major_purchase', 'small_business', 'dti_pct', 'credit_utilization']

# We print the columns to make sure that the columns we want to use are in the correct order,
# and we can easily copy and paste them into the columns variable.

Index(['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
       'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
       'delinq.2yrs', 'pub.rec', 'not.fully.paid', 'all_other', 'credit_card',
       'debt_consolidation', 'educational', 'home_improvement',
       'major_purchase', 'small_business', 'dti_pct', 'credit_utilization'],
      dtype='object')


In [91]:
len(columns)

# We will need this value to determine the number of nodes in the input layer of our neural network.

20

In [92]:
X = df[columns].values
y = df['not.fully.paid'].values

# Converting to numpy arrays, as this is what the model will expect

In [94]:
max_values = df.max()
formatted_max_values = max_values.apply(lambda x: f'{x:.2f}')
print(formatted_max_values)

# As we can see we have a significant difference in magnitudes, so we will need to scale the data.

int.rate                     0.22
installment                940.14
log.annual.inc              14.53
dti                         29.96
fico                       827.00
days.with.cr.line        17639.96
revol.bal              1207359.00
revol.util                 119.00
inq.last.6mths              33.00
delinq.2yrs                 13.00
pub.rec                      5.00
not.fully.paid               1.00
all_other                    1.00
credit_card                  1.00
debt_consolidation           1.00
educational                  1.00
home_improvement             1.00
major_purchase               1.00
small_business               1.00
dti_pct                    294.47
credit_utilization    94249287.00
dtype: object


In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# I saved the scaling parameters to a file, so that we can use them to scale the data in the app.

We saw that the values in different columns have a significant difference in their magnitudes.<br>To make sure that the model doesn't give more importance to the columns with higher magnitudes,<br>we will scale the data using the StandardScaler.

The formula for standardization using the StandardScaler can be expressed as:

Z = (X - μ) / σ

Where:
- Z is the standardized value
- X is the original value
- μ is the mean of the column
- σ is the standard deviation of the column

In [28]:
print('Before balancing values proportion:', len(y[y == 0]), len(y[y == 1]))
over = RandomOverSampler()
X, y = over.fit_resample(X, y)
print('After balancing values proportion:', len(y[y == 0]), len(y[y == 1]))

# To further optimize our neural network I will balance the data using the RandomOverSampler
# as we have a big difference in cases quantity.

Before balancing values proportion: 8045 1533
After balancing values proportion: 8045 8045


In [29]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# We split our data into 3 sets: training (60%), validation (20%), and testing (20%).

In [43]:
model = tf.keras.Sequential([tf.keras.layers.Dense(20, activation='relu'),
                             tf.keras.layers.Dropout(0.05),
                             tf.keras.layers.Dense(40, activation='relu'),
                             tf.keras.layers.Dropout(0.05),
                             tf.keras.layers.Dense(80, activation='relu'),
                             tf.keras.layers.Dropout(0.05),
                             tf.keras.layers.Dense(160, activation='relu'),
                             tf.keras.layers.Dropout(0.05),
                             tf.keras.layers.Dense(320, activation='relu'),
                             tf.keras.layers.Dropout(0.05),
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy']
              )

# Here I created the model with 5 hidden layers, input layer equals to number of columns and each next is double the previous.
# This is not a firt itteration of a model, I tried many different combinations of layers, nodes, optimizers, data augumentations etc.,
# and this one gave me the best results.

In [44]:
model.evaluate(X_test, y_test)

# Here we check the model before trainig as a good practice.
# We can see that the accuracy is 50%, which is expected as we have a balanced dataset.



[0.6920536756515503, 0.4990677535533905]

In [45]:
model.fit(X_train, y_train, batch_size=128, epochs=1000, validation_data=(X_valid, y_valid))

# Here we train the model, I used 1000 epochs as I noticed that the model is still learning after 500 epochs.

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x18cd81a9060>

In [46]:
model.evaluate(X_test, y_test)

# Great accuracy on a test set that the model has never seen before!



[0.34348544478416443, 0.8673089146614075]

accuracy: 0.90 -  val_accuracy: 0.87 ~ 3%

The difference between the training and validation accuracy is less than 5%!<br>
This means that the model is definitely not overfitting.<br>
This is actually a very good model, with an accuracy of 87% on the test set.<br>