In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [2]:
# Load the training data from train.csv (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


## Data preprocessing

In [3]:
# Remove the 'id' column (presumably just a row identifier) before modelling,
# then preview the result.
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


### Encode categorical features

In [4]:
## OHE - Cols
# One-hot encode the home-ownership column. sparse_output=False makes the
# encoder return a dense NumPy array directly; with sparse output the result
# would need an explicit .toarray() call before it can be wrapped in a DataFrame.
ohe = OneHotEncoder(sparse_output=False)
person_home_ownership_ohe = ohe.fit_transform(df[['person_home_ownership']])
person_home_ownership_ohe

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

In [5]:
# Wrap the encoded array in a DataFrame whose column names come from the encoder.
person_home_ownership_ohe_df = pd.DataFrame(
    person_home_ownership_ohe,
    columns=ohe.get_feature_names_out(['person_home_ownership']),
)
# Replace the raw categorical column with its one-hot columns.
df = pd.concat(
    [df.drop(columns='person_home_ownership'), person_home_ownership_ohe_df],
    axis=1,
)

In [6]:
# Preview the frame after the home-ownership column was one-hot encoded.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT
0,37,35000,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,0.0,0.0,0.0,1.0
1,22,56000,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,0.0,0.0,1.0,0.0
2,29,28800,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,0.0,0.0,1.0,0.0
3,30,70000,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,0.0,0.0,0.0,1.0
4,22,60000,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,0.0,0.0,0.0,1.0


In [7]:
# Encoder instance that is passed to convert_to_ohe below. This rebinds `ohe`,
# so the encoder fitted on person_home_ownership earlier is no longer reachable
# under this name.
ohe = OneHotEncoder(sparse_output=False)
def convert_to_ohe(dataframe, col_name, ohe):
    """Replace one or more categorical columns with their one-hot encoding.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    col_name : str or sequence of str
        A single column name, or a list/tuple of column names.
    ohe : encoder object
        Must provide ``fit_transform(frame)`` returning a dense 2-D array and
        ``get_feature_names_out(names)`` (for example a ``OneHotEncoder``
        created with ``sparse_output=False``). It is fitted on the requested
        columns, so the caller can persist this same object afterwards.

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested columns removed and the encoded columns
        appended on the right, in the order the columns were requested.
    """
    # Decide by type, not by len(): len() of a string is its character count,
    # which sent ordinary column names down the per-character loop.
    columns = [col_name] if isinstance(col_name, str) else list(col_name)
    if not columns:
        return dataframe

    # Fit the caller's encoder on all requested columns at once; creating new
    # encoders here would leave the passed-in `ohe` unfitted.
    encoded = ohe.fit_transform(dataframe[columns])
    encoded_df = pd.DataFrame(
        encoded,
        columns=ohe.get_feature_names_out(columns),
        # Reuse the source index so concat aligns row-for-row even when the
        # frame does not carry a default RangeIndex.
        index=dataframe.index,
    )
    return pd.concat([dataframe.drop(columns=columns), encoded_df], axis=1)

In [8]:
# One-hot encode the remaining categorical columns in a single call.
df = convert_to_ohe(
    df,
    ['loan_intent', 'loan_grade', 'cb_person_default_on_file'],
    ohe,
)

In [9]:
# Preview the frame once all categorical columns have been one-hot encoded.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,37,35000,0.0,6000,11.49,0.17,14,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,22,56000,6.0,4000,13.35,0.07,2,0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,29,28800,8.0,6000,8.9,0.21,10,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,30,70000,14.0,12000,11.11,0.17,5,0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,22,60000,2.0,6000,6.92,0.1,3,0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Persist the encoder object so it can be reloaded later.
# NOTE(review): confirm that `ohe` was actually fitted by convert_to_ohe and
# that it covers every encoded column; person_home_ownership was encoded by a
# separate encoder instance earlier in the notebook, before `ohe` was rebound.
with open('encoder.pkl', 'wb') as file:
    pickle.dump(ohe, file)

## Splitting data into dependent and independent features

In [11]:
# Target is the loan_status column; features are every other column.
y = df['loan_status']
X = df.drop(columns=['loan_status'])


In [12]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Persist the fitted scaler so the same scaling can be re-applied later.
with open('scaler.pkl', mode='wb') as file:
    pickle.dump(scaler, file)

## ANN Implementation

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [15]:
# Feed-forward binary classifier: two ReLU hidden layers followed by a single
# sigmoid output unit. The input width is declared with an explicit Input
# object instead of `input_shape=` on the first Dense layer, which Keras warns
# against for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Print the model's layers and parameter counts.
model.summary()

In [17]:
# Adam optimizer with an explicit learning rate of 0.01.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

In [18]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as an additional metric.
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [19]:
# Per-run TensorBoard directory, e.g. logs/fit/20241004-183552. The trailing
# slash keeps every run under logs/fit/, and the timestamp contains no spaces
# so the path can be passed unquoted to `%tensorboard --logdir`.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [20]:
# TensorBoard logging into this run's directory, with histograms enabled.
tf_callbacks = TensorBoard(log_dir=log_dir, histogram_freq=1)
# Stop training once val_loss has gone 15 epochs without improving, and roll
# the model back to the best weights seen.
es = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)

In [28]:
# Train for up to 100 epochs with TensorBoard logging and early stopping.
# NOTE(review): the test split is used as validation data, so early stopping
# (which monitors val_loss) is driven by the test split; this notebook keeps no
# untouched hold-out set. Consider carving out a separate validation split.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, callbacks=[tf_callbacks, es])

Epoch 1/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 426us/step - accuracy: 0.9466 - loss: 0.1779 - val_accuracy: 0.9477 - val_loss: 0.1784
Epoch 2/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 412us/step - accuracy: 0.9467 - loss: 0.1778 - val_accuracy: 0.9464 - val_loss: 0.1771
Epoch 3/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 405us/step - accuracy: 0.9494 - loss: 0.1707 - val_accuracy: 0.9467 - val_loss: 0.1813
Epoch 4/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 400us/step - accuracy: 0.9461 - loss: 0.1803 - val_accuracy: 0.9482 - val_loss: 0.1830
Epoch 5/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 398us/step - accuracy: 0.9474 - loss: 0.1744 - val_accuracy: 0.9475 - val_loss: 0.1811
Epoch 6/100
[1m1467/1467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 390us/step - accuracy: 0.9477 - loss: 0.1724 - val_accuracy: 0.9477 - val_loss: 0.183

In [29]:
# Persist the trained model to model.h5 in the working directory.
model.save('model.h5')



In [30]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [37]:
%tensorboard --logdir logs/logs/logs/fit20241004-183552

In [None]:
#Loading the prediction file for end to end predictions
