In [1]:
# Mount Google Drive so that files under /content/drive are reachable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Location of the loan-approval CSV on the mounted Google Drive.
path = (
    '/content/drive/MyDrive/Mukund/ML_DL/technocolabs/major_project/data/'
    'LC_loan_approval_optimization.csv'
)

In [7]:
# Load the CSV, discard its first column, and give the remaining columns readable names.
column_names = ['Amount_Requested', 'Risk_Score', 'DTI_Ratio', 'Employment_Length', 'Target']
df = pd.read_csv(path).iloc[:, 1:]
df.columns = column_names
df.head()

Unnamed: 0,Amount_Requested,Risk_Score,DTI_Ratio,Employment_Length,Target
0,3600.0,677.0,5.91,10,1
1,24700.0,717.0,16.06,10,1
2,20000.0,697.0,10.78,10,1
3,10400.0,697.0,25.37,3,1
4,11950.0,692.0,10.2,4,1


In [8]:
# Positional split: the first four columns are the model inputs; everything from
# the fifth column onward is the target (sliced, so it stays a DataFrame).
n_features = 4
X = df.iloc[:, :n_features]
Y = df.iloc[:, n_features:]

In [38]:
# Report the dimensions of the feature frame and of the target frame.
features_shape = X.shape
target_shape = Y.shape
print(f'Shape of the independent dataset: {features_shape}')
print(f'Shape of the Target column: {target_shape}')

Shape of the independent dataset: (2072806, 4)
Shape of the Target column: (2072806, 1)


In [9]:
# First stage of the 80:10:10 train:validation:test split:
# hold out 10% of the rows as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=1,
    shuffle=True,
    stratify=Y,
)

In [10]:
# Second stage: carve the validation set out of the rows left after the test split.
# 0.111 of the remaining ~90% is roughly 10% of the full dataset.
# NOTE: this cell overwrites X_train / y_train, so re-running it on its own
# shrinks the training set again; re-run the previous split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.111,
    random_state=1,
    shuffle=True,
    stratify=y_train,
)

In [11]:
# Report the size of each partition produced by the two splits.
train_shape = X_train.shape
val_shape = X_val.shape
test_shape = X_test.shape
print(f'Shape of the train dataset: {train_shape}')
print(f'Shape of the validation data: {val_shape}')
print(f'Shape of the test dataset: {test_shape}')

Shape of the train dataset: (1658451, 4)
Shape of the validation data: (207074, 4)
Shape of the test dataset: (207281, 4)


# Data normalization: StandardScaler()

Standardization is a scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of each attribute becomes zero and the resulting distribution has a unit standard deviation.
Standardization does not have a bounding range, so outliers are not clipped or squeezed into a fixed interval. Note, however, that outliers still influence the mean and standard deviation used for scaling.

In [12]:
# Standardizer that centers each feature and scales it to unit variance
# (arguments spelled out; these are the library defaults).
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
# Learn the scaling statistics from the training features only, then apply them to those same features.
df_scaled = scaler.fit(X_train).transform(X_train)

In [14]:
# Wrap the scaled values in a DataFrame that carries the original feature names.
feature_names = X_train.columns
df_scaled = pd.DataFrame(data=df_scaled, columns=feature_names)

In [15]:
# Preview the first five standardized training rows.
df_scaled.head(n=5)

Unnamed: 0,Amount_Requested,Risk_Score,DTI_Ratio,Employment_Length
0,-0.107113,1.177179,-0.021953,-0.924049
1,-1.105285,-1.31903,-0.017697,-0.924049
2,-0.205076,0.302339,-0.022419,1.547039
3,0.536272,0.53563,-0.021014,-0.924049
4,-0.070045,0.53563,-0.019461,1.547039


In [16]:
# Persist the fitted StandardScaler to std_scaler.bin so the same scaling can be reapplied later.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23;
# the standalone `joblib` package exposes the same dump/load functions.
from joblib import dump, load

dump(scaler, 'std_scaler.bin', compress=True)



['std_scaler.bin']

In [17]:
# Reload the persisted scaler from disk.
scaler_file = 'std_scaler.bin'
std = load(scaler_file)

In [18]:
# Scale the validation features with the reloaded scaler (transform only, no refitting).
df_val_scaled = std.transform(X=X_val)

# Hyperparameter tuning and Model building 

Each model has its own set of parameters that need to be tuned to get optimal output. For every model, our goal is to minimize the error, that is, to make predictions as close as possible to the actual values. This is one of the major objectives of hyperparameter tuning.


------------------------------------------------------------------------------

### **Hyperparameters to be tuned**
1. How many hidden layers should the network have?
2. How many neurons should each hidden layer have?
3. What learning rate should the optimizer use?

In [None]:
!pip install keras-tuner

In [22]:
import tensorflow as tf
from tensorflow import keras
from kerastuner.tuners import Hyperband
from tensorflow.keras import layers

In [23]:
'''install the keras-tuner for neural network hyperparameter tuning'''

!pip install -q -U keras-tuner

In [24]:
# Record the TensorFlow version this notebook ran against.
tf_version = tf.__version__
print(tf_version)

2.4.1


In [25]:
def build_model(hp):
  """Build and compile a dense binary classifier from tuner-supplied hyperparameters.

  Hyperparameters requested from ``hp`` (in this order):
    number_of_layers: how many tunable ReLU layers to stack (2 to 7).
    unit_<i>: width of the i-th tunable layer (min 2, max 100, step 32).
    learning_rate: learning rate for Adam, one of 1e-2, 1e-3 or 1e-4.

  Args:
    hp: hyperparameter container handed in by the tuner; queried through
      ``hp.Int`` and ``hp.Choice``.

  Returns:
    The compiled ``keras.Sequential`` model (binary cross-entropy loss,
    accuracy metric).
  """
  network = keras.Sequential()

  # Tunable part: a stack of ReLU layers whose depth and widths come from the tuner.
  depth = hp.Int('number_of_layers', 2, 7)
  for layer_idx in range(depth):
    width = hp.Int(f'unit_{layer_idx}', min_value=2, max_value=100, step=32)
    network.add(layers.Dense(units=width, activation='relu'))

  # Fixed part: a 20-unit linear layer followed by a single sigmoid output unit.
  network.add(layers.Dense(20, activation='linear'))
  network.add(layers.Dense(1, activation='sigmoid'))

  # The learning rate is requested last so the tuner sees the same hyperparameter order.
  chosen_lr = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
  network.compile(
      optimizer=keras.optimizers.Adam(chosen_lr),
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )

  return network

In [26]:
# Hyperband tuner over build_model's search space, ranking trials by validation accuracy.
# A fixed seed makes the tuner's sampling of hyperparameter configurations repeatable
# (same value as the random_state used for the data splits).
# Trial results are written under <directory>/<project_name>.
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=5,
    hyperband_iterations=2,
    seed=1,
    directory='my_dir',
    project_name='model_loan1',
)


In [27]:
# Print the hyperparameters registered with the tuner (compact form).
tuner.search_space_summary(extended=False)

Search space summary
Default search space size: 4
number_of_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 7, 'step': 1, 'sampling': None}
unit_0 (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 100, 'step': 32, 'sampling': None}
unit_1 (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 100, 'step': 32, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [28]:
# Run the hyperparameter search on the scaled training data,
# scoring every trial on the scaled validation split.
validation_pair = (df_val_scaled, y_val)
tuner.search(df_scaled, y_train, epochs=5, validation_data=validation_pair)

Trial 20 Complete [00h 09m 36s]
val_accuracy: 0.885963499546051

Best val_accuracy So Far: 0.8876053690910339
Total elapsed time: 01h 39m 03s
INFO:tensorflow:Oracle triggered exit


In [29]:
# Show the hyperparameter values of the top-ranked trial.
best_hyperparameters = tuner.get_best_hyperparameters()[0]
print(best_hyperparameters.values)

{'number_of_layers': 4, 'unit_0': 66, 'unit_1': 2, 'learning_rate': 0.001, 'unit_2': 98, 'unit_3': 2, 'unit_4': 66, 'tuner/epochs': 5, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [30]:
# Show the model object rebuilt from the top-ranked trial.
best_models = tuner.get_best_models(1)
print(best_models[0])

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f4961d44cd0>


In [31]:
# Print the ten best trials with their hyperparameters and scores.
tuner.results_summary(num_trials=10)

Results summary
Results in my_dir/model_loan1
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
number_of_layers: 4
unit_0: 66
unit_1: 2
learning_rate: 0.001
unit_2: 98
unit_3: 2
unit_4: 66
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.8876053690910339
Trial summary
Hyperparameters:
number_of_layers: 7
unit_0: 66
unit_1: 66
learning_rate: 0.001
unit_2: 34
unit_3: 34
unit_4: 34
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
unit_5: 2
unit_6: 2
Score: 0.887528121471405
Trial summary
Hyperparameters:
number_of_layers: 4
unit_0: 66
unit_1: 34
learning_rate: 0.001
unit_2: 2
unit_3: 66
unit_4: 98
tuner/epochs: 2
tuner/initial_epoch: 0
tuner/bracket: 1
tuner/round: 0
Score: 0.8875136375427246
Trial summary
Hyperparameters:
number_of_layers: 4
unit_0: 66
unit_1: 34
learning_rate: 0.001
unit_2: 2
unit_3: 66
unit_4: 98
tuner/epochs: 5
tuner/initial_epoch: 2
tuner/bracket: 1
tuner/round: 

In [32]:
# Retrieve the top-ranked model from the completed tuner trials.
(model,) = tuner.get_best_models(num_models=1)

In [33]:
from sklearn.metrics import accuracy_score

# Standardize the test features with the saved scaler before predicting.
df_test_scaled = std.transform(X_test)

# `Sequential.predict_classes` was deprecated in TensorFlow 2.5 and removed in 2.6.
# For a single sigmoid output unit, the equivalent is to threshold the predicted
# probability at 0.5 and cast to int32.
y_prediction = (model.predict(df_test_scaled) > 0.5).astype('int32')
print("\n\nThe Test Accuracy of the model is: {} %".format(accuracy_score(y_test, y_prediction) * 100.))





The Test Accuracy of the model is: 88.78720191430956 %


# Save and Load the Model

In [34]:
# Persist the entire model as one HDF5 file; the '.h5' suffix is what selects the HDF5 format.
model_file = '/content/drive/MyDrive/Mukund/ML_DL/technocolabs/major_project/model/model.h5'
model.save(model_file)

In [35]:
# Restore the saved model (weights and optimizer included) from the HDF5 file.
saved_model_path = '/content/drive/MyDrive/Mukund/ML_DL/technocolabs/major_project/model/model.h5'
new_model = tf.keras.models.load_model(saved_model_path)

In [36]:
# Predict with the reloaded model and report its test accuracy.
# `Sequential.predict_classes` was deprecated in TensorFlow 2.5 and removed in 2.6.
# For a single sigmoid output unit, the equivalent is to threshold the predicted
# probability at 0.5 and cast to int32.
y_pred = (new_model.predict(df_test_scaled) > 0.5).astype('int32')
print("\n\nThe Test Accuracy of the model is: {} %".format(accuracy_score(y_test, y_pred) * 100.))





The Test Accuracy of the model is: 88.78720191430956 %


In [37]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the reloaded model's test predictions, followed by a blank line.
print(confusion_matrix(y_test, y_pred))
print()

# Per-class precision / recall / F1 report.
# NOTE(review): these display names are attached to labels 0 and 1 in that order;
# confirm they match the actual meaning of the Target column.
target_names = ['Fully Paid', 'Default']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

[[ 80488  23152]
 [    90 103551]]

              precision    recall  f1-score   support

  Fully Paid       1.00      0.78      0.87    103640
     Default       0.82      1.00      0.90    103641

    accuracy                           0.89    207281
   macro avg       0.91      0.89      0.89    207281
weighted avg       0.91      0.89      0.89    207281

