# Credit Modelling Random Forest Classifier and ANN

Import libraries and dataset

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# List every file under the input directory. The walk starts from the same
# '../input' root that read_csv uses below, so the listing reflects the
# location the dataset is actually loaded from (os.walk yields nothing,
# without raising, when the directory does not exist).
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the credit-risk dataset into a DataFrame.
data = pd.read_csv('../input/credit-risk/original.csv')

In [None]:
# Preview the first rows of the dataset.
data.head()

Check data structure:
* age has null values

In [None]:
# Print the column summary of the raw data (used here to spot missing values).
data.info()

* age has values < 0. Assuming these are data-entry errors (a negative sign entered accidentally), change the values back to positive.

In [None]:
# Show the rows whose recorded age is negative.
negative_age_mask = data['age'] < 0
data[negative_age_mask]

In [None]:
# Flip the sign of negative ages; rows with non-negative or missing ages
# are left untouched because they never match the mask.
is_negative_age = data['age'] < 0
data.loc[is_negative_age, 'age'] = -data.loc[is_negative_age, 'age']


*  To prevent loss of data, replace the null age values with the mean age.

In [None]:
# Show the rows where age is missing.
missing_age_mask = data['age'].isnull()
data[missing_age_mask]

In [None]:
# Impute missing ages with the mean of the observed (non-null) ages.
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/validation split performed later in the notebook.
observed_ages = data['age'].dropna()
age_mean = observed_ages.mean()
data['age'] = data['age'].fillna(age_mean)

*  Check the data again. There are no null values. We can proceed to examine the variables.

In [None]:
# Print the column summary again to confirm the age column has been filled.
data.info()

**Examine the distribution of the variables**

* Check the Income data. Income data appears to be evenly distributed across 20k to 70k and there is no clear difference between distribution of income for defaulters and non-defaulters.

In [None]:
# Income: overall histogram on the left, per-default-class violin plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, violin_ax = axes

# NOTE(review): distplot is deprecated in recent seaborn releases; histplot is
# the documented replacement once the environment's seaborn version allows it.
income_hist = sns.distplot(data["income"], bins=50, ax=hist_ax)
income_hist.set_title("Histogram of Income")

viz_1 = sns.violinplot(data=data, x='default', y='income', ax=violin_ax)
viz_1.set_title('Density and distribution of income for default')


* Check the Age data. Age data appears to be evenly distributed from around 20 to 60. The range of defaulters' age is lower and concentrates around 30, while the range of non-defaulters' age is higher and concentrates around 50 to 60. 

In [None]:
# Age: overall histogram on the left, per-default-class violin plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, violin_ax = axes

# NOTE(review): distplot is deprecated in recent seaborn releases; histplot is
# the documented replacement once the environment's seaborn version allows it.
age_hist = sns.distplot(data["age"], bins=20, ax=hist_ax)
age_hist.set_title("Histogram of Age")

viz_2 = sns.violinplot(data=data, x='default', y='age', ax=violin_ax)
viz_2.set_title('Density and distribution of age for default')

* Check the Loan data. Loan data appears to be right-skewed and concentrates around 0 to 2.5k. The range of defaulters' loan amount is higher and concentrates around 6k to 8k, while range of non-defaulters is lower and concentrates around 2k. 

In [None]:
# Loan: overall histogram on the left, per-default-class violin plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, violin_ax = axes

# NOTE(review): distplot is deprecated in recent seaborn releases; histplot is
# the documented replacement once the environment's seaborn version allows it.
loan_hist = sns.distplot(data["loan"], bins=20, ax=hist_ax)
loan_hist.set_title("Histogram of Loan")

viz_3 = sns.violinplot(data=data, x='default', y='loan', ax=violin_ax)
viz_3.set_title('Density and distribution of loan for default')

* The number of defaults in the data seems to be around 250 out of 2000 (12.5%)

In [None]:
# Count clients per default class. After reset_index the frame has one row per
# class, with the class value in 'default' and the row count in 'clientid'.
grouped1 = data.groupby(['default'])['clientid'].count().reset_index()

label = list(grouped1['clientid'])
plt.bar(grouped1['default'], grouped1['clientid'])

# Annotate every bar with its count, nudged slightly left of the bar centre
# and just above the bar top.
for default_class, count in zip(grouped1['default'], label):
    plt.text(x=default_class - 0.1, y=count + 0.3, s=count, size=10)

plt.xticks(np.arange(0, 2, 1))
plt.title('Count of default')
plt.show()

**Model to predict the default**
* Split data into training and validation set

In [None]:
# Remove the client identifier so it is not used as a model feature.
dataset = data.drop(columns=['clientid'])

# Every column except the last is a feature; the last column is the target
# (assumes the final column is the 'default' flag -- TODO confirm column order).
feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_columns.values
y = target_column.values

# Hold out 20% of the rows for validation, with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)

* Create a Random Forest Classifier model with number of estimators = 50. Although the number of estimators is high, it does not seem to have created an overfitting issue, as the out-of-sample accuracy is high.

In [None]:
# Random forest classifier: 50 trees, fixed seed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Despite its name, `regressor` holds a classifier.
regressor = RandomForestClassifier(n_estimators=50, random_state=0)
regressor.fit(X_train, y_train)

# Class predictions for the training and validation splits.
predict_train_rf = regressor.predict(X_train)
predict_val_rf = regressor.predict(X_val)

# Accuracy on each split, reported as a percentage.
train_accuracy_rf = accuracy_score(y_train, predict_train_rf)
val_accuracy_rf = accuracy_score(y_val, predict_val_rf)
print("Train Score : ", train_accuracy_rf * 100)
print("Val Score : ", val_accuracy_rf * 100)

* Create an ANN model

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and then applied unchanged
# to the validation split. From here on X_train / X_val hold the scaled values
# (the random forest earlier in the notebook was fitted on the unscaled arrays).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Take the input width from the data instead of hard-coding it, so the
# network still builds if the feature set changes.
n_features = X_train.shape[1]

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer (3 ReLU units)
classifier.add(Dense(units=3, kernel_initializer='uniform', activation='relu', input_dim=n_features))

# Adding the second hidden layer (4 ReLU units)
classifier.add(Dense(units=4, kernel_initializer='uniform', activation='relu'))

# Adding the output layer: a single sigmoid unit for the binary target
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=10, epochs=50)

In [None]:
from sklearn.metrics import accuracy_score

# Sigmoid outputs of the ANN for both splits, flattened to 1-D arrays.
predict_train_ann = classifier.predict(X_train).flatten()
predict_val_ann = classifier.predict(X_val).flatten()

# Keep the true labels next to the raw network outputs.
train_df = pd.DataFrame({'y_train': y_train, 'predict_train_ann': predict_train_ann})
val_df = pd.DataFrame({'y_val': y_val, 'predict_val_ann': predict_val_ann})

# Turn the outputs into class labels: 1 when the output is >= 0.5, else 0.
train_is_default = train_df['predict_train_ann'] >= 0.5
val_is_default = val_df['predict_val_ann'] >= 0.5
train_df['predict_train_binary_ann'] = train_is_default.astype('int64')
val_df['predict_val_binary_ann'] = val_is_default.astype('int64')

# Accuracy on each split, reported as a percentage.
train_accuracy_ann = accuracy_score(y_train, train_df['predict_train_binary_ann'])
val_accuracy_ann = accuracy_score(y_val, val_df['predict_val_binary_ann'])
print("Train Score : ", train_accuracy_ann * 100)
print("Val Score : ", val_accuracy_ann * 100)

The ANN model with two hidden layers provides a better prediction than the Random Forest Classifier. The Val Score is different every time it is generated but is consistently above 99%.

* Plot the 'Cumulative Accuracy Profile' (CAP) of the models

In [None]:
# Training set: Cumulative Accuracy Profile (CAP) of both models.
total = len(y_train)

# Number of '1' labels in the training data
one_count = np.sum(y_train)

# Number of '0' labels in the training data
zero_count = total - one_count

plt.figure(figsize=(10, 6))

# x-axis ranges from 0 to the total number of observations
# y-axis ranges from 0 to the total number of '1' labels (defaulters)

# Random model: positives are found at a constant rate.
plt.plot([0, total], [0, one_count], c='b',
         linestyle='--', label='Random Model')

# Perfect model: every positive is ranked ahead of every negative.
plt.plot([0, one_count, total], [0, one_count, one_count],
         c='grey', linewidth=2, label='Perfect Model')

# Model curves: rank observations by predicted score (highest first) and
# accumulate the true labels in that order. The ranking uses the score only,
# with a stable sort, so tied scores keep their original order and the true
# label never influences the ranking. Dedicated variable names are used so the
# notebook-level target vector `y` is not overwritten.
# NOTE(review): predict_train_rf holds class labels from `predict`, so the
# random-forest curve has only two score levels; predict_proba would give a
# finer ranking.
cap_labels = np.asarray(y_train)
for cap_scores, cap_name, cap_color in (
    (predict_train_ann, 'ANN', 'b'),
    (predict_train_rf, 'Random Forest', 'red'),
):
    cap_order = np.argsort(-np.asarray(cap_scores, dtype=float), kind='stable')
    cap_hits = np.append([0], np.cumsum(cap_labels[cap_order]))
    plt.plot(np.arange(0, total + 1), cap_hits,
             c=cap_color, label=cap_name, linewidth=2)

plt.legend()

In [None]:
# Validation set: Cumulative Accuracy Profile (CAP) of both models.
total = len(y_val)

# Number of '1' labels in the validation data
one_count = np.sum(y_val)

# Number of '0' labels in the validation data
zero_count = total - one_count

plt.figure(figsize=(10, 6))

# x-axis ranges from 0 to the total number of observations
# y-axis ranges from 0 to the total number of '1' labels (defaulters)

# Random model: positives are found at a constant rate.
plt.plot([0, total], [0, one_count], c='b',
         linestyle='--', label='Random Model')

# Perfect model: every positive is ranked ahead of every negative.
plt.plot([0, one_count, total], [0, one_count, one_count],
         c='grey', linewidth=2, label='Perfect Model')

# Model curves: rank observations by predicted score (highest first) and
# accumulate the true labels in that order. The ranking uses the score only,
# with a stable sort, so tied scores keep their original order and the true
# label never influences the ranking. Dedicated variable names are used so the
# notebook-level target vector `y` is not overwritten.
# NOTE(review): predict_val_rf holds class labels from `predict`, so the
# random-forest curve has only two score levels; predict_proba would give a
# finer ranking.
cap_labels = np.asarray(y_val)
for cap_scores, cap_name, cap_color in (
    (predict_val_ann, 'ANN', 'b'),
    (predict_val_rf, 'Random Forest', 'red'),
):
    cap_order = np.argsort(-np.asarray(cap_scores, dtype=float), kind='stable')
    cap_hits = np.append([0], np.cumsum(cap_labels[cap_order]))
    plt.plot(np.arange(0, total + 1), cap_hits,
             c=cap_color, label=cap_name, linewidth=2)

plt.legend()