In [1]:
# Imports.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in data.
# Loads the encoded loans CSV from the relative Resources/ folder and previews
# the first rows so the column layout is visible before modelling.
df = pd.read_csv('Resources/loans_data_encoded.csv')
df.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [3]:
# Separate the model inputs from the label.
# `bad` is the target column; every other column stays in the feature frame.
X = df.drop(columns=['bad'])
y = df['bad']
X.head()

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [4]:
# Set up target variable to contain only the values of the column.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same 1-D array of labels. Reading from df instead of rebinding y from
# itself keeps this cell safe to run more than once.
y = df['bad'].to_numpy()
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [5]:
# Split the data into training and testing sets.
# No test_size is passed, so scikit-learn's default hold-out fraction applies;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [6]:
# Report the dimensions of each split, in the order:
# training features, testing features, training labels, testing labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(375, 10)
(125, 10)
(375,)
(125,)


In [7]:
# Standardize the features.
# The scaler is fit on the training split only and then applied to both
# splits, so no statistics from the test rows feed into the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [8]:
# Check mean and standard deviation of first column.
# numpy is imported with the other dependencies in the first cell rather than
# here, so every import lives in one place.
# Because the scaler was fit on the training split, the training column should
# show a mean near 0 and a std of 1; the test column only approximates that.
first_train_col = X_train_scaled[:, 0]
first_test_col = X_test_scaled[:, 0]
print(np.mean(first_train_col))
print(np.mean(first_test_col))
print(np.std(first_train_col))
print(np.std(first_test_col))

3.931669804539221e-16
0.08040483006321758
1.0
0.8450480061575104


In [9]:
# Instantiate the RFC.
# Uses 500 trees; random_state fixes the forest's randomness so reruns match.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [10]:
# Fit the data.
# Trains the forest on the scaled training features and their labels.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Make predictions with the test data.
# Predicts a class label for every row of the scaled test features and shows
# the resulting array.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [12]:
# Begin evaluation with the confusion matrix: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_test, predictions)

# Wrap the raw matrix in a labelled DataFrame so it is readable when displayed.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,50,34
Actual 1,26,15


In [13]:
# Calculate accuracy.
# Fraction of test rows whose predicted label equals the true label.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.52

In [14]:
# Show all evaluation results together: the labelled confusion matrix, the
# accuracy score, and the per-class precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,50,34
Actual 1,26,15


Accuracy Score : 0.52
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.60      0.62        84
           1       0.31      0.37      0.33        41

    accuracy                           0.52       125
   macro avg       0.48      0.48      0.48       125
weighted avg       0.54      0.52      0.53       125



In [15]:
# Calculate feature importance in the RF model.
# feature_importances_ holds one score per input column, in the same order as
# the columns the model was fit on.
importances = rf_model.feature_importances_
importances

array([0.05479564, 0.083734  , 0.42871538, 0.32290918, 0.01986699,
       0.02382607, 0.0025937 , 0.0238055 , 0.01947929, 0.02027425])

In [16]:
# Rank the features from most to least important.
# Each pair is (importance, column name), so sorting orders by importance first.
importance_pairs = list(zip(rf_model.feature_importances_, X.columns))
sorted(importance_pairs, reverse=True)

[(0.42871537959784756, 'age'),
 (0.322909180511852, 'month_num'),
 (0.08373400008559587, 'term'),
 (0.054795639837519644, 'amount'),
 (0.023826066130178958, 'education_High School or Below'),
 (0.023805501611235368, 'education_college'),
 (0.020274246776222464, 'gender_male'),
 (0.01986699331709372, 'education_Bachelor'),
 (0.019479294982300867, 'gender_female'),
 (0.0025936971501533874, 'education_Master or Above')]

## Run RF model with bottom 6 features removed.

In [17]:
# Build a reduced feature frame by removing the six lowest-ranked features
# (each with importance below 0.05) so the RF model can be rerun on the rest.
# drop() returns a new frame, so X itself is left untouched.
low_importance_cols = [
    'education_High School or Below',
    'education_college',
    'gender_male',
    'education_Bachelor',
    'gender_female',
    'education_Master or Above',
]
X2 = X.drop(columns=low_importance_cols)
X2.head()

Unnamed: 0,amount,term,age,month_num
0,1000,30,45,6
1,1000,30,50,7
2,1000,30,33,8
3,1000,15,27,9
4,1000,30,28,10


In [18]:
# Split the data into training and testing sets.
X2_train, X2_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [19]:
# Create scaler instance.
scaler = StandardScaler()

# Fit the traing data to the scaler.
X2_scaler = scaler.fit(X2_train)

# Scale the data.
X2_train_scaled = X2_scaler.transform(X_train)
X2_test_scaled = X2_scaler.transform(X_test)

In [20]:
# Instantiate the RFC.
# Same hyperparameters as the first forest. Rebinding rf_model replaces the
# earlier fitted model, which is no longer reachable under this name.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [21]:
# Fit the data.
# Trains the new forest on X2_train_scaled and the training labels.
rf_model = rf_model.fit(X2_train_scaled, y_train)

In [22]:
# Make predictions with the test data.
# Predicts labels for X2_test_scaled; this overwrites the predictions array
# produced by the first model.
predictions = rf_model.predict(X2_test_scaled)
predictions

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [23]:
# Review classification report.
# Prints per-class precision, recall and F1 for the current predictions against
# the test labels.
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.66      0.60      0.62        84
           1       0.31      0.37      0.33        41

    accuracy                           0.52       125
   macro avg       0.48      0.48      0.48       125
weighted avg       0.54      0.52      0.53       125

