In [1]:
# Load the encoded loans dataset and preview its first rows.
import pandas as pd

loans_csv_path = 'Resources/loans_data_encoded.csv'
df = pd.read_csv(loans_csv_path)
df.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [2]:
# Build the features and target, hold out a test split, and standardize the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The target is the 'bad' column; the features are every other column.
y = df['bad'].values
X = df.drop(columns='bad')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [3]:
# Sweep several learning rates and report train/test accuracy for each one.
# NOTE(review): the rate used further down is picked from the TESTING accuracy
# printed here, so the final test score is an optimistic estimate; a validation
# split or cross-validation on the training data would avoid that.
from sklearn.ensemble import GradientBoostingClassifier

# Candidate learning rates to compare.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# Settings shared by every classifier in the sweep; only the learning rate varies.
shared_params = dict(n_estimators=20, max_features=5, max_depth=3, random_state=0)

for learning_rate in learning_rates:
    # Train a fresh classifier for this rate.
    classifier = GradientBoostingClassifier(learning_rate=learning_rate, **shared_params)
    classifier.fit(X_train_scaled, y_train)

    # Score it on both splits, then print the results.
    train_accuracy = classifier.score(X_train_scaled, y_train)
    test_accuracy = classifier.score(X_test_scaled, y_test)

    print("Learning Rate: ", learning_rate)
    print("Accuracy Score TRAINING: {0:.3f}".format(train_accuracy))
    print("Accuracy Score TESTING: {0:.3f}".format(test_accuracy))

Learning Rate:  0.05
Accuracy Score TRAINING: 0.629
Accuracy Score TESTING: 0.512
Learning Rate:  0.1
Accuracy Score TRAINING: 0.656
Accuracy Score TESTING: 0.520
Learning Rate:  0.25
Accuracy Score TRAINING: 0.723
Accuracy Score TESTING: 0.536
Learning Rate:  0.5
Accuracy Score TRAINING: 0.755
Accuracy Score TESTING: 0.560
Learning Rate:  0.75
Accuracy Score TRAINING: 0.781
Accuracy Score TESTING: 0.520
Learning Rate:  1
Accuracy Score TRAINING: 0.797
Accuracy Score TESTING: 0.472


In [4]:
# Rebuild the classifier with the chosen learning rate (0.5), train it,
# and predict on the test split.
chosen_learning_rate = 0.5
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=chosen_learning_rate,
    max_features=5,
    max_depth=3,
    random_state=0,
)

# Train on the scaled training split.
classifier.fit(X_train_scaled, y_train)

# Predict labels for the scaled test split.
predictions = classifier.predict(X_test_scaled)

In [5]:
# Assess the model's performance.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy on the test split; this returns the same value as the
# classifier.score value printed in the for loop.
accuracy_score(y_true=y_test, y_pred=predictions)

0.56

In [6]:
# Tabulate the confusion matrix with labelled rows (actual) and columns (predicted).
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,49,16
Actual 1,39,21


In [7]:
# Print the classification report for the test predictions under a heading.
report = classification_report(y_test, predictions)
print("Classification Report", report, sep="\n")

Classification Report
              precision    recall  f1-score   support

           0       0.56      0.75      0.64        65
           1       0.57      0.35      0.43        60

    accuracy                           0.56       125
   macro avg       0.56      0.55      0.54       125
weighted avg       0.56      0.56      0.54       125

