In [1]:
import os

import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

ModuleNotFoundError: No module named 'pydotplus'

### Preprocessing

In [None]:
# Load data
# The CSV location can be overridden with the LOANS_DATA_PATH environment
# variable so the notebook is runnable on machines other than the one it was
# written on. When the variable is not set, the original absolute path below
# is used unchanged.
DEFAULT_LOANS_CSV = r"C:\Users\TribThapa\Desktop\Thapa\ResearchFellow\Courses\FinTech_Bootcamp_MonashUni2021\monu-mel-virt-fin-pt-05-2021-u-c\Activities\Week 11\2\07-Ins_Gradient_Boosted_Tree\Resources\loans_data_encoded.csv"
file_path = Path(os.environ.get("LOANS_DATA_PATH", DEFAULT_LOANS_CSV))
loans_df = pd.read_csv(file_path)

# Preview the first rows to confirm the file was parsed
loans_df.head()

In [None]:
# Define features set
# Every column except the "bad" target. drop() returns a new DataFrame, so
# loans_df is left untouched and no in-place mutation is needed.
X = loans_df.drop(columns="bad")
X.head()

In [None]:
# Define target vector
# Pull the "bad" column out as an array and shape it as a single column
target_values = loans_df["bad"].values
y = target_values.reshape(-1, 1)

# Peek at the first five entries
y[:5]

In [None]:
# Splitting into Train and Test sets (random_state pins the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating the StandardScaler instance and fitting it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scaling data: both splits go through the scaler fitted above
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

### Choose best learning rate

In [None]:
# Iterate over learning rate to identify the best classifier learning rate
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# The labels are held as a column vector; flatten them once for fit/score
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

for learning_rate in learning_rates:
    # Identical settings on every pass; only the learning rate varies
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )
    classifier.fit(X_train_scaled, y_train_flat)

    # Score the model on both splits
    train_accuracy = classifier.score(X_train_scaled, y_train_flat)
    test_accuracy = classifier.score(X_test_scaled, y_test_flat)

    # Report the results for this learning rate
    print(f"Learning rate: {learning_rate}")
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (testing): {0:.3f}".format(test_accuracy))
    print()

### Create Gradient Boosting Classifier

In [None]:
# Choose a learning rate and create the classifier
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=5,
    max_depth=3,
    random_state=0,
)

# Fit the model on the scaled training data (labels flattened to 1-D)
classifier.fit(X_train_scaled, y_train.ravel())

# Make Prediction on the scaled test data
prediction = classifier.predict(X_test_scaled)

# Show each prediction next to the corresponding true label
comparison = {"Predicted": prediction, "Actual": y_test.ravel()}
pd.DataFrame(comparison)

### Evaluate the model

In [None]:
# Calculating the accuracy score of the test-set predictions
acc = accuracy_score(y_true=y_test, y_pred=prediction)
print(f"Accuracy score: {acc}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_true=y_test, y_pred=prediction)

# Wrap it in a labelled DataFrame: rows are actual classes, columns are predicted
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

In [None]:
# Generate classification report and print it as plain text
report_text = classification_report(y_true=y_test, y_pred=prediction)
print(report_text)

### Visualize the decision tree

In [None]:
# Graph tree
# Pick one of the fitted trees out of the ensemble: row 9, column 0 of
# classifier.estimators_.
selected_tree = classifier.estimators_[9, 0]

# Pass the feature column names so split nodes are labelled with real column
# names instead of anonymous feature indices. The classifier was fitted on
# X_train_scaled, so the thresholds shown in the graph are in standardized
# units, not the raw units of the CSV columns.
dot_data = tree.export_graphviz(selected_tree,
                                out_file=None,
                                feature_names=list(X.columns),
                                filled=True,
                                rounded=True,
                                special_characters=True,
                                proportion=True)

# Convert the DOT source into a graph object
graph = pydotplus.graph_from_dot_data(dot_data)

# Render the graph as a PNG and display it inline
Image(graph.create_png())