# Instructor Do: Decision Trees

In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

In [2]:
# Read the encoded loans dataset from the Resources folder and preview the first rows
file_path = Path("../Resources/loans_data_encoded.csv")
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [3]:
# Feature matrix: every column of the loans frame except the "bad" target label.
# drop() returns a new frame, so df_loans itself is left untouched.
X = df_loans.drop(columns=["bad"])
X.head(n=5)

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [4]:
# Target: the "bad" column as a NumPy array, reshaped into a single-column
# 2-D array of shape (n_samples, 1).
y = df_loans["bad"].to_numpy().reshape(-1, 1)

# Preview the first five target values
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [5]:
# Partition features and target into train/test subsets.
# The fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [6]:
# Report the dimensions of each piece of the split, in the order
# X_train, X_test, y_train, y_test
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(375, 10)
(125, 10)
(375, 1)
(125, 1)


In [7]:
# Same seed as the split above, but with an explicit 80% training fraction
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [8]:
# Report the dimensions of the 80/20 split, in the order
# X_train2, X_test2, y_train2, y_test2
for split_part in (X_train2, X_test2, y_train2, y_test2):
    print(split_part.shape)

(400, 10)
(100, 10)
(400, 1)
(100, 1)


In [9]:
# Creating StandardScaler instance
# (not fitted yet; it is fitted on the training features in the next step)
scaler = StandardScaler()

In [10]:
# Fitting the StandardScaler on the training features only (the test set is not used here)
X_scaler = scaler.fit(X_train)

In [11]:
# Apply the fitted scaler to both splits: training features first, then test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


## Fitting the Decision Tree Model

In [12]:
# Creating the decision tree classifier instance.
# A fixed random_state (the same seed used for the train/test split) makes the
# fitted tree, and therefore the evaluation metrics, repeatable between runs.
# Without it, two fits on the identical split can produce different trees.
model = tree.DecisionTreeClassifier(random_state=78)

In [13]:
# Fitting the model
# The tree is trained on the scaled training features and the training labels;
# the result of fit() is assigned back to `model`, which is used for prediction below.
model = model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

In [14]:
# Making predictions using the testing data
# The test features are passed in scaled form, matching what the model was fitted on.
predictions = model.predict(X_test_scaled)

## Model Evaluation

In [15]:
# Confusion matrix wrapped in a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

# Accuracy of the predictions against the true test labels
acc_score = accuracy_score(y_test, predictions)

In [16]:
# Show the evaluation summary: confusion matrix, accuracy, then the per-class report
report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,53,31
Actual 1,22,19


Accuracy Score : 0.576
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.63      0.67        84
           1       0.38      0.46      0.42        41

    accuracy                           0.58       125
   macro avg       0.54      0.55      0.54       125
weighted avg       0.60      0.58      0.58       125



In [17]:
# Now let's try to do this ourselves again

# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [18]:
# Re-read the encoded loans dataset and preview the first rows
file_path = Path("../Resources/loans_data_encoded.csv")
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [19]:
# Feature set: all columns of the loans frame except the "bad" target column.
# drop() returns a new frame, so df_loans is not modified.
X = df_loans.drop(columns=["bad"])
X.head(n=5)


Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [20]:
# Target set: the "bad" column as a flat (1-D) NumPy array
y = df_loans["bad"].to_numpy()

# Peek at the first five labels
# (author's note on the result: 5 good, loan-worthy applications)
y[:5]

array([0, 0, 0, 0, 0])

In [21]:
# Split our data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [22]:
# With no size argument given, the split above puts 75% of the rows in the
# training set and 25% in the testing set. The shapes printed here confirm it.
# Order: X_train, X_test, y_train, y_test
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(375, 10)
(125, 10)
(375,)
(125,)


In [23]:
# Split into train and test sets again, this time as an explicit 80/20 split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [24]:
# Shapes of the 80/20 split, in the order X_train2, X_test2, y_train2, y_test2
for split_part in (X_train2, X_test2, y_train2, y_test2):
    print(split_part.shape)

(400, 10)
(100, 10)
(400,)
(100,)


In [25]:
# Standardize the features: fit the scaler on the training split only,
# then apply that same fitted scaler to both the training and testing splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [26]:
import numpy as np

# Sanity-check the scaling on the first feature column: print the mean of the
# training then testing column, followed by the standard deviation of each.
first_columns = (X_train_scaled[:, 0], X_test_scaled[:, 0])
for statistic in (np.mean, np.std):
    for column in first_columns:
        print(statistic(column))

3.931669804539221e-16
0.08040483006321758
1.0
0.8450480061575104


In [27]:
# Creating the decision tree classifier instance, to be trained on the scaled data.
# A fixed random_state (the same seed used for the train/test split) makes the
# fitted tree, and therefore the evaluation metrics, repeatable between runs.
# Without it, two fits on the identical split can produce different trees.
model = tree.DecisionTreeClassifier(random_state=78)
# Fitting the model on the scaled training features and the training labels.
model = model.fit(X_train_scaled, y_train)

In [28]:
# Making predictions using the testing data.
# The test features are passed in scaled form, matching what the model was fitted on.
predictions = model.predict(X_test_scaled)

In [29]:
# Inspect the predicted class labels for the test set
predictions

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0])

In [30]:
# Calculating the confusion matrix from the true and predicted test labels
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix:
# rows are the actual classes, columns are the predicted classes
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [31]:
# Display the labelled confusion matrix
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,23,18


In [32]:
# Calculating the accuracy score to see how accurate the model is on the test set.
acc_score = accuracy_score(y_test, predictions)

In [33]:
# Display the accuracy score
acc_score

0.552

In [34]:
# Another way to compute acc_score: (True Positives (TP) + True Negatives (TN)) / Total = (18 + 51) / 125 = 0.552

In [35]:
# Show the evaluation summary: confusion matrix, accuracy, then the per-class report
report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,23,18


Accuracy Score : 0.552
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.61      0.65        84
           1       0.35      0.44      0.39        41

    accuracy                           0.55       125
   macro avg       0.52      0.52      0.52       125
weighted avg       0.58      0.55      0.56       125



In [36]:
########

In [37]:
### 17.8: Let's do an ensemble Random Forest

In [38]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [39]:
# Read the encoded loans dataset for the random forest section and preview the first rows
file_path = Path("../Resources/loans_data_encoded.csv")
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [55]:
### Pre-Processing
# Feature set: drop the "bad" target together with both gender indicator columns.
# drop() returns a new frame, so df_loans is not modified.
X = df_loans.drop(columns=["bad", "gender_male", "gender_female"])
X.head(n=5)

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college
0,1000,30,45,6,0,1,0,0
1,1000,30,50,7,1,0,0,0
2,1000,30,33,8,1,0,0,0
3,1000,15,27,9,0,0,0,1
4,1000,30,28,10,0,0,0,1


In [56]:
# Define the target set: the "bad" column as a 1-D NumPy array.
# Series.to_numpy() replaces Series.ravel(), which is deprecated in recent
# pandas releases and emits a FutureWarning. A Series is already 1-D, so the
# resulting array is the same.
y = df_loans["bad"].to_numpy()
# Preview the first five target values
y[:5]

array([0, 0, 0, 0, 0])

In [57]:
# Split into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [58]:
# Standardize the features: create the scaler, fit it on the training data only,
# then transform the training and testing features with that fitted scaler.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


In [59]:
# Create a random forest classifier with 500 estimators (trees) and a fixed
# random_state so the fitted forest is repeatable between runs.
rf_model= RandomForestClassifier(n_estimators=500, random_state=78)

In [60]:
# Fit the model on the scaled training features and the training labels;
# the result of fit() is assigned back to `rf_model`, which is used for prediction below.
rf_model=rf_model.fit(X_train_scaled, y_train)

In [61]:
# Make the predictions with the scaled test data (the same scaling the model was fitted on)
predictions=rf_model.predict(X_test_scaled)

In [62]:
# Inspect the random forest's predicted class labels for the test set
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1])

In [63]:
# Confusion matrix for the random forest predictions, wrapped in a labelled
# DataFrame: rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,54,30
Actual 1,26,15


In [64]:
# Calculating the accuracy score of the random forest predictions on the test set.
acc_score = accuracy_score(y_test, predictions)
# Display the score
acc_score

0.552

In [65]:
# Show the random forest evaluation: confusion matrix, accuracy, then the per-class report
report = classification_report(y_test, predictions)

print("Confusion MTRX")
display(cm_df)

print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(report)

Confusion MTRX


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,54,30
Actual 1,26,15


Accuracy Score: 0.552
Classification Report
              precision    recall  f1-score   support

           0       0.68      0.64      0.66        84
           1       0.33      0.37      0.35        41

    accuracy                           0.55       125
   macro avg       0.50      0.50      0.50       125
weighted avg       0.56      0.55      0.56       125



In [66]:
### One byproduct of a Random Forest is that we can rank the features to see which had the most impact on the model
# Calculate the importance of each feature: the fitted model exposes one
# importance value per feature column, in the same order as the columns of X.

importances = rf_model.feature_importances_
importances

array([0.05446627, 0.0830374 , 0.49170228, 0.31730637, 0.01659394,
       0.01700045, 0.00322487, 0.01666841])

In [67]:
# Pair each importance value with its column name, then list the pairs from
# most to least important
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.4917022849543478, 'age'),
 (0.31730637064517403, 'month_num'),
 (0.08303739718250097, 'term'),
 (0.054466271077120494, 'amount'),
 (0.017000451821704066, 'education_High School or Below'),
 (0.01666841178404515, 'education_college'),
 (0.016593940752497937, 'education_Bachelor'),
 (0.003224871782609585, 'education_Master or Above')]

In [69]:
# Note: the code above has now been run with the gender columns dropped ^^^^