# Question Two - Bank 

##### Import Required Libraries

In [1]:
# Reading / Writing Files
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split

# Encoding
from sklearn.preprocessing import OneHotEncoder

# Models / Functions / Ensembles
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Accuracy Metrics
from sklearn.metrics import classification_report, confusion_matrix

# Miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

##### Load in Data, Create Dataframes and Perform Basic Data Cleaning & Operations

In [2]:
bank_fraud_train_df = pd.read_csv('data/assignment_two/portugese_bank_train.csv').dropna(axis='columns', how='all')
bank_fraud_test_df = pd.read_csv('data/assignment_two/portugese_bank_test_one.csv').dropna(axis='columns', how='all')

In [3]:
print(f"Observations in Train DF: {len(bank_fraud_train_df)}.")
print(f"Observations in Test DF: {len(bank_fraud_test_df)}.")

# Again, the training dataset is much smaller than the test dataset.
# Going to go ahead and do my own splits to improve accuracy..

bank_fraud_train_df.head(1)

Observations in Train DF: 4521.
Observations in Test DF: 45211.


Unnamed: 0,age,balance,duration,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,30,1787,79,unemployed,married,primary,no,no,no,cellular,19,oct,1,-1,0,unknown,no


In [4]:
concat_df = pd.concat([bank_fraud_train_df,bank_fraud_test_df])
concat_df['y'] = concat_df['y'].str.replace('no','0').replace('yes','1')

# Clearly based on the information above, "One-Hot-Encoding" Will Be Necessary..
# Pandas has gotten significantly better at encoding (since Python 3+)
# Thus, won't even need to use Sklearn's OneHotEncoding function, just use pd.get_dummies
# See below

encoding_columns = concat_df.keys().tolist()

In [5]:
# Train / Test Split using Sklearn's function.
X = concat_df.drop('y',axis=1)

X_non_encoded = X.iloc[:,:3]
X_encoded = pd.get_dummies(X.iloc[:, 3:], columns=X.iloc[:, 3:].keys().tolist())

X_final = pd.concat([X_non_encoded,X_encoded], axis=1)

y = concat_df[['y']]

X_train, X_test, y_train, y_test = train_test_split(X_final, y, train_size=0.80, random_state=42)

## Create a Model

### Sklearn Basic Decision Tree Model

In [6]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
X_test_prediction = decision_tree_model.predict(X_test)

print(f"Accuracy Score for Basic Decision Tree: {round(decision_tree_model.score(X_test,y_test),6)}")

print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,X_test_prediction))

Accuracy Score for Basic Decision Tree: 0.893536
Confusion Matrix for Decision Tree:
[[8224  550]
 [ 509  664]]


#####  Hyperparameter Tuning an Sklearn Decision Tree Model

In [7]:
parameters = {
    'min_samples_split' : [i*5 for i in range(10,20)],
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes' : [i*5 for i in range(2,6)]
}

##### Grid Search

In [8]:
decision_tree_model_grid = GridSearchCV(decision_tree_model,parameters,n_jobs=-1,verbose=True)
decision_tree_model_grid.fit(X_train, y_train)

grid_parameter_soln = decision_tree_model_grid.best_params_
print(f"Grid Search Optimal Parameters: {grid_parameter_soln}")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.8min finished


Grid Search Optimal Parameters: {'criterion': 'gini', 'max_leaf_nodes': 25, 'min_samples_split': 85}


##### Random Search

In [9]:
decision_tree_model_random = RandomizedSearchCV(decision_tree_model,parameters,n_iter=25,cv=5,n_jobs=-1,verbose=True)
decision_tree_model_random.fit(X_train, y_train)
random_parameter_soln = decision_tree_model_random.best_params_
print(f"Random Search Optimal Parameters: {random_parameter_soln}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   32.7s finished


Random Search Optimal Parameters: {'min_samples_split': 95, 'max_leaf_nodes': 25, 'criterion': 'gini'}


##### Creating Models with Hyperparameter Solutions

##### Grid

In [10]:
hyperparameter_grid = DecisionTreeClassifier(**grid_parameter_soln)

hyperparameter_grid.fit(X_train,y_train)
hyperparameter_grid_results = hyperparameter_grid.predict(X_test)

In [11]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(hyperparameter_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.902282


##### Random

In [12]:
hyperparameter_random = DecisionTreeClassifier(**random_parameter_soln)

hyperparameter_random.fit(X_train,y_train)
hyperparameter_random_results = hyperparameter_random.predict(X_test)

In [13]:
print(f"Accuracy Score After Hypertuning Random Search for Decision Tree: {round(hyperparameter_random.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Random Search for Decision Tree: 0.902282


Hyperparameter tuning with Grid Search produced marginally better results than random search results.
This is to be expected, as Grid Search will produce more accurate results at the expense of additional resources (time). Let's see the results of Hyperparameter Tuning a Decision Tree using Grid Search.

##### Using Optimal Parameters

In [14]:
print("=== Confusion Matrix After Hypertuning for Decision Tree ===")
print(confusion_matrix(y_test,hyperparameter_grid_results))

=== Confusion Matrix After Hypertuning for Decision Tree ===
[[8502  272]
 [ 700  473]]


In [15]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,hyperparameter_grid_results))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      8774
           1       0.63      0.40      0.49      1173

    accuracy                           0.90      9947
   macro avg       0.78      0.69      0.72      9947
weighted avg       0.89      0.90      0.89      9947



##### Cross Validation for Hyperparameterized Grid Search

In [16]:
classifier_cross_validation_score = cross_val_score(hyperparameter_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {classifier_cross_validation_score}")

Cross Validation for Grid Search:
 [0.69513911 0.67161474 0.68525444 0.69509705 0.67916686 0.68165619
 0.65370761 0.70628948 0.68987748 0.67450175 0.71045519 0.68513514
 0.67980085 0.6715505  0.69751067 0.71358464 0.64992888 0.6814367
 0.68136558 0.67916074 0.66522048 0.67759602 0.6901138  0.71232159
 0.68255462]


### Random Forest Modeling

##### Sklearn Basic Random Forest Model

In [17]:
random_forest = RandomForestClassifier()

In [18]:
random_forest.fit(X_train, y_train)
random_forest_soln = random_forest.predict(X_test)

In [19]:
print(f"Accuracy Score for Default Random Forest: {round(random_forest.score(X_test,y_test),6)}")

Accuracy Score for Default Random Forest: 0.913441


In [20]:
print("=== Confusion Matrix for Default Random Forest ===")
print(confusion_matrix(y_test,random_forest_soln))

=== Confusion Matrix for Default Random Forest ===
[[8664  110]
 [ 751  422]]


##### Random Forest Grid Search

In [21]:
random_parameters = {
    'min_samples_leaf' : [i for i in range(1,2)],
    'max_depth': [i*5 for i in range(3,5)],
    'max_features':[i*5 for i in range(3,5)],
    'n_estimators':[i for i in range(2,6)]
}

In [22]:
random_forest_grid = GridSearchCV(random_forest,random_parameters,n_jobs=-1,verbose=True)

random_forest_grid.fit(X_train,y_train)
random_forest_grid_soln = random_forest_grid.predict(X_test)

print(random_forest_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   11.0s finished


{'max_depth': 20, 'max_features': 20, 'min_samples_leaf': 1, 'n_estimators': 5}


In [23]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(random_forest_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.896451


##### Random Forest Random Search

In [24]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=25,cv=5,n_jobs=-1,verbose=True)

In [25]:
random_forest_random.fit(X_train, y_train)
random_forest_random_soln = random_forest_random.best_params_

print(random_forest_random_soln)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   11.0s finished


{'n_estimators': 5, 'min_samples_leaf': 1, 'max_features': 20, 'max_depth': 20}


In [26]:
print(f"Accuracy Score for Random Forest with Random Search: {round(random_forest_random.score(X_test,y_test),6)}")

Accuracy Score for Random Forest with Random Search: 0.892832


In [27]:
print("=== Confusion Matrix After Hypertuning for Decision Tree ===")
print(confusion_matrix(y_test,hyperparameter_grid_results))

=== Confusion Matrix After Hypertuning for Decision Tree ===
[[8502  272]
 [ 700  473]]


Hyperparameter tuning with Grid Search is once again slightly more accurate than random search. 

##### Using the Best Parameters from GridSearchCV

In [28]:
best_forest_grid = RandomForestClassifier(**random_forest_grid.best_params_)

best_forest_grid.fit(X_train, y_train)
best_forest_grid_soln = best_forest_grid.predict(X_test)

In [29]:
print(f"Accuracy Score for Random Forest with Optimal Grid Search Parameters: {round(best_forest_grid.score(X_test,y_test),6)}")

Accuracy Score for Random Forest with Optimal Grid Search Parameters: 0.892832


In [30]:
print("=== Confusion Matrix Using Best Parameters ===")
print(confusion_matrix(y_test,best_forest_grid_soln))

=== Confusion Matrix Using Best Parameters ===
[[8674  100]
 [ 966  207]]


In [31]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,best_forest_grid_soln))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      8774
           1       0.67      0.18      0.28      1173

    accuracy                           0.89      9947
   macro avg       0.79      0.58      0.61      9947
weighted avg       0.87      0.89      0.86      9947



##### Cross-Validation Using Best Parameters from Grid Search

In [32]:
cross_validation_score = cross_val_score(best_forest_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {cross_validation_score}")

Cross Validation for Grid Search:
 [0.54447529 0.55218419 0.58175408 0.58139846 0.57871029 0.57495526
 0.55862739 0.57744459 0.61796601 0.53514508 0.58655761 0.55213371
 0.56671408 0.57681366 0.59466572 0.5883357  0.56493599 0.55789474
 0.57375533 0.60981508 0.543101   0.5784495  0.59665718 0.58210691
 0.58084223]
