# Question Two - Target Marketing 

##### Import Required Libraries

In [1]:
# Reading / Writing Files
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split

# Encoding
from sklearn.preprocessing import OneHotEncoder

# Models / Functions / Ensembles
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Accuracy Metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

##### Load in Data, Create Dataframes and Perform Basic Data Cleaning & Operations

In [2]:
bank_fraud_train_df = pd.read_csv('data/assignment_two/portugese_bank_train.csv').dropna(axis='columns', how='all')
bank_fraud_test_df = pd.read_csv('data/assignment_two/portugese_bank_test_one.csv').dropna(axis='columns', how='all')

In [3]:
print(f"Observations in Train DF: {len(bank_fraud_train_df)}.")
print(f"Observations in Test DF: {len(bank_fraud_test_df)}.")

# Again, the training dataset is much smaller than the test dataset.
# Going to go ahead and do my own splits to improve accuracy..

bank_fraud_train_df.head(1)

Observations in Train DF: 4521.
Observations in Test DF: 45211.


Unnamed: 0,age,balance,duration,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,30,1787,79,unemployed,married,primary,no,no,no,cellular,19,oct,1,-1,0,unknown,no


In [4]:
concat_df = pd.concat([bank_fraud_train_df,bank_fraud_test_df])
concat_df['y'] = concat_df['y'].str.replace('no','0').replace('yes','1')

# Clearly based on the information above, "One-Hot-Encoding" Will Be Necessary..
# Pandas has gotten significantly better at encoding (since Python 3+)
# Thus, won't even need to use Sklearn's OneHotEncoding function, just use pd.get_dummies
# See below

encoding_columns = concat_df.keys().tolist()

In [5]:
# Train / Test Split using Sklearn's function.
X = concat_df.drop('y',axis=1)
X_encoded = pd.get_dummies(X, columns=X.keys().tolist())
y = concat_df[['y']]

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, train_size=0.80, random_state=42)

## Create a Model

#### Sklearn Basic Decision Tree Model

In [6]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
X_test_prediction = decision_tree_model.predict(X_test)

print(f"Accuracy Score for Basic Decision Tree: {round(decision_tree_model.score(X_test,y_test),6)}")

print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,X_test_prediction))

Accuracy Score for Basic Decision Tree: 0.889012
Confusion Matrix for Decision Tree:
[[8342  432]
 [ 672  501]]


#####  Hyperparameter Tuning an Sklearn Decision Tree Model

In [7]:
parameters = {
    'min_samples_split' : [i*50 for i in range(1,25)],
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes' : [i*2 for i in range(1,10)]
}

In [8]:
# Grid Search
decision_tree_model_grid = GridSearchCV(decision_tree_model,parameters,verbose=True)
decision_tree_model_grid.fit(X_train, y_train)

grid_parameter_soln = decision_tree_model_grid.best_params_
print(f"Grid Search Optimal Parameters: {grid_parameter_soln}")

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Random Search
decision_tree_model_random = RandomizedSearchCV(decision_tree_model,parameters,n_iter=100,cv=5,verbose=True)
decision_tree_model_random.fit(X_train, y_train)
random_parameter_soln = decision_tree_model_random.best_params_
print(f"Grid Search Optimal Parameters: {random_parameter_soln}")

##### Using Models with Hyperparameter Solutions

In [None]:
# Grid
hyperparameter_grid = DecisionTreeClassifier(**grid_parameter_soln)

hyperparameter_grid.fit(X_train,y_train)
hyperparameter_grid_results = hyperparameter_grid.predict(X_test)

In [None]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(hyperparameter_grid.score(X_test,y_test),6)}")

In [None]:
# Random
hyperparameter_random = DecisionTreeClassifier(**random_parameter_soln)

hyperparameter_random.fit(X_train,y_train)
hyperparameter_random_results = hyperparameter_random.predict(X_test)

In [None]:
print(f"Accuracy Score After Hypertuning Random Search for Decision Tree: {round(hyperparameter_random.score(X_test,y_test),6)}")

Hyperparameter tuning with Grid Search produced marginally better results than random search results.
This is to be expected, as Grid Search will produce more accurate results at the expense of additional resources (time). Let's see the results of Hyperparameter Tuning a Decision Tree using Grid Search.

In [None]:
print("Confusion Matrix After Hypertuning for Decision Tree")
print(confusion_matrix(y_test,hyperparameter_grid_results))

In [None]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,hyperparameter_grid_results))

##### Cross Validation for Hyperparameterized Grid Search

In [None]:
classifier_cross_validation_score = cross_val_score(hyperparameter_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {classifier_cross_validation_score}")

##### Sklearn Basic Random Forest Model

In [None]:
random_parameters = {
    'min_samples_leaf' : [i*2 for i in range(1,5)],
    'max_depth': [i*5 for i in range(1,5)],
    'max_features':[i*10 for i in range(2,7)],
    'n_estimators':[i*2 for i in range(1,5)]
}

In [None]:
random_forest = RandomForestClassifier()

In [None]:
random_forest.fit(X_train, y_train)
random_forest_soln = random_forest.predict(X_test)

In [None]:
print(f"Accuracy Score for Default Random Forest: {round(random_forest.score(X_test,y_test),6)}")

In [None]:
print("Confusion Matrix for Default Random Forest:")
print(confusion_matrix(y_test,random_forest_soln))

##### Random Forest Grid Search with Cross-Validation

In [None]:
random_forest_grid = GridSearchCV(random_forest,random_parameters,verbose=True)

random_forest_grid.fit(X_train,y_train)
random_forest_grid_soln = random_forest_grid.predict(X_test)

print(random_forest_grid.best_params_)

In [None]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(random_forest_grid.score(X_test,y_test),6)}")

##### Random Forest Random Search

In [None]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=10,cv=5)

In [None]:
random_forest_random.fit(X_train, y_train)
random_forest_random_soln = random_forest_random.best_params_

In [None]:
print(random_forest_random_soln)
print(f"Accuracy Score for Random Forest with Random Search: {round(random_forest_random.score(X_test,y_test),6)}")

In [None]:
print("Confusion Matrix After Hypertuning for Decision Tree")
print(confusion_matrix(y_test,hyperparameter_grid_results))

Hyperparameter tuning with Grid Search is once again slightly more accurate than random search. 

In [None]:
best_forest_grid = RandomForestClassifier(**random_forest_grid.best_params_)

best_forest_grid.fit(X_train, y_train)
best_forest_grid_soln = best_forest_grid.predict(X_test)

In [None]:
# Accuracy Check

In [None]:
print("Confusion Matrix Using Best Parameters")
print(confusion_matrix(y_test,best_forest_grid_soln))

In [None]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,best_forest_grid_soln))

##### Cross-Validation Using Best Parameters from Grid Search

In [None]:
cross_validation_score = cross_val_score(best_forest_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {cross_validation_score}")

##### Calculating F1 Score