# Assignment Three - Homesite Quote Conversion

### Import Required Libraries

In [1]:
# Reading / Writing Files & Encoding
import pandas as pd
import numpy as np

# Pre-Processing
from sklearn.model_selection import train_test_split

# Sampling
from imblearn.over_sampling import SMOTE

# Ensembles & Functions
from vecstack import stacking
from sklearn.svm import LinearSVC as svc
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Miscellaneous
from collections import Counter #for Smote, 
import warnings
warnings.filterwarnings("ignore")

### Load in Data, Create Dataframes and Perform Basic Cleaning & Operations

In [2]:
train_dataframe = pd.read_csv("data/assignment_three/revised_train.csv")
train_df = train_dataframe.drop(['QuoteConversion_Flag'], axis=1)
test_df = pd.read_csv("data/assignment_three/revised_test.csv")

In [3]:
print(f"Observations in the Training Dataset: {len(train_df)}.")
print(f"Observations in the Testing Dataset: {len(test_df)}.")

Observations in the Training Dataset: 65000.
Observations in the Testing Dataset: 173836.


In [4]:
X = train_df
y = train_dataframe['QuoteConversion_Flag']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=42)

In [6]:
def accuracy_output(model):
    print("=== ACCURACY ===")
    print(round(model.score(X_test, y_test),6))

In [7]:
def confusion_output(y_test,X_test_prediction):
    print("=== CONFUSION MATRIX ===")
    print(confusion_matrix(y_test, X_test_prediction))

# 1. Decision Tree Classifier

In [8]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_prediction = decision_tree.predict(X_test)

In [9]:
accuracy_output(decision_tree)
confusion_output(y_test, decision_tree_prediction)

=== ACCURACY ===
0.883692
=== CONFUSION MATRIX ===
[[14656  1155]
 [ 1113  2576]]


## 1.1 Hyperparameter Tuning - RandomSearchCV

In [10]:
parameters = {
    'min_samples_split' : [i*50 for i in range(1,20)],
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes' : [i*50 for i in range(2,8)]
}

In [11]:
decision_tree_random = RandomizedSearchCV(decision_tree,parameters,n_iter=50,cv=5,n_jobs=-1,verbose=False)
decision_tree_random.fit(X_train, y_train)
decision_tree_random_params = decision_tree_random.best_params_
print(f"Random Search Optimal Parameters: {decision_tree_random_params}")

Random Search Optimal Parameters: {'min_samples_split': 100, 'max_leaf_nodes': 100, 'criterion': 'entropy'}


# 2. Random Forest

In [12]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_prediction = random_forest.predict(X_test)

In [13]:
accuracy_output(random_forest)
confusion_output(y_test, random_forest_prediction)

=== ACCURACY ===
0.905641
=== CONFUSION MATRIX ===
[[15535   276]
 [ 1564  2125]]


## 2.1 Hyperparameter Tuning - RandomSearchCV

In [14]:
random_parameters = {
    'min_samples_leaf' : [i for i in range(1,5)],
    'max_depth': [i*5 for i in range(10,20)],
    'n_estimators':[i*5 for i in range(5,15)]
}

In [15]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=50,cv=5,n_jobs=-1,verbose=False)

random_forest_random.fit(X_train, y_train)
random_forest_random_params = random_forest_random.best_params_

print(random_forest_random_params)

{'n_estimators': 65, 'min_samples_leaf': 2, 'max_depth': 65}


# 3. Support Vector Machines

In [16]:
support_vector_machines = svc()
support_vector_machines.fit(X_train, y_train)
support_vector_prediction = support_vector_machines.predict(X_test)

In [17]:
accuracy_output(support_vector_machines)
confusion_output(y_test, support_vector_prediction)

=== ACCURACY ===
0.387795
=== CONFUSION MATRIX ===
[[ 4539 11272]
 [  666  3023]]


## 3.1 Hyperparameter Tuning - RandomSearchCV

In [18]:
svm_parameters = {
    'penalty':['l1','l2'],
    'C':[i*50 for i in range(1,5)],
    'max_iter':[i*1000 for i in range(1,5)]
}

In [19]:
svm_random = RandomizedSearchCV(support_vector_machines,svm_parameters,n_iter=5,cv=5,n_jobs=-1,verbose=False)

svm_random.fit(X_train,y_train)
svm_random_params = svm_random.best_params_

print(svm_random_params)

{'penalty': 'l2', 'max_iter': 2000, 'C': 200}


# 4. K-Nearest Neighbors

In [20]:
k_neighbors = KNeighborsClassifier()
k_neighbors.fit(X_train, y_train)
k_neighbors_prediction = k_neighbors.predict(X_test)

In [21]:
accuracy_output(k_neighbors)
confusion_output(y_test, k_neighbors_prediction)

=== ACCURACY ===
0.781333
=== CONFUSION MATRIX ===
[[15037   774]
 [ 3490   199]]


## 4.1 Hyperparameter Tuning - RandomSearchCV

In [22]:
knn_parameters = {
    'n_neighbors':[i*5 for i in range(1, 5)],
    'weights':['uniform','distance'],
    'p':[1,2],
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size':[i*30 for i in range(1,10)]
}

In [23]:
knn_random = RandomizedSearchCV(k_neighbors,knn_parameters,n_iter=10,cv=5,n_jobs=-1,verbose=False)

knn_random.fit(X_train,y_train)
knn_random_params = knn_random.best_params_

print(knn_random_params)

{'weights': 'distance', 'p': 1, 'n_neighbors': 20, 'leaf_size': 150, 'algorithm': 'auto'}


# 5. Multilayer Perceptron

In [24]:
multilayer_perceptron = MLPClassifier()
multilayer_perceptron.fit(X_train, y_train)
multilayer_perceptron_prediction = multilayer_perceptron.predict(X_test)

In [25]:
accuracy_output(multilayer_perceptron)
confusion_output(y_test, multilayer_perceptron_prediction)

=== ACCURACY ===
0.598923
=== CONFUSION MATRIX ===
[[8298 7513]
 [ 308 3381]]


## 5.1 Hyperparameter Tuning - RandomSearchCV

In [26]:
m_layer_parameters = {
    'activation':['identity', 'logistic', 'tanh', 'relu'],
    'learning_rate_init':[i/1000 for i in range(1,5)],
    'max_iter':[i*100 for i in range(1,5)],
    'tol':[i/10000 for i in range(1,4)]
}

In [27]:
m_layer_random = RandomizedSearchCV(multilayer_perceptron,m_layer_parameters,n_iter=5,cv=5,n_jobs=-1,verbose=False)

m_layer_random.fit(X_train,y_train)
m_layer_random_params = m_layer_random.best_params_

print(m_layer_random_params)

{'tol': 0.0001, 'max_iter': 100, 'learning_rate_init': 0.003, 'activation': 'identity'}


# SMOTE

In [28]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print(f"Original Train Dataframe Shape: {Counter(y_train)}.")
print(f"Post-SMOTE Train Dataframe Shape: {Counter(y_smote)}.")

Original Train Dataframe Shape: Counter({0: 36927, 1: 8573}).
Post-SMOTE Train Dataframe Shape: Counter({0: 36927, 1: 36927}).


# Stacked Model

In [29]:
models = [
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    svc(), 
    KNeighborsClassifier(), 
    MLPClassifier()
]

stack_train, stack_test = stacking(models,                   
    X_train, y_train, X_test,   
    regression=False, 
    mode='oof_pred_bag', 
    needs_proba=False,
    save_dir=None, 
    metric=accuracy_score, 
    n_folds=4, 
    stratified=True,
    shuffle=True,  
    random_state=0,    
    verbose=2)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [DecisionTreeClassifier]
    fold  0:  [0.87718681]
    fold  1:  [0.88492308]
    fold  2:  [0.88026374]
    fold  3:  [0.87560440]
    ----
    MEAN:     [0.87949451] + [0.00355385]
    FULL:     [0.87949451]

model  1:     [RandomForestClassifier]
    fold  0:  [0.90109890]
    fold  1:  [0.90312088]
    fold  2:  [0.90382418]
    fold  3:  [0.90417582]
    ----
    MEAN:     [0.90305495] + [0.00119148]
    FULL:     [0.90305495]

model  2:     [LinearSVC]
    fold  0:  [0.81090110]
    fold  1:  [0.80307692]
    fold  2:  [0.62276923]
    fold  3:  [0.81142857]
    ----
    MEAN:     [0.76204396] + [0.08047828]
    FULL:     [0.76204396]

model  3:     [KNeighborsClassifier]
    fold  0:  [0.78250549]
    fold  1:  [0.78215385]
    fold  2:  [0.78602198]
    fold  3:  [0.77995604]
    ----
    MEAN:     [0.78265934] + [0.00217338]
    FULL:  

# Stacked Model With SMOTE & Hyperparameter Tuning

In [30]:
models = [
    DecisionTreeClassifier(**decision_tree_random_params), 
    RandomForestClassifier(**random_forest_random_params), 
    svc(**svm_random_params), 
    KNeighborsClassifier(**knn_random_params), 
    MLPClassifier(**m_layer_random_params)
]

stack_train, stack_test = stacking(models,                   
    X_smote, y_smote, X_test,   
    regression=False, 
    mode='oof_pred_bag', 
    needs_proba=False,
    save_dir=None, 
    metric=accuracy_score, 
    n_folds=4, 
    stratified=True,
    shuffle=True,  
    random_state=0,    
    verbose=2)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [DecisionTreeClassifier]
    fold  0:  [0.92964688]
    fold  1:  [0.93170494]
    fold  2:  [0.93375941]
    fold  3:  [0.93110545]
    ----
    MEAN:     [0.93155417] + [0.00147690]
    FULL:     [0.93155415]

model  1:     [RandomForestClassifier]
    fold  0:  [0.93760832]
    fold  1:  [0.93809575]
    fold  2:  [0.93955479]
    fold  3:  [0.93711748]
    ----
    MEAN:     [0.93809408] + [0.00091151]
    FULL:     [0.93809408]

model  2:     [LinearSVC]
    fold  0:  [0.56834922]
    fold  1:  [0.68235485]
    fold  2:  [0.50560581]
    fold  3:  [0.67372583]
    ----
    MEAN:     [0.60750893] + [0.07400054]
    FULL:     [0.60750941]

model  3:     [KNeighborsClassifier]
    fold  0:  [0.73099003]
    fold  1:  [0.72985269]
    fold  2:  [0.73265450]
    fold  3:  [0.72821318]
    ----
    MEAN:     [0.73042760] + [0.00162092]
    FULL:  

# Train DataFrame Results

In [31]:
train_results = {
    'd_tree':decision_tree_prediction,
    'r_forest':random_forest_prediction,
    'svm':support_vector_prediction,
    'knn':k_neighbors_prediction,
    'm_layer':multilayer_perceptron_prediction
}

train_results_df = pd.DataFrame(data=train_results)
train_results_df.head(1)

Unnamed: 0,d_tree,r_forest,svm,knn,m_layer
0,0,0,1,0,1


# Casting to Test DataFrame

In [32]:
test_results = {
    'd_tree':DecisionTreeClassifier(**decision_tree_random_params).fit(X_smote,y_smote).predict(test_df),
    'r_forest':RandomForestClassifier(**random_forest_random_params).fit(X_smote,y_smote).predict(test_df),
    'svm':svc(**svm_random_params).fit(X_smote,y_smote).predict(test_df),
    'knn':KNeighborsClassifier(**knn_random_params).fit(X_smote,y_smote).predict(test_df),
    'm_layer':MLPClassifier(**m_layer_random_params).fit(X_smote,y_smote).predict(test_df)
}

test_results_df = pd.DataFrame(data=test_results)
test_results_df.head()

Unnamed: 0,d_tree,r_forest,svm,knn,m_layer
0,0,0,1,0,0
1,0,0,1,1,0
2,0,0,1,1,0
3,0,0,0,0,0
4,0,0,1,1,0


# Kaggle Submission

In [63]:
final_model = GradientBoostingClassifier()
final_model.fit(X_smote, y_smote)

GradientBoostingClassifier()

In [64]:
final_prediction = final_model.predict(test_df)

In [70]:
kaggle_df = pd.DataFrame(data=final_prediction)
kaggle_df.to_csv("bernert_kaggle_submission.csv")