## Importing all the required modules for the project.

In [14]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix
from tabulate import tabulate

## Importing the dataset.

In [15]:
df = pd.read_csv("mqttdataset_reduced.csv")
df.head()

Unnamed: 0,tcp.flags,tcp.time_delta,tcp.len,mqtt.conack.flags,mqtt.conack.flags.reserved,mqtt.conack.flags.sp,mqtt.conack.val,mqtt.conflag.cleansess,mqtt.conflag.passwd,mqtt.conflag.qos,...,mqtt.qos,mqtt.retain,mqtt.sub.qos,mqtt.suback.qos,mqtt.ver,mqtt.willmsg,mqtt.willmsg_len,mqtt.willtopic,mqtt.willtopic_len,target
0,0x00000010,1.9e-05,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,legitimate
1,0x00000018,0.0,90,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
2,0x00000018,1e-06,8,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
3,0x00000018,1e-06,85,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
4,0x00000010,4e-06,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,legitimate


Removing unwanted rows as to focus only on the DoS attack related and legitimate data. 

In [16]:
df = df[(df["target"] == "legitimate") | (df["target"] == "dos")]
df["target"].unique()

array(['legitimate', 'dos'], dtype=object)

## Cleaning and Preprocessing the dataset.

In [17]:
df.shape

(295686, 34)

In [18]:
df.isna().any()

tcp.flags                     False
tcp.time_delta                False
tcp.len                       False
mqtt.conack.flags             False
mqtt.conack.flags.reserved    False
mqtt.conack.flags.sp          False
mqtt.conack.val               False
mqtt.conflag.cleansess        False
mqtt.conflag.passwd           False
mqtt.conflag.qos              False
mqtt.conflag.reserved         False
mqtt.conflag.retain           False
mqtt.conflag.uname            False
mqtt.conflag.willflag         False
mqtt.conflags                 False
mqtt.dupflag                  False
mqtt.hdrflags                 False
mqtt.kalive                   False
mqtt.len                      False
mqtt.msg                      False
mqtt.msgid                    False
mqtt.msgtype                  False
mqtt.proto_len                False
mqtt.protoname                False
mqtt.qos                      False
mqtt.retain                   False
mqtt.sub.qos                  False
mqtt.suback.qos             

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 295686 entries, 0 to 330925
Data columns (total 34 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   tcp.flags                   295686 non-null  object 
 1   tcp.time_delta              295686 non-null  float64
 2   tcp.len                     295686 non-null  int64  
 3   mqtt.conack.flags           295686 non-null  object 
 4   mqtt.conack.flags.reserved  295686 non-null  float64
 5   mqtt.conack.flags.sp        295686 non-null  float64
 6   mqtt.conack.val             295686 non-null  float64
 7   mqtt.conflag.cleansess      295686 non-null  float64
 8   mqtt.conflag.passwd         295686 non-null  float64
 9   mqtt.conflag.qos            295686 non-null  float64
 10  mqtt.conflag.reserved       295686 non-null  float64
 11  mqtt.conflag.retain         295686 non-null  float64
 12  mqtt.conflag.uname          295686 non-null  float64
 13  mqtt.conflag.willfl

### Handling all numerical data

In [20]:
scaler = MinMaxScaler(feature_range=(0, 1))

numerical_features = df.columns[df.dtypes != 'object']
if len(numerical_features) > 0:
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
df

Unnamed: 0,tcp.flags,tcp.time_delta,tcp.len,mqtt.conack.flags,mqtt.conack.flags.reserved,mqtt.conack.flags.sp,mqtt.conack.val,mqtt.conflag.cleansess,mqtt.conflag.passwd,mqtt.conflag.qos,...,mqtt.qos,mqtt.retain,mqtt.sub.qos,mqtt.suback.qos,mqtt.ver,mqtt.willmsg,mqtt.willmsg_len,mqtt.willtopic,mqtt.willtopic_len,target
0,0x00000010,3.166623e-07,0.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,legitimate
1,0x00000018,0.000000e+00,0.061644,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
2,0x00000018,1.666644e-08,0.005479,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
3,0x00000018,1.666644e-08,0.058219,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
4,0x00000010,6.666575e-08,0.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330921,0x00000010,4.999931e-08,0.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,legitimate
330922,0x00000010,1.221650e-05,1.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
330923,0x00000010,5.666589e-07,0.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos
330924,0x00000010,1.133318e-06,1.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dos


### Handling the Categorical data

In [21]:
label_encoder = LabelEncoder()

In [22]:
print(df["tcp.flags"].unique())
df['tcp.flags'] = label_encoder.fit_transform(df['tcp.flags'])
print(df['tcp.flags'].unique())

['0x00000010' '0x00000018' '0x00000011' '0x00000002' '0x00000012']
[1 4 2 0 3]


In [23]:
df['mqtt.conack.flags'] = label_encoder.fit_transform(df['mqtt.conack.flags'])
df['mqtt.conack.flags'].unique()

array([0, 1])

In [24]:
df['mqtt.conflags'] = label_encoder.fit_transform(df['mqtt.conflags'])
df['mqtt.conflags'].unique()

array([0, 1])

In [25]:
df['mqtt.hdrflags'] = label_encoder.fit_transform(df['mqtt.hdrflags'])
df['mqtt.hdrflags'].unique()

array([0, 4, 6, 3, 7, 5, 8, 2, 9, 1])

In [29]:
df['mqtt.msg'] = df['mqtt.msg'].astype(str)

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
# Create a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# Transform the 'mqtt.msg' column
encoded_msg = vectorizer.fit_transform(df['mqtt.msg'])
if not all(matrix.shape == encoded_msg[0].shape for matrix in encoded_msg):
    raise ValueError("Sparse matrices must have consistent dimensions.")
print(encoded_msg)
# Use the encoded_msg in your model
combined_matrix = hstack(encoded_msg)
df['mqtt.msg'] = combined_matrix

df['mqtt.msg']

  (1, 20399)	1.0
  (3, 2500)	1.0
  (6, 24710)	1.0
  (7, 10685)	1.0
  (8, 19)	1.0
  (9, 19710)	1.0
  (11, 2327)	1.0
  (13, 7062)	1.0
  (17, 48155)	1.0
  (18, 2327)	1.0
  (19, 6991)	1.0
  (21, 19)	1.0
  (24, 42289)	1.0
  (26, 5602)	1.0
  (30, 32761)	1.0
  (33, 19)	1.0
  (34, 9691)	1.0
  (35, 2327)	1.0
  (37, 46475)	1.0
  (41, 19)	1.0
  (43, 32012)	1.0
  (44, 2327)	1.0
  (45, 21212)	1.0
  (47, 2327)	1.0
  (49, 2327)	1.0
  :	:
  (295639, 19)	1.0
  (295641, 19)	1.0
  (295642, 33380)	1.0
  (295647, 4659)	1.0
  (295648, 4659)	1.0
  (295651, 19818)	1.0
  (295652, 19)	1.0
  (295654, 2327)	1.0
  (295656, 6464)	1.0
  (295657, 18445)	1.0
  (295661, 2327)	1.0
  (295668, 19)	1.0
  (295669, 2327)	1.0
  (295671, 4659)	1.0
  (295672, 19)	1.0
  (295673, 39879)	1.0
  (295674, 9684)	1.0
  (295676, 2885)	1.0
  (295677, 4659)	1.0
  (295678, 2582)	1.0
  (295679, 292)	1.0
  (295680, 5214)	1.0
  (295682, 8645)	1.0
  (295684, 33411)	1.0
  (295685, 4659)	1.0


ValueError: blocks must be 2-D

In [32]:
df['mqtt.msg'] = df['mqtt.msg'].astype(str)


from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Transform the 'mqtt.msg' column into a sparse matrix
encoded_msg = vectorizer.fit_transform(df['mqtt.msg'])

# Use the encoded_msg in your machine learning model

encoded_msg

from scipy.sparse import hstack

# Transform the 'mqtt.msg' column
encoded_msg = vectorizer.fit_transform(df['mqtt.msg'])
if not all(matrix.shape == encoded_msg[0].shape for matrix in encoded_msg):
    raise ValueError("Sparse matrices must have consistent dimensions.")
print(encoded_msg)
# Use the encoded_msg in your model
combined_matrix = hstack(encoded_msg)
df['mqtt.msg'] = combined_matrix

df['mqtt.msg']

  (1, 20399)	1
  (3, 2500)	1
  (6, 24710)	1
  (7, 10685)	1
  (8, 19)	1
  (9, 19710)	1
  (11, 2327)	1
  (13, 7062)	1
  (17, 48155)	1
  (18, 2327)	1
  (19, 6991)	1
  (21, 19)	1
  (24, 42289)	1
  (26, 5602)	1
  (30, 32761)	1
  (33, 19)	1
  (34, 9691)	1
  (35, 2327)	1
  (37, 46475)	1
  (41, 19)	1
  (43, 32012)	1
  (44, 2327)	1
  (45, 21212)	1
  (47, 2327)	1
  (49, 2327)	1
  :	:
  (295639, 19)	1
  (295641, 19)	1
  (295642, 33380)	1
  (295647, 4659)	1
  (295648, 4659)	1
  (295651, 19818)	1
  (295652, 19)	1
  (295654, 2327)	1
  (295656, 6464)	1
  (295657, 18445)	1
  (295661, 2327)	1
  (295668, 19)	1
  (295669, 2327)	1
  (295671, 4659)	1
  (295672, 19)	1
  (295673, 39879)	1
  (295674, 9684)	1
  (295676, 2885)	1
  (295677, 4659)	1
  (295678, 2582)	1
  (295679, 292)	1
  (295680, 5214)	1
  (295682, 8645)	1
  (295684, 33411)	1
  (295685, 4659)	1


ValueError: blocks must be 2-D

In [None]:
df["msg_len"] = [len(val) for val in df["mqtt.msg"].to_list()]
# df = df.drop('mqtt.msg', axis=1)
df["msg_len"].unique()

In [None]:
df['mqtt.protoname'] = label_encoder.fit_transform(df['mqtt.protoname'])
df["mqtt.protoname"].unique()

In [None]:
label_encoder.fit(["legitimate", "dos"])
mapping = {"legitimate": 0, "dos": 1}
df["target"] = [mapping[label] for label in df["target"].to_list()]
df["target"].unique()

In [None]:
df.info()

## Analysing the dataset

In [None]:
target_x = df["target"].unique()
target_l = list(df["target"])
target_y = []
for i in target_x:
    target_y.append(round(target_l.count(i)*100/len(target_l), 2))
plt.pie(target_y, labels=["Legitimate Traffic - " + str(target_y[0]) + "%", "DOS Traffic - " + str(target_y[1]) + "%"])
plt.show()

We can see that the dataset is not balanced, hence we will be using the algorithms which are less affected by unbalanced datasets. 
* Decision Trees
* Random Forests
* Support Vector Machines (SVMs) 

In [None]:
tcp_flag_x = list(df["tcp.flags"].unique())

tcp_flag_l_top = list(df[df["target"]==1]["tcp.flags"])
tcp_flag_y_top = []
for i in tcp_flag_x:
    tcp_flag_y_top.append(tcp_flag_l_top.count(i))

tcp_flag_l_bottom = list(df[df["target"]==0]["tcp.flags"])
tcp_flag_y_bottom = []
for i in tcp_flag_x:
    tcp_flag_y_bottom.append(tcp_flag_l_bottom.count(i))

plt.bar(tcp_flag_x, tcp_flag_y_bottom, label="Legitimate")
plt.bar(tcp_flag_x, tcp_flag_y_top, bottom=tcp_flag_y_bottom, label="DoS Attack")

plt.legend()
plt.show()

We can see in the graph that the majority of the flags are either SYN or RST flags.

1 -> '0x00000010' [SYN (Synchronization) flag]

4 -> '0x00000018' [Reset (RST) flag]

2 -> '0x00000011' [Urgent (URG) and Acknowledgment (ACK) flag]

0 -> '0x00000002' [Conditional Receive (CWR) flag]

3 -> '0x00000012' [PUSH (PSH) flag]

In [None]:
qos_x = df["mqtt.qos"].unique()
qos_l = list(df["mqtt.qos"])
qos_y = []
for i in qos_x:
    qos_y.append(qos_l.count(i))
plt.pie(qos_y, labels=["QOS Level: 0", "QOS Level: 1"])
plt.show()

Majority of the packets have been sent using QOS 0.

## Feature Selection

In [None]:
x = df.drop('target', axis=1)
x

In [None]:
y = df["target"]
y

In [None]:
selector = SelectKBest(chi2, k=20)
X_reduced = selector.fit_transform(x, y)
all_features = x.columns.tolist()
select_x = [all_features[i] for i in selector.get_support(indices=True)]
select_x

In [None]:
model = RandomForestClassifier(n_estimators=100)
model.fit(x, y)

feature_importances = model.feature_importances_

top_k_features = np.argsort(feature_importances)[::-1][:20]

random_x = []
for i in top_k_features:
    random_x.append(x.keys()[i])
random_x = list(random_x)
random_x

In [None]:
research_x = [
    "tcp.flags", 
    "tcp.time_delta", 
    "tcp.len", 
    "mqtt.conack.flags", 
    "mqtt.conflag", 
    "mqtt.dupflag", 
    "mqtt.hdrflags", 
    "mqtt.kalive", 
    "mqtt.len", 
    "msg_len", 
    "mqtt.qos", 
    "mqtt.sub.qos", 
    "mqtt.retain", 
    "mqtt.willmsg", 
    "mqtt.willtopic"
]

In [None]:
feature_names = list(set(random_x) & set(select_x) & set(research_x))
feature_names

In [None]:
features = {}
for i in feature_names:
    features[i] = x[i]
features = pd.DataFrame(features)
features

## Splitting of dataset.

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)

Splitted the dataset into train (60%), test (20%) and validation (20%). 

## Model Building

As discussed before during analysis, we will focus on the algorithms - Decision Tree, Random Forest and SVM.

### Decision Tree

Default Parameters

In [None]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 1")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Hypertuning Parameters

criterion Parameter

In [None]:
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"] # default - "gini"
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 2")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

splitter Parameter

In [None]:
param_grid = {
    "splitter": ["best", "random"] # default - "best"
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 3")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_features Parameter

In [None]:
param_grid = {
    "max_features": [None, "sqrt", "log2"] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 4")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_depth Parameter

In [None]:
param_grid = {
    "max_depth": [3, 5, 8] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 5")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_samples_split Parameter

In [None]:
param_grid = {
    "min_samples_split": [2, 5, 10] # default - 2
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 6")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_samples_leaf Parameter

In [None]:
param_grid = {
    "min_samples_leaf": [1, 2, 4] # default - 1
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 7")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_weight_fraction_leaf Parameter

In [None]:
param_grid = {
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2] # default - 0.0
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 8")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

random_state Parameter

In [None]:
param_grid = {
    "random_state": [42, 100, None] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 9")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_leaf_nodes Parameter

In [None]:
param_grid = {
    "max_leaf_nodes": [None, 10, 30] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 10")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_impurity_decrease Parameter

In [None]:
param_grid = {
    "min_impurity_decrease": [0.0, 0.1, 0.2] # default - 0.0
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 11")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

class_weight Parameter

In [None]:
param_grid = {
    "class_weight": ["balanced", None] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 12")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

monotonic_cst Parameter

In [None]:
param_grid = {
    "monotonic_cst": [0.0, 0.1, None] # default - None
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 13")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

ccp_alpha Parameter

In [None]:
param_grid = {
    "ccp_alpha": [0.0, 0.1, None] # default - 0.0
}

clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Trial 14")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Fine Tuned Algorithm

In [None]:
clf = DecisionTreeClassifier(splitter="random")
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("Decision Tree - Validation Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

In [None]:
y_res = clf.predict(x_test)

print("Decision Tree - Test Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_test, y_res)*100, 2)
accuracy = round(accuracy_score(y_test, y_res)*100, 2)
f1 = round(f1_score(y_test, y_res)*100, 2)
cm = confusion_matrix(y_test, y_res)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

### Random Forest

Default Parameters

In [None]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 1")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Hypertuning Parameters

criterion Parameter

In [None]:
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"] # default - "gini"
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 2")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

n_estimators Parameter

In [None]:
param_grid = {
    "n_estimators": [50, 100, 200] # default - 100
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 3")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_depth Parameter

In [None]:
param_grid = {
    "max_depth": [3, 5, 8] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 4")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_samples_split Parameter

In [None]:
param_grid = {
    "min_samples_split": [2, 5, 10] # default - 2
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 5")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_samples_leaf Parameter

In [None]:
param_grid = {
    "min_samples_leaf": [1, 2, 4] # default - 1
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 6")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_weight_fraction_leaf Parameter

In [None]:
param_grid = {
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2] # default - 0.0
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 7")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_features Parameter

In [None]:
param_grid = {
    "max_features": ["auto", "sqrt", "log2"] # default - "sqrt"
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 8")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_leaf_nodes Parameter

In [None]:
param_grid = {
    "max_leaf_nodes": [None, 10, 30] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 9")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

min_impurity_decrease Parameter

In [None]:
param_grid = {
    "min_impurity_decrease": [0.0, 0.1, 0.2] # default - 0.0
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 10")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

bootstrap Parameter

In [None]:
param_grid = {
    "bootstrap": [True, False] # default - True
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 11")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

oob_score Parameter

In [None]:
param_grid = {
    "oob_score": [True, False] # default - False
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 12")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

n_jobs Parameter

In [None]:
param_grid = {
    "n_jobs": [-1, 1] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 13")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

random_state Parameter

In [None]:
param_grid = {
    "random_state": [42, 100, None] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 14")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

verbose Parameter

In [None]:
param_grid = {
    "verbose": [0, 1] # default - 0
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 15")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

warm_start Parameter

In [None]:
param_grid = {
    "warm_start": [True, False] # default - False
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 16")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

class_weight Parameter

In [None]:
param_grid = {
    "class_weight": ["balanced", None] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 17")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

ccp_alpha Parameter

In [None]:
param_grid = {
    "ccp_alpha": [0.0, 0.1, None] # default - 0.0
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 18")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_samples Parameter

In [None]:
param_grid = {
    "max_samples": [None, 0.5, 1.0] # default - None
}

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Trial 19")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Fine Tuned Algorithm

In [None]:
clf = RandomForestClassifier(max_samples = 0.5)
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("Random Forest - Validation Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

In [None]:
y_res = clf.predict(x_test)

print("Random Forest - Test Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_test, y_res)*100, 2)
accuracy = round(accuracy_score(y_test, y_res)*100, 2)
f1 = round(f1_score(y_test, y_res)*100, 2)
cm = confusion_matrix(y_test, y_res)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

### SVM

Default Parameters

In [None]:
clf = SVC()
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 1")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Hypertuning Parameters

C Parameter

In [None]:
param_grid = {
    "C": [0.1, 1, 10, 100] # default - 1.0
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 2")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

kernel Parameter

In [None]:
param_grid = {
    "kernel": ["linear", "rbf", "poly"] # default - "rbf"
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 3")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

degree Parameter

In [None]:
param_grid = {
    "degree": [2, 3, 4] # default - 3
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 4")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

gamma Parameter

In [None]:
param_grid = {
    "gamma": ["auto", "scale", 0.1, 0.01] # default - "scale"
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 5")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

coef0 Parameter

In [None]:
param_grid = {
    "coef0": [0.0] # default - 0.0
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 6")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

shrinking Parameter

In [None]:
param_grid = {
    "shrinking": [True, False] # default - True
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 7")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

probability Parameter

In [None]:
param_grid = {
    "probability": [True, False] # default - False
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 8")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

tol Parameter

In [None]:
param_grid = {
    "tol": [1e-3, 1e-4, 1e-5] # default - 1e-3
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 9")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

cache_size Parameter

In [None]:
param_grid = {
    "cache_size": [200, 500] # default - 200
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 10")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

class_weight Parameter

In [None]:
param_grid = {
    "class_weight": ["balanced", None] # default - None
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 11")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

verbose Parameter

In [None]:
param_grid = {
    "verbose": [0, 1] # default - False
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 12")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

max_iter Parameter

In [None]:
param_grid = {
    "max_iter": [-1, 1000] # default - -1
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 13")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

decision_function_shape Parameter

In [None]:
param_grid = {
    "decision_function_shape": ["ovr", "ovo"] # default - "ovr"
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 14")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

break_ties Parameter

In [None]:
param_grid = {
    "break_ties": [True, False] # default - False
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 15")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

random_state Parameter

In [None]:
param_grid = {
    "random_state": [42, 100, None] # default - None
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Trial 16")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

Fine Tuned Algorithm

In [None]:
clf = SVC()
clf.fit(x_train, y_train)

In [None]:
y_cal = clf.predict(x_val)

print("SVM - Validation Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_val, y_cal)*100, 2)
accuracy = round(accuracy_score(y_val, y_cal)*100, 2)
f1 = round(f1_score(y_val, y_cal)*100, 2)
cm = confusion_matrix(y_val, y_cal)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")

In [None]:
y_res = clf.predict(x_test)

print("SVM - Test Dataset Final")
print("-----------------------\n")

auc_roc = round(roc_auc_score(y_test, y_res)*100, 2)
accuracy = round(accuracy_score(y_test, y_res)*100, 2)
f1 = round(f1_score(y_test, y_res)*100, 2)
cm = confusion_matrix(y_test, y_res)

print("AUC-ROC Score: ", auc_roc, "%", sep="")
print("Accuracy Score: ", accuracy, "%", sep="")
print("F1 Score: ", f1, "%", sep="")
print("Confusion Matrix:\n", tabulate(cm, tablefmt="grid"), sep="")