## Importing the relevant libraries

In [1]:
import os, tarfile, urllib, urlextract, email, nltk, re, warnings
from email import policy
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from html import unescape
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, r2_score
from scipy.sparse import csr_matrix
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import NearMiss
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold

In [2]:
pip install urlextract

Note: you may need to restart the kernel to use updated packages.


In [3]:
import urlextract

## Fetching the data

In [4]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

fetch_spam_data()

## Loading a few emails

In [5]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

print(len(ham_filenames))

print(len(spam_filenames))

2500
500


## Using Python's email module to parse these emails for handling errors, encoding and so on.

In [6]:
def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

## Looking at one example of ham and one example of spam, to get a feel of what the data looks like:

In [7]:
print(ham_emails[1].get_content().strip())
print(spam_emails[6].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are loo

## Since some emails are actually multipart consisting of emails and attachments(which in turn may have their own attachments), I'll be examining various types of structures that we have:

In [8]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [9]:
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [10]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [11]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

## Email Headers

In [12]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

## Focusing mainly on the subject header

In [13]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

## Splitting emails data into training and testing sets

In [14]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=45)

In [15]:
def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [16]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

<html>
<head>
</head>
<body>

<center>
<font face="times" size="6" color="#000000">Save up to

<font color="#ff0000">75%</font> on your Term Life
Insurance!</font>
<br>  
<font face="times" size="4" color="#000000">
<i>Compare rates from top insurance companies around
the country</i></font>
<br><br>
<font face="arial" size="4" color="#7084D6">
<b>In our life and times, it's important to plan for
your family's future, while 
<br>being comfortable financially.  Choose the right
Life Insurance policy today.</font>
<p>
<font face="arial" size="3" color="#000000">
<i>Click the link below to compare the lowest rates
and save up to <font
color="#ff0000">75%</font></i></b></font>  
<p>
<a
href="http://insurancequotesource.com/user0202/termquotes/473400/"><font
face="arial"
size="4">
<b>COMPARE YOUR COVERAGE</b></font></a>
<p>
<font face="times" size="5" color="#000000">
You'll be able to compare rates and get a free
application in <i>less than a minute!</i></font>
<p>
<font face="arial" size="

In [17]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [18]:
print(email_to_text(sample_html_spam)[:100], "...")


Save up to
75% on your Term Life
Insurance!
Compare rates from top insurance companies around
the c ...


In [19]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [20]:
try:
    import google.colab
    !conda install -q -U urlextract
except ImportError:
    pass

try:
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [21]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [22]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'number': 12, 'is': 3, 'on': 3, 'url': 2, 'msie': 2, 'os': 2, 'x': 2, 'i': 2, 'use': 2, 'build': 2, 'chimera': 2, 'with': 2, 'the': 2, 'are': 2, 'too': 2, 'and': 2, 'date': 1, 'numbertnumb': 1, 'rael': 1, 'plagu': 1, 'by': 1, 'instabl': 1, 'mac': 1, 'a': 1, 'recent': 1, 'nightli': 1, 'of': 1, 'as': 1, 'my': 1, 'default': 1, 'browser': 1, 'ha': 1, 'some': 1, 'issu': 1, 'plugin': 1, 'or': 1, 'quicktim': 1, 'anyway': 1, 'but': 1, 'work': 1, 'great': 1, 'fast': 1, 'mozilla': 1, 'ugli': 1, 'slow': 1, 'unstabl': 1, 'opera': 1, 'doesn': 1, 't': 1, 'render': 1, 'mani': 1, 'page': 1, 'omniweb': 1, 'icab': 1, 'not': 1, 'keep': 1, 'up': 1, 'rock': 1, 'have': 1, 'chimeraknight': 1, 'to': 1, 'do': 1, 'updat': 1, 'it': 1, 'also': 1, 'make': 1}),
       Counter({'to': 2, 'exmh': 2, 'user': 2, 'what': 1, 's': 1, 'the': 1, 'trick': 1, 'again': 1, 'have': 1, 'it': 1, 'default': 1, 'show': 1, 'text': 1, 'plain': 1, 'instead': 1, 'of': 1, 'html': 1, 'harlan': 1, '__________________________

In [23]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [24]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 24 stored elements in Compressed Sparse Row format>

In [25]:
X_few_vectors.toarray()

array([[67, 12,  2,  3,  2,  0,  3,  2,  1,  1,  1],
       [21,  0,  1,  0,  1,  0,  0,  0,  1,  2,  1],
       [52,  0,  3,  1,  1,  4,  0,  1,  1,  0,  1]], dtype=int32)

In [26]:
vocab_transformer.vocabulary_

{'number': 1,
 'url': 2,
 'is': 3,
 'the': 4,
 'spamassassin': 5,
 'on': 6,
 'build': 7,
 'of': 8,
 'to': 9,
 'it': 10}

## Running a pipeline for transforming an email to its corresponding vector..

In [27]:
pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

In [28]:
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.fit_transform(X_test)

## Balancing the target variable using Imbalanced Learn library's Under Sampling Method, namely the NearMiss Algorithm

In [29]:
nm = NearMiss()
X_train_transformed, y_train = nm.fit_resample(X_train_transformed,y_train)
X_test_transformed, y_test = nm.fit_resample(X_test_transformed,y_test)

## Feature Selection

In [30]:
sel = VarianceThreshold(threshold=0)
sel.fit(X_train_transformed,y_train)

VarianceThreshold(threshold=0)

In [31]:
sum(sel.get_support())

983

In [32]:
X_train_transformed = sel.transform(X_train_transformed)
X_test_transformed = sel.transform(X_test_transformed)

In [33]:
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train_transformed,y_train)

VarianceThreshold(threshold=0.01)

In [34]:
sum(sel.get_support())

897

In [35]:
X_train_transformed = sel.transform(X_train_transformed)
X_test_transformed = sel.transform(X_test_transformed)

## Training and testing the Logistic Regression model

In [36]:
lr = LogisticRegression(
                C = 0.05,
                max_iter = 100,
                tol = 0.0001,
                solver = 'sag',
                fit_intercept = True,
                penalty = 'l2',
                dual = False,
                verbose = 0)
lr.fit(X_train_transformed,y_train)

LogisticRegression(C=0.05, solver='sag')

In [37]:
predictions = lr.predict(X_test_transformed)
predictions

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [38]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[  8 166]
 [  0 174]]
              precision    recall  f1-score   support

           0       1.00      0.05      0.09       174
           1       0.51      1.00      0.68       174

    accuracy                           0.52       348
   macro avg       0.76      0.52      0.38       348
weighted avg       0.76      0.52      0.38       348



## Training and evaluating the Random Forest Classifier

In [39]:
rfc = RandomForestClassifier()
rfc.fit(X_train_transformed,y_train)

RandomForestClassifier()

In [40]:
rfc_pred = rfc.predict(X_test_transformed)
rfc_pred

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [41]:
print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

[[163  11]
 [ 26 148]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90       174
           1       0.93      0.85      0.89       174

    accuracy                           0.89       348
   macro avg       0.90      0.89      0.89       348
weighted avg       0.90      0.89      0.89       348



## Training and testing the Decision Tree Classifier

In [42]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train_transformed,y_train)

DecisionTreeClassifier()

In [43]:
dtree_pred = dtree.predict(X_test_transformed)
dtree_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,

In [44]:
print(confusion_matrix(y_test,dtree_pred))
print(classification_report(y_test,dtree_pred))

[[145  29]
 [ 35 139]]
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       174
           1       0.83      0.80      0.81       174

    accuracy                           0.82       348
   macro avg       0.82      0.82      0.82       348
weighted avg       0.82      0.82      0.82       348



## Training and evaluating the K Nearest Neighbors Classifier

In [45]:
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train_transformed,y_train)

KNeighborsClassifier(n_neighbors=6)

In [46]:
knn_pred = knn.predict(X_test_transformed)
knn_pred

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,

In [47]:
print(confusion_matrix(y_test,knn_pred))
print(classification_report(y_test,knn_pred))

[[168   6]
 [ 51 123]]
              precision    recall  f1-score   support

           0       0.77      0.97      0.85       174
           1       0.95      0.71      0.81       174

    accuracy                           0.84       348
   macro avg       0.86      0.84      0.83       348
weighted avg       0.86      0.84      0.83       348



## Training and testing the Support Vector Classifier

In [48]:
svm = SVC()
svm.fit(X_train_transformed,y_train)

SVC()

In [49]:
svm_pred = svm.predict(X_test_transformed)
svm_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,

In [50]:
print(confusion_matrix(y_test,svm_pred))
print(classification_report(y_test,svm_pred))

[[174   0]
 [ 59 115]]
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       174
           1       1.00      0.66      0.80       174

    accuracy                           0.83       348
   macro avg       0.87      0.83      0.83       348
weighted avg       0.87      0.83      0.83       348



## Training and evaluating the XG Boost Classifier

In [51]:
xgb = XGBClassifier()
xgb.fit(X_train_transformed,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [52]:
xgb_pred = xgb.predict(X_test_transformed)
xgb_pred

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,

In [53]:
print(confusion_matrix(y_test,xgb_pred))
print(classification_report(y_test,xgb_pred))

[[160  14]
 [ 33 141]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       174
           1       0.91      0.81      0.86       174

    accuracy                           0.86       348
   macro avg       0.87      0.86      0.86       348
weighted avg       0.87      0.86      0.86       348



## Training and testing the Multinomial Naive Bayes model

In [54]:
mnb = MultinomialNB()
mnb.fit(X_train_transformed,y_train)

MultinomialNB()

In [55]:
mnb_pred = mnb.predict(X_test_transformed)
mnb_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [56]:
print(confusion_matrix(y_test,mnb_pred))
print(classification_report(y_test,mnb_pred))

[[136  38]
 [ 40 134]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       174
           1       0.78      0.77      0.77       174

    accuracy                           0.78       348
   macro avg       0.78      0.78      0.78       348
weighted avg       0.78      0.78      0.78       348



## Hyperparameter Tuning using GridSearchCV

In [57]:
param_grid = {'C': [1,0.1,10,100,1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}

In [58]:
grid_search = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid_search.fit(X_train_transformed,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.626, total=   0.1s
[CV] C=1, gamma=1, kernel=rbf ........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ............ C=1, gamma=1, kernel=rbf, score=0.504, total=   0.1s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.500, total=   0.1s
[CV] C=1, gamma=1, kernel=rbf ........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ............ C=1, gamma=1, kernel=rbf, score=0.500, total=   0.1s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.500, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.916, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.916, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.869, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.754, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.546, total=   0.1s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.992, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.969, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.969, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.969, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.977, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.969, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.969, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:    9.1s finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 0.1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [59]:
grid_search.best_score_

0.9755020551967117

In [60]:
svm = SVC(C=1,gamma=0.01,kernel='rbf',random_state=4)
svm.fit(X_train_transformed,y_train)

SVC(C=1, gamma=0.01, random_state=4)

In [61]:
optimized_svm_pred = svm.predict(X_test_transformed)
optimized_svm_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [62]:
print(confusion_matrix(y_test,optimized_svm_pred))
print(classification_report(y_test,optimized_svm_pred))

[[173   1]
 [ 22 152]]
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       174
           1       0.99      0.87      0.93       174

    accuracy                           0.93       348
   macro avg       0.94      0.93      0.93       348
weighted avg       0.94      0.93      0.93       348



In [63]:
param_grid = {'n_neighbors': [3,4,5,6,7],
             'weights': ['uniform','distance'],
             'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
             'p': [1,2]}

In [64]:
grid_search = GridSearchCV(KNeighborsClassifier(),param_grid,refit=True,verbose=4)
grid_search.fit(X_train_transformed,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] algorithm=auto, n_neighbors=3, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=uniform, score=0.840, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=uniform, score=0.863, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=uniform, score=0.908, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=uniform, score=0.900, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=uniform, score=0.931, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=3, p=1, weights=distance, score=0.847, total=   

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


[CV]  algorithm=auto, n_neighbors=3, p=2, weights=distance, score=0.878, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=2, weights=distance ............
[CV]  algorithm=auto, n_neighbors=3, p=2, weights=distance, score=0.893, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=2, weights=distance ............
[CV]  algorithm=auto, n_neighbors=3, p=2, weights=distance, score=0.923, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=2, weights=distance ............
[CV]  algorithm=auto, n_neighbors=3, p=2, weights=distance, score=0.954, total=   0.0s
[CV] algorithm=auto, n_neighbors=3, p=2, weights=distance ............
[CV]  algorithm=auto, n_neighbors=3, p=2, weights=distance, score=0.954, total=   0.0s
[CV] algorithm=auto, n_neighbors=4, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=4, p=1, weights=uniform, score=0.809, total=   0.0s
[CV] algorithm=auto, n_neighbors=4, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=4, p=1, weights=uni

[CV]  algorithm=auto, n_neighbors=7, p=1, weights=uniform, score=0.877, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=uniform .............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=uniform, score=0.915, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=distance, score=0.802, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=distance, score=0.878, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=distance, score=0.900, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=distance, score=0.915, total=   0.0s
[CV] algorithm=auto, n_neighbors=7, p=1, weights=distance ............
[CV]  algorithm=auto, n_neighbors=7, p=1, weights=dist

[CV]  algorithm=ball_tree, n_neighbors=4, p=2, weights=distance, score=0.954, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform, score=0.824, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform, score=0.847, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform, score=0.915, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform, score=0.923, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=5, p=1, weights=uniform, score=0.923, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=5, p=1, weights=distance .......
[CV]  algorithm=ball_tree, 

[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform, score=0.870, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform, score=0.908, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform, score=0.931, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform ........
[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=uniform, score=0.915, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=distance .......
[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=distance, score=0.863, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=distance .......
[CV]  algorithm=ball_tree, n_neighbors=7, p=2, weights=distance, score=0.878, total=   0.0s
[CV] algorithm=ball_tree, n_neighbors=7, p=2, weights=distance .......
[CV]  algorithm=ball_tree,

[CV]  algorithm=kd_tree, n_neighbors=5, p=1, weights=distance, score=0.923, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=1, weights=distance .........
[CV]  algorithm=kd_tree, n_neighbors=5, p=1, weights=distance, score=0.946, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=1, weights=distance .........
[CV]  algorithm=kd_tree, n_neighbors=5, p=1, weights=distance, score=0.962, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform ..........
[CV]  algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform, score=0.863, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform ..........
[CV]  algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform, score=0.863, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform ..........
[CV]  algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform, score=0.923, total=   0.0s
[CV] algorithm=kd_tree, n_neighbors=5, p=2, weights=uniform ..........
[CV]  algorithm=kd_tree, n_neighbors=

[CV]  algorithm=brute, n_neighbors=3, p=1, weights=uniform, score=0.900, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=uniform ............
[CV]  algorithm=brute, n_neighbors=3, p=1, weights=uniform, score=0.931, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=3, p=1, weights=distance, score=0.847, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=3, p=1, weights=distance, score=0.893, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=3, p=1, weights=distance, score=0.915, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=3, p=1, weights=distance, score=0.923, total=   0.0s
[CV] algorithm=brute, n_neighbors=3, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=3, p=1, weigh

[CV]  algorithm=brute, n_neighbors=6, p=1, weights=uniform, score=0.840, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=uniform ............
[CV]  algorithm=brute, n_neighbors=6, p=1, weights=uniform, score=0.885, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=uniform ............
[CV]  algorithm=brute, n_neighbors=6, p=1, weights=uniform, score=0.892, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=uniform ............
[CV]  algorithm=brute, n_neighbors=6, p=1, weights=uniform, score=0.908, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=6, p=1, weights=distance, score=0.840, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=6, p=1, weights=distance, score=0.885, total=   0.0s
[CV] algorithm=brute, n_neighbors=6, p=1, weights=distance ...........
[CV]  algorithm=brute, n_neighbors=6, p=1, weights

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    4.3s finished


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [3, 4, 5, 6, 7], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             verbose=4)

In [65]:
grid_search.best_params_

{'algorithm': 'auto', 'n_neighbors': 4, 'p': 2, 'weights': 'distance'}

In [66]:
optimized_knn_pred = grid_search.predict(X_test_transformed)
optimized_knn_pred

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,

In [67]:
print(confusion_matrix(y_test,optimized_knn_pred))
print(classification_report(y_test,optimized_knn_pred))

[[168   6]
 [ 49 125]]
              precision    recall  f1-score   support

           0       0.77      0.97      0.86       174
           1       0.95      0.72      0.82       174

    accuracy                           0.84       348
   macro avg       0.86      0.84      0.84       348
weighted avg       0.86      0.84      0.84       348



In [68]:
param_grid = {'n_estimators': [100,200,300,400,500],
             'criterion': ['gini','entropy'],
             'max_features': ["auto", "sqrt", "log2"],
             'bootstrap': [True,False],
             'class_weight': ['balanced','balanced_subsample']}

In [69]:
grid_search = GridSearchCV(RandomForestClassifier(),param_grid,refit=True,verbose=4)
grid_search.fit(X_train_transformed,y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100, score=0.969, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100, score=0.969, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s


[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=100, score=0.962, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200, score=0.969, total=   0.4s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200, score=0.985, total=   0.4s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200, score=0.985, total=   0.4s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=200, score=0.977, t

[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400, score=0.985, total=   0.8s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400, score=0.985, total=   0.7s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400, score=0.977, total=   0.7s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=400, score=0.962, total=   0.7s
[CV] bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=500 
[CV]  bootstrap=True, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=500, score=0.977, t

[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=100, score=0.962, total=   0.2s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=200, score=0.962, total=   0.4s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=200, score=0.992, total=   0.4s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=auto, n_est

[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=300, score=0.954, total=   0.5s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400, score=0.977, total=   0.7s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400, score=0.992, total=   0.8s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400, score=0.985, total=   0.7s
[CV] bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced, criterion=entropy, max_features=sqrt, n_est

[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100, score=0.977, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100, score=0.977, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=100 
[CV]  bootstrap=True, class_weig

[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=200, score=0.962, total=   0.4s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300, score=0.969, total=   0.6s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300, score=0.985, total=   0.6s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300, score=0.985, total=   0.6s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=sqrt, n_estimators=300 
[CV]  bootstrap=True, class_weig

[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=400, score=0.992, total=   0.8s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=400, score=0.969, total=   0.8s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=500 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=500, score=0.977, total=   1.0s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=500 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=500, score=0.992, total=   1.0s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=500 
[CV]  bootstrap=True, class_weig

[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=100, score=0.977, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=100, score=0.969, total=   0.2s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=200, score=0.969, total=   0.4s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=200 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=200, score=0.985, total=   0.4s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=200 
[CV]  bo

[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=300, score=0.985, total=   0.7s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=300 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=300, score=0.985, total=   0.6s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=300 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=300, score=0.977, total=   0.6s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=400 
[CV]  bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=400, score=0.977, total=   0.8s
[CV] bootstrap=True, class_weight=balanced_subsample, criterion=entropy, max_features=log2, n_estimators=400 
[CV]  bo

[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=500, score=0.977, total=   0.7s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=auto, n_estimators=500, score=0.954, total=   0.7s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100, score=0.977, total=   0.2s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100, score=0.985, total=   0.2s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=sqrt, n_estimators=100, score

[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300, score=0.977, total=   0.5s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300, score=0.985, total=   0.5s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300, score=0.985, total=   0.4s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300, score=0.985, total=   0.4s
[CV] bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=gini, max_features=log2, n_estimators=300, score

[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500, score=0.992, total=   0.8s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500, score=0.985, total=   0.8s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500, score=0.977, total=   0.8s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=auto, n_estimators=500, score=0.962, total=   0.7s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=sqrt, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=sq

[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=200, score=0.985, total=   0.3s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=200, score=0.985, total=   0.3s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=200, score=0.977, total=   0.3s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=300, score=0.977, total=   0.4s
[CV] bootstrap=False, class_weight=balanced, criterion=entropy, max_features=log2, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced, criterion=entropy, max_features=lo

[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=400, score=0.985, total=   0.6s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=400 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=400, score=0.969, total=   0.6s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=400 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=400, score=0.962, total=   0.6s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=500, score=0.977, total=   0.9s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=auto, n_estimators=500 
[CV]  bootstrap=False, c

[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100, score=0.992, total=   0.2s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100, score=0.985, total=   0.1s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100, score=0.969, total=   0.1s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=100, score=0.977, total=   0.1s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_features=log2, n_estimators=200 
[CV]  bootstrap=False, c

[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300, score=0.977, total=   0.5s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300, score=0.985, total=   0.5s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300, score=0.985, total=   0.4s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300, score=0.985, total=   0.5s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=auto, n_estimators=300 


[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=400, score=0.977, total=   0.6s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=400 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=400, score=0.962, total=   0.6s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=500, score=0.977, total=   0.9s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=500 
[CV]  bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=500, score=0.985, total=   0.8s
[CV] bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_features=sqrt, n_estimators=500 


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  5.1min finished


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=4)

In [70]:
rfc = RandomForestClassifier(bootstrap=True,class_weight='balanced',criterion='entropy',max_features='log2',n_estimators=500,random_state=68)
rfc.fit(X_train_transformed,y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features='log2', n_estimators=500, random_state=68)

In [71]:
optimized_rfc_pred = rfc.predict(X_test_transformed)
optimized_rfc_pred

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [72]:
print(confusion_matrix(y_test,optimized_rfc_pred))
print(classification_report(y_test,optimized_rfc_pred))

[[163  11]
 [ 21 153]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       174
           1       0.93      0.88      0.91       174

    accuracy                           0.91       348
   macro avg       0.91      0.91      0.91       348
weighted avg       0.91      0.91      0.91       348



## Conclusion

In [73]:
print("Performance of ML models used:")
print("Accuracy score of Logistic Regression:",str(np.round(accuracy_score(y_test,predictions)*100,2)) + '%')
print("Precision score of Logistic Regression:",str(np.round(precision_score(y_test,predictions)*100,2)) + '%')
print("Recall score of Logistic Regression:",str(np.round(recall_score(y_test,predictions)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of Support Vector Classifier:",str(np.round(accuracy_score(y_test,optimized_svm_pred)*100,2)) + '%')
print("Precision score of Support Vector Classifier:",str(np.round(precision_score(y_test,optimized_svm_pred)*100,2)) + '%')
print("Recall score of Support Vector Classifier:",str(np.round(recall_score(y_test,optimized_svm_pred)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of K Neighbors Classifier:",str(np.round(accuracy_score(y_test,optimized_knn_pred)*100,2)) + '%')
print("Precision score of K Neighbors Classifier:",str(np.round(precision_score(y_test,optimized_knn_pred)*100,2)) + '%')
print("Recall score of K Neighbors Classifier:",str(np.round(recall_score(y_test,optimized_knn_pred)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of Multinomial Naive Bayes:",str(np.round(accuracy_score(y_test,mnb_pred)*100,2)) + '%')
print("Precision score of Multinomial Naive Bayes:",str(np.round(precision_score(y_test,mnb_pred)*100,2)) + '%')
print("Recall score of Multinomial Naive Bayes:",str(np.round(recall_score(y_test,mnb_pred)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of Decision Tree Classifier:",str(np.round(accuracy_score(y_test,dtree_pred)*100,2)) + '%')
print("Precision score of Decision Tree Classifier:",str(np.round(precision_score(y_test,dtree_pred)*100,2)) + '%')
print("Recall score of Decision Tree Classifier:",str(np.round(recall_score(y_test,dtree_pred)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of Random Forests Classifier:",str(np.round(accuracy_score(y_test,optimized_rfc_pred)*100,2)) + '%')
print("Precision score of Random Forests Classifier:",str(np.round(precision_score(y_test,optimized_rfc_pred)*100,2)) + '%')
print("Recall score of Random Forests Classifier:",str(np.round(recall_score(y_test,optimized_rfc_pred)*100,2)) + '%')
print('-------------------------------------------------------------')
print("Accuracy score of XG Boost Classifier:",str(np.round(accuracy_score(y_test,xgb_pred)*100,2)) + '%')
print("Precision score of XG Boost Classifier:",str(np.round(precision_score(y_test,xgb_pred)*100,2)) + '%')
print("Recall score of XG Boost Classifier:",str(np.round(recall_score(y_test,xgb_pred)*100,2)) + '%')
print('-------------------------------------------------------------')

Performance of ML models used:
Accuracy score of Logistic Regression: 52.3%
Precision score of Logistic Regression: 51.18%
Recall score of Logistic Regression: 100.0%
-------------------------------------------------------------
Accuracy score of Support Vector Classifier: 93.39%
Precision score of Support Vector Classifier: 99.35%
Recall score of Support Vector Classifier: 87.36%
-------------------------------------------------------------
Accuracy score of K Neighbors Classifier: 84.2%
Precision score of K Neighbors Classifier: 95.42%
Recall score of K Neighbors Classifier: 71.84%
-------------------------------------------------------------
Accuracy score of Multinomial Naive Bayes: 77.59%
Precision score of Multinomial Naive Bayes: 77.91%
Recall score of Multinomial Naive Bayes: 77.01%
-------------------------------------------------------------
Accuracy score of Decision Tree Classifier: 81.61%
Precision score of Decision Tree Classifier: 82.74%
Recall score of Decision Tree Cla

### Support Vector Classifier has the best predictive performance among all the used models, possessing an accuracy score of more than 93%, closely followed by Random Forests Classifier which has an accuracy of just over 90%. Logistic regression has the worst predictive performance, having an accuracy score of a little more than 50%.