In [4]:
!pip install -q tqdm

In [62]:
import pandas as pd
import numpy as np
import tqdm
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

<h1> Read Data </h1>

In [63]:
X_train = pd.read_csv('train_bert_embedding_w_jobid.csv')
X_test = pd.read_csv('test_bert_embedding_w_jobid.csv')

In [64]:
X_train = X_train.drop(['job_id', 'fraudulent'], axis=1)
X_test = X_test.drop(['job_id', 'fraudulent'], axis=1)
y_train = pd.read_csv('train_bert_embedding_w_jobid.csv')['fraudulent']
y_test = pd.read_csv('test_bert_embedding_w_jobid.csv')['fraudulent']

In [65]:
# Display the first 5 rows of the dataset
X_train.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,sentiment score_profile,sentiment score_requirement,sentiment score_benefits,sentiment score_description,department_frequency,department_mean_target,num_of_nouns_company_profile,...,requirement759,requirement760,requirement761,requirement762,requirement763,requirement764,requirement765,requirement766,requirement767,requirement768
0,0,1,0,0.9313,0.9929,0.9716,0.9951,0.00028,0.0,0.185841,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
1,0,1,0,0.9618,0.926,0.0,0.9509,0.675685,0.044594,0.132743,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
2,0,1,1,0.9913,0.4019,0.3818,0.9426,0.675685,0.044594,0.207965,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
3,0,1,1,0.962,0.9657,0.9081,0.5719,7e-05,0.0,0.150442,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
4,0,1,1,0.9753,0.7391,0.9671,0.9974,0.675685,0.044594,0.19469,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869


In [66]:
# Display the first 5 rows of the dataset
X_test.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,sentiment score_profile,sentiment score_requirement,sentiment score_benefits,sentiment score_description,department_frequency,department_mean_target,num_of_nouns_company_profile,...,requirement759,requirement760,requirement761,requirement762,requirement763,requirement764,requirement765,requirement766,requirement767,requirement768
0,0,1,0,0.9856,0.7345,0.0,0.6486,0.022301,0.00627,0.123894,...,0.067283,-0.105608,0.290805,-0.149879,-0.095278,-0.495512,0.19802,-0.294343,-0.289188,0.094234
1,0,1,0,0.9612,0.9819,0.8462,0.9957,0.675685,0.044594,0.123894,...,0.067283,-0.105608,0.290805,-0.149879,-0.095278,-0.495512,0.19802,-0.294343,-0.289188,0.094234
2,0,0,0,0.0,0.0,0.0,0.9983,0.675685,0.044594,0.0,...,0.067283,-0.105608,0.290805,-0.149879,-0.095278,-0.495512,0.19802,-0.294343,-0.289188,0.094234
3,0,1,1,0.9934,0.8439,0.0,0.0,0.675685,0.044594,0.234513,...,0.067283,-0.105608,0.290805,-0.149879,-0.095278,-0.495512,0.19802,-0.294343,-0.289188,0.094234
4,0,1,0,0.9783,0.9618,0.8885,0.9896,0.675685,0.044594,0.079646,...,0.067283,-0.105608,0.290805,-0.149879,-0.095278,-0.495512,0.19802,-0.294343,-0.289188,0.094234


###K-Nearest Neighbors (KNN)###

In [36]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

# Predictions
y_pred_knn = knn_classifier.predict(X_test)

In [37]:
accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn)
recall = recall_score(y_test, y_pred_knn)
f1 = f1_score(y_test, y_pred_knn)
print("KNN - Accuracy:", accuracy)
print("KNN - Precision:", precision)
print("KNN - Recall:", recall)
print("KNN - F1:", f1)


KNN - Accuracy: 0.9798657718120806
KNN - Precision: 0.8707482993197279
KNN - Recall: 0.7071823204419889
KNN - F1: 0.7804878048780487


In [97]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
knn = KNeighborsClassifier(p=1)

# Create the random grid
params = {'n_neighbors': [3, 4, 5, 6, 7, 9],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size': [5, 10, 20, 30, 40]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
knn_random = RandomizedSearchCV(estimator=knn, 
                                param_distributions=params, 
                                n_iter=100, 
                                cv=5, 
                                verbose=3, 
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
knn_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


4069.22s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.23s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.23s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.24s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.24s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.26s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.27s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.28s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.28s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4069.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please

[CV 4/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.648 total time= 2.6min
[CV 3/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.742 total time= 2.6min
[CV 5/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.667 total time= 2.6min
[CV 2/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.649 total time= 2.6min
[CV 1/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=uniform;, score=0.661 total time= 2.6min
[CV 3/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=distance;, score=0.818 total time= 3.5min
[CV 4/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=distance;, score=0.717 total time= 3.6min
[CV 1/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=distance;, score=0.777 total time= 3.7min
[CV 2/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=distance;, score=0.720 total time= 3.8min
[CV 5/5] END algorit

7514.40s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 2/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.644 total time= 2.2min
[CV 5/5] END algorithm=kd_tree, leaf_size=20, n_neighbors=7, weights=uniform;, score=0.667 total time= 2.3min
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.714 total time= 2.5min
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.702 total time= 2.5min
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.759 total time= 2.4min
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.698 total time= 2.4min
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.684 total time= 2.4min


7655.96s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 2/5] END algorithm=brute, leaf_size=40, n_neighbors=9, weights=distance;, score=0.677 total time= 2.4min
[CV 1/5] END algorithm=brute, leaf_size=40, n_neighbors=9, weights=distance;, score=0.669 total time= 2.5min
[CV 3/5] END algorithm=brute, leaf_size=40, n_neighbors=9, weights=distance;, score=0.795 total time= 2.4min
[CV 4/5] END algorithm=brute, leaf_size=40, n_neighbors=9, weights=distance;, score=0.667 total time= 2.5min
[CV 5/5] END algorithm=brute, leaf_size=40, n_neighbors=9, weights=distance;, score=0.676 total time= 2.4min
[CV 2/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=4, weights=uniform;, score=0.689 total time= 2.3min
[CV 1/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=4, weights=uniform;, score=0.715 total time= 2.4min
[CV 3/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=4, weights=uniform;, score=0.776 total time= 2.3min
[CV 4/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=4, weights=uniform;, score=0.682 total time= 2.3min


7809.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 5/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=4, weights=uniform;, score=0.688 total time= 2.3min
[CV 3/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=uniform;, score=0.742 total time= 3.4min
[CV 1/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=uniform;, score=0.661 total time= 3.6min
[CV 4/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=uniform;, score=0.648 total time= 3.5min
[CV 2/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=uniform;, score=0.649 total time= 3.7min
[CV 5/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=6, weights=uniform;, score=0.667 total time= 3.8min
[CV 1/5] END algorithm=brute, leaf_size=20, n_neighbors=7, weights=distance;, score=0.716 total time= 2.4min
[CV 2/5] END algorithm=brute, leaf_size=20, n_neighbors=7, weights=distance;, score=0.715 total time= 2.5min
[CV 3/5] END algorithm=brute, leaf_size=20, n_neighbors=7, weights=distance;, score=0.802 total time= 2.5min
[CV 5/5] END alg

8772.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 1/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=6, weights=distance;, score=0.777 total time= 2.4min
[CV 2/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=6, weights=distance;, score=0.720 total time= 2.3min
[CV 3/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=6, weights=distance;, score=0.818 total time= 2.3min
[CV 4/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=6, weights=distance;, score=0.717 total time= 2.3min
[CV 5/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=6, weights=distance;, score=0.702 total time= 2.4min
[CV 1/5] END algorithm=brute, leaf_size=30, n_neighbors=9, weights=uniform;, score=0.596 total time= 2.4min
[CV 2/5] END algorithm=brute, leaf_size=30, n_neighbors=9, weights=uniform;, score=0.585 total time= 2.4min
[CV 3/5] END algorithm=brute, leaf_size=30, n_neighbors=9, weights=uniform;, score=0.720 total time= 2.4min
[CV 4/5] END algorithm=brute, leaf_size=30, n_neighbors=9, weights=uniform;, score=0.640 total time= 2.4min
[CV

9188.13s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
9192.13s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 4/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.682 total time= 2.2min
[CV 5/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=4, weights=uniform;, score=0.688 total time= 2.2min
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.661 total time= 2.4min
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.649 total time= 2.4min
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.742 total time= 2.4min
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.648 total time= 2.4min
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.667 total time= 2.4min
[CV 1/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=distance;, score=0.777 total time= 2.4min
[CV 2/5] END algorithm=brute, leaf_size=30, n_neighbors=6, weights=distance;, score=0.720 total time= 2.4min
[CV 3/5] END algorithm=

9978.99s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 4/5] END algorithm=kd_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.689 total time= 2.5min


9987.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 5/5] END algorithm=kd_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.685 total time= 2.7min
[CV 1/5] END algorithm=auto, leaf_size=20, n_neighbors=3, weights=uniform;, score=0.741 total time= 2.4min
[CV 2/5] END algorithm=auto, leaf_size=20, n_neighbors=3, weights=uniform;, score=0.715 total time= 2.4min
[CV 3/5] END algorithm=auto, leaf_size=20, n_neighbors=3, weights=uniform;, score=0.831 total time= 2.4min
[CV 4/5] END algorithm=auto, leaf_size=20, n_neighbors=3, weights=uniform;, score=0.731 total time= 2.4min
[CV 5/5] END algorithm=auto, leaf_size=20, n_neighbors=3, weights=uniform;, score=0.739 total time= 2.4min
[CV 1/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.714 total time= 2.3min
[CV 2/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.702 total time= 2.2min
[CV 3/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=5, weights=uniform;, score=0.759 total time= 2.3min
[CV 4/5] END algor

10463.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 1/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.794 total time= 2.4min
[CV 2/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.731 total time= 2.3min
[CV 3/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.816 total time= 2.3min
[CV 4/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.723 total time= 2.4min
[CV 5/5] END algorithm=ball_tree, leaf_size=30, n_neighbors=5, weights=distance;, score=0.716 total time= 2.3min
[CV 1/5] END algorithm=auto, leaf_size=30, n_neighbors=9, weights=distance;, score=0.669 total time= 2.4min
[CV 2/5] END algorithm=auto, leaf_size=30, n_neighbors=9, weights=distance;, score=0.677 total time= 2.4min
[CV 3/5] END algorithm=auto, leaf_size=30, n_neighbors=9, weights=distance;, score=0.795 total time= 2.4min


10582.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 4/5] END algorithm=auto, leaf_size=30, n_neighbors=9, weights=distance;, score=0.667 total time= 2.4min
[CV 5/5] END algorithm=auto, leaf_size=30, n_neighbors=9, weights=distance;, score=0.676 total time= 2.4min
[CV 2/5] END algorithm=auto, leaf_size=40, n_neighbors=3, weights=distance;, score=0.747 total time= 2.4min
[CV 1/5] END algorithm=auto, leaf_size=40, n_neighbors=3, weights=distance;, score=0.761 total time= 2.4min
[CV 3/5] END algorithm=auto, leaf_size=40, n_neighbors=3, weights=distance;, score=0.862 total time= 2.4min
[CV 4/5] END algorithm=auto, leaf_size=40, n_neighbors=3, weights=distance;, score=0.751 total time= 2.4min
[CV 5/5] END algorithm=auto, leaf_size=40, n_neighbors=3, weights=distance;, score=0.763 total time= 2.4min
[CV 1/5] END algorithm=brute, leaf_size=40, n_neighbors=4, weights=distance;, score=0.773 total time= 2.4min
[CV 2/5] END algorithm=brute, leaf_size=40, n_neighbors=4, weights=distance;, score=0.744 total time= 2.4min
[CV 4/5] END algorithm=bru

10908.26s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 2/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.715 total time= 2.3min
[CV 1/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.716 total time= 2.3min
[CV 3/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.802 total time= 2.3min
[CV 4/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.689 total time= 2.3min
[CV 5/5] END algorithm=ball_tree, leaf_size=10, n_neighbors=7, weights=distance;, score=0.685 total time= 2.3min
[CV 2/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.649 total time= 2.3min
[CV 1/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.661 total time= 2.3min
[CV 3/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.742 total time= 2.3min
[CV 4/5] END algorithm=ball_tree, leaf_size=20, n_neighbors=6, weights=uniform;, score=0.648 total 

11239.60s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV 2/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=9, weights=uniform;, score=0.585 total time= 3.9min
[CV 4/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=9, weights=distance;, score=0.667 total time= 2.0min
[CV 5/5] END algorithm=kd_tree, leaf_size=40, n_neighbors=9, weights=distance;, score=0.676 total time= 2.2min
[CV 5/5] END algorithm=kd_tree, leaf_size=5, n_neighbors=9, weights=uniform;, score=0.602 total time= 3.9min
[CV 2/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.649 total time= 2.3min
[CV 1/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.661 total time= 2.4min
[CV 3/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.742 total time= 2.4min
[CV 4/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.648 total time= 2.4min
[CV 5/5] END algorithm=ball_tree, leaf_size=40, n_neighbors=6, weights=uniform;, score=0.667 total time= 2.4min


In [98]:
# Get the best parameters
knn_random.best_params_

{'weights': 'distance', 'n_neighbors': 3, 'leaf_size': 30, 'algorithm': 'auto'}

In [99]:
# Get the best train performance
knn_random.best_score_

0.7769098369167072

In [100]:
# Get the performance metrics
knn_best_random = knn_random.best_estimator_
knn_y_pred = knn_best_random.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_precision = precision_score(y_test, knn_y_pred)
knn_recall = recall_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred)

print("KNN - Best Parameters:", knn_random.best_params_)
print("KNN - Accuracy:", knn_accuracy)
print("KNN - Precision:", knn_precision)
print("KNN - Recall:", knn_recall)
print("KNN - F1:", knn_f1)

KNN - Best Parameters: {'weights': 'distance', 'n_neighbors': 3, 'leaf_size': 30, 'algorithm': 'auto'}
KNN - Accuracy: 0.979586129753915
KNN - Precision: 0.8506493506493507
KNN - Recall: 0.7237569060773481
KNN - F1: 0.782089552238806


###LSTM###

Without Tuning:

In [42]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow_addons as tfa
from tensorflow.keras.metrics import Recall, Precision

# Convert DataFrame and Series to NumPy arrays
X_train_np = X_train.to_numpy()
y_train_np = np.array(y_train)

# Reshape the train data for LSTM
X_train_np = np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))

# Reshpae the test data for LSTM
X_test_np = np.reshape(X_test.to_numpy(), (X_test.to_numpy().shape[0], 1, X_test.to_numpy().shape[1]))

model = Sequential()
model.add(LSTM(64, return_sequences=True))
model.add(Dense(32, activation='tanh'))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(16,))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Recall(), Precision(), 'accuracy'])

# Training the model
model.fit(X_train_np, y_train_np, epochs=10, batch_size=64)

# Generate probabilities
y_pred_prob = model.predict(X_test_np, verbose=0)

# Convert probabilities into class labels
y_pred_lstm = (y_pred_prob > 0.5).astype("int32")

lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
lstm_precision = precision_score(y_test, y_pred_lstm)
lstm_recall = recall_score(y_test, y_pred_lstm)
lstm_f1 = f1_score(y_test, y_pred_lstm)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
# Get the performance metrics
print("LSTM - Accuracy:", lstm_accuracy)
print("LSTM - Precision:", lstm_precision)
print("LSTM - Recall:", lstm_recall)
print("LSTM - F1:", lstm_f1)

LSTM - Accuracy: 0.9532997762863534
LSTM - Precision: 0.5236486486486487
LSTM - Recall: 0.856353591160221
LSTM - F1: 0.649895178197065


With Tuning:

In [95]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow_addons as tfa
from tensorflow.keras.metrics import Recall, Precision

# Convert DataFrame and Series to NumPy arrays
X_train_np = X_train.to_numpy()
y_train_np = np.array(y_train)

# Reshape the train data for LSTM
X_train_np = np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))

# Reshpae the test data for LSTM
X_test_np = np.reshape(X_test.to_numpy(), (X_test.to_numpy().shape[0], 1, X_test.to_numpy().shape[1]))

model = Sequential()
model.add(LSTM(256, return_sequences=True))
model.add(Dense(64, activation='tanh'))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(16,))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Recall(), Precision(), 'accuracy'])

# Training the model
model.fit(X_train_np, y_train_np, epochs=30, batch_size=64)

# Generate probabilities
y_pred_prob = model.predict(X_test_np, verbose=0)

# Convert probabilities into class labels
y_pred_lstm = (y_pred_prob > 0.5).astype("int32")

lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
lstm_precision = precision_score(y_test, y_pred_lstm)
lstm_recall = recall_score(y_test, y_pred_lstm)
lstm_f1 = f1_score(y_test, y_pred_lstm)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [52]:
# Get the performance metrics
print("LSTM - Accuracy:", lstm_accuracy)
print("LSTM - Precision:", lstm_precision)
print("LSTM - Recall:", lstm_recall)
print("LSTM - F1:", lstm_f1)

LSTM - Accuracy: 0.9823825503355704
LSTM - Precision: 0.8277777777777777
LSTM - Recall: 0.8232044198895028
LSTM - F1: 0.8254847645429362
