In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#from google.colab import drive
#drive.mount('/content/drive')
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/train_non_embedded_w_jobid.csv', index_col=None)

In [4]:
test = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/test_non_embedded_w_jobid.csv', index_col=None)

In [4]:
text_data = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/cleaned_data.csv', index_col=None)

In [7]:
text_data.set_index('job_id', inplace = True)

<h1>Generate BERT embeddings

In [11]:
from transformers import BertModel, BertTokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

import torch

# Generate BERT embeddings
def generate_bert_word_embeddings(text):
  input_ids = tokenizer.encode(test, return_tensors='pt')
  with torch.no_grad():
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    column_mean = last_hidden_states.mean()
    output = []
    for i in range(768):
      output.append(column_mean[i])
    return output

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
benefits_embeddings = []
description_embeddings = []
profile_embeddings = []
requirement_embeddings = []
for i in range(len(text_data)):
    benefit = text_data.iloc[i]['benefits']
    description = text_data.iloc[i]['description']
    profile = text_data.iloc[i]['company_profile']
    requirement = text_data.iloc[i]['requirements']
    benefit_embedding = generate_bert_word_embeddings(benefit)
    description_embedding = generate_bert_word_embeddings(description)
    profile_embedding = generate_bert_word_embeddings(profile)
    requirement_embedding = generate_bert_word_embeddings(requirement)
    benefits_embeddings.append(benefit_embedding)
    description_embeddings.append(description_embedding)
    profile_embeddings.append(profile_embedding)
    requirement_embeddings.append(requirement_embedding)

In [None]:
train_benefits = []
train_descriptions = []
train_profiles = []
train_requirements = []
for id in train['job_id']:
  benefit = benefits_embeddings[i-1]
  description = description_embeddings[i-1]
  profile = profile_embeddings[i-1]
  requirement = requirement_embeddings[i-1]
  train_benefits.append(benefit)
  train_descriptions.append(description)
  train_profiles.append(profile)
  train_requirements.append(requirement)

new_train = pd.concat([train, train_benefits], axis = 1)
new_train = pd.concat([new_train, train_descriptions], axis = 1)
new_train = pd.concat([new_train, train_profiles], axis = 1)
new_train = pd.concat([new_train, train_requirements], axis = 1)

In [None]:
test_benefits = []
test_descriptions = []
test_profiles = []
test_requirements = []
for id in test'job_id']:
  benefit = benefits_embeddings[i-1]
  description = description_embeddings[i-1]
  profile = profile_embeddings[i-1]
  requirement = requirement_embeddings[i-1]
  test_benefits.append(benefit)
  test_descriptions.append(description)
  test_profiles.append(profile)
  test_requirements.append(requirement)

new_test = pd.concat([test, test_benefits], axis = 1)
new_test = pd.concat([new_test, test_descriptions], axis = 1)
new_test = pd.concat([new_test, test_profiles], axis = 1)
new_test = pd.concat([new_test, test_requirements], axis = 1)

In [None]:
new_train.to_csv('/content/drive/My Drive/BT4012 Group 06/Data/train_bert_embedding_w_jobid.csv', index = False)
new_test.to_csv('/content/drive/My Drive/BT4012 Group 06/Data/test_bert_embedding_w_jobid.csv', index = False)

<h1>Train the model with embedding features

In [5]:
new_train = pd.read_csv('train_bert_embedding_w_jobid.csv')
new_test = pd.read_csv('test_bert_embedding_w_jobid.csv')

In [6]:
new_X_train = new_train.drop(['job_id','fraudulent'],axis = 1)
new_y_train = new_train[['fraudulent']]
new_X_test = new_test.drop(['job_id', 'fraudulent'],axis = 1)
new_y_test = new_test[['fraudulent']]

In [None]:
# Calculate the correlation between each feature and the target
correlation_with_target = new_X_train.apply(lambda col: col.corr(new_y_train['fraudulent']))
correlation_df = pd.DataFrame(list(correlation_with_target.items()), columns=['Feature', 'Correlation'])
correlation_df['abs_correlation'] = correlation_df['Correlation'].abs()
# Sort the DataFrame by correlation values in descending order
correlation_df = correlation_df.sort_values(by='abs_correlation', ascending=False)

In [None]:
# save a copy of non_embedded features with job_id and target value, both train and test
df_combined = new_train.iloc[:,:76].append(new_test.iloc[:,:76]).sort_values(by='job_id')
df_combined.reset_index(drop=True, inplace=True)

In [None]:
df_combined.to_csv('/content/drive/My Drive/BT4012 Group 06/Data/embedded_features_non_split.csv', index = False)

<h1>Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
model = LogisticRegression()
model.fit(new_X_train, new_y_train)
y_pred = model.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8808724832214765
Precision: 0.29549248747913187
Recall: 0.9779005524861878
F1: 0.45384615384615384


In [None]:
coefficients = model.coef_
intercept = model.intercept_
feature_names = new_X_train.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients[0]})
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False)



In [None]:
abs_coeff_df = coefficients_df
abs_coeff_df['Absolute Coefficient'] = abs(abs_coeff_df['Coefficient'])
abs_coeff_df = abs_coeff_df.sort_values(by='Absolute Coefficient', ascending=False)

print(abs_coeff_df[:30])

                     Feature  Coefficient  Absolute Coefficient
8     department_mean_target     6.516872              6.516872
60             country_ratio     1.988857              1.988857
913            description72    -1.583659              1.583659
1           has_company_logo    -1.497235              1.497235
2175              profile566     1.242869              1.242869
1169          description328     1.182845              1.182845
1693               profile84    -1.144260              1.144260
899            description58     1.065375              1.065375
1559          description718    -1.058637              1.058637
1204          description363     1.041430              1.041430
2048              profile439     1.037066              1.037066
407               benefit334     1.016867              1.016867
991           description150     0.983489              0.983489
1368          description527     0.969773              0.969773
2073              profile464     0.94137

<h1>Logistic Regression with Hyperparameter Tuning

In [None]:
# Randomized search for best hyperparameters

# Create the model
lr = LogisticRegression()

# Create the random grid
params = {'penalty': ['l1', 'l2', 'elasticnet', None],
          'C': [0.01, 0.1, 1, 10, 100],
          'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
          'max_iter': [100, 200, 300, 400, 500]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
lr_random = RandomizedSearchCV(estimator=lr,
                               param_distributions=params,
                               n_iter=100,
                               cv=5,
                               verbose=2,
                               random_state=61,
                               n_jobs=-1,
                               scoring='f1')

# Fit the random search model
lr_random.fit(new_X_train, new_y_train)
print(lr_random.best_params_)
print(lr_random.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [None]:
lr_best_random = lr_random.best_estimator_
lr_y_pred = lr_best_random.predict(new_X_test)
lr_accuracy = accuracy_score(new_y_test, lr_y_pred)
lr_precision = precision_score(new_y_test, lr_y_pred)
lr_recall = recall_score(new_y_test, lr_y_pred)
lr_f1 = f1_score(new_y_test, lr_y_pred)
print("Logistic Regression - Best Parameters:", lr_random.best_params_)
print("Logistic Regression - Accuracy:", lr_accuracy)
print("Logistic Regression - Precision:", lr_precision)
print("Logistic Regression - Recall:", lr_recall)
print("Logistic Regression - F1:", lr_f1)

<h1>SVM

Linear SVM

In [None]:
from sklearn.svm import SVC

In [None]:
model = SVC(kernel='linear', C=1.0)
model.fit(new_X_train, new_y_train)
y_pred = model.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9818232662192393
Precision: 0.9027777777777778
Recall: 0.7182320441988951
F1: 0.8


In [None]:
coefficients = model.coef_
intercept = model.intercept_
feature_names = new_X_train.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients[0]})
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False)
abs_coeff_df = coefficients_df
abs_coeff_df['Absolute Coefficient'] = abs(abs_coeff_df['Coefficient'])
abs_coeff_df = abs_coeff_df.sort_values(by='Absolute Coefficient', ascending=False)

print(abs_coeff_df[:30])

                     Feature  Coefficient  Absolute Coefficient
8     department_mean_target     3.305469              3.305469
60             country_ratio     2.335733              2.335733
913            description72    -1.466913              1.466913
890            description49    -1.289334              1.289334
991           description150     1.261346              1.261346
1011          description170     1.038046              1.038046
1169          description328     1.013306              1.013306
941           description100    -1.005545              1.005545
1204          description363     0.997754              0.997754
1362          description521    -0.992338              0.992338
1330          description489     0.985629              0.985629
1480          description639     0.931881              0.931881
1368          description527     0.906241              0.906241
1482          description641     0.890448              0.890448
1270          description429     0.88690

Polynomial SVM

In [None]:
model = SVC(kernel='poly', C=1.0)
model.fit(new_X_train, new_y_train)
y_pred = model.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.95917225950783
Precision: 1.0
Recall: 0.19337016574585636
F1: 0.32407407407407407


In [None]:
support_vectors = model.support_vectors_
dual_coefficients = model.dual_coef_
max_dual_coefficient = np.max(model.dual_coef_)
max_dual_coefficient_index = np.argmax(model.dual_coef_)
corresponding_support_vector = model.support_vectors_[max_dual_coefficient_index]
abs_corresponding_support_vector = np.abs(corresponding_support_vector)
sorted_indices = np.argsort(abs_corresponding_support_vector, axis=None)[::-1]
top_30_columns = new_X_train.columns[sorted_indices[:30]]
top_30_coefficient= corresponding_support_vector[sorted_indices[:30]]
top_30_features = pd.DataFrame({'Features': top_30_columns, 'abs_coefficients': top_30_coefficient})
print(top_30_features)


                         Features  abs_coefficients
0                  description309         -4.446088
1                  requirement309         -3.657734
2                      profile181         -2.767094
3                      benefit181         -2.767094
4                      benefit309         -2.166787
5                      profile309         -2.166787
6                      profile753         -1.997128
7                      benefit753         -1.997128
8                      benefit721         -1.192218
9                      profile721         -1.192218
10                     profile709          1.059671
11                     benefit709          1.059671
12                     profile668         -1.051562
13                     benefit668         -1.051562
14                 description605          1.034723
15         adjs_vs_words_benefits          1.000000
16  adjs_vs_words_company_profile          1.000000
17     adjs_vs_words_requirements          1.000000
18      empl

sigmoid SVM

In [None]:
model = SVC(kernel='sigmoid', C=1.0)
model.fit(new_X_train, new_y_train)
y_pred = model.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9513422818791947
Precision: 0.5411764705882353
Recall: 0.2541436464088398
F1: 0.3458646616541354


In [None]:
support_vectors = model.support_vectors_
dual_coefficients = model.dual_coef_
max_dual_coefficient = np.max(model.dual_coef_)
max_dual_coefficient_index = np.argmax(model.dual_coef_)
corresponding_support_vector = model.support_vectors_[max_dual_coefficient_index]
abs_corresponding_support_vector = np.abs(corresponding_support_vector)
sorted_indices = np.argsort(abs_corresponding_support_vector, axis=None)[::-1]
top_30_columns = new_X_train.columns[sorted_indices[:30]]
top_30_coefficient= corresponding_support_vector[sorted_indices[:30]]
top_30_features = pd.DataFrame({'Features': top_30_columns, 'abs_coefficients': top_30_coefficient})
print(top_30_features)


                       Features  abs_coefficients
0                    profile309         -3.754920
1                    benefit309         -3.702072
2                requirement309         -3.657734
3                description309         -3.465783
4                    benefit540         -1.127818
5            country_fraudulent          1.000000
6              has_company_logo          1.000000
7        department_mean_target          1.000000
8                 has_questions          1.000000
9     employment_type_Full-time          1.000000
10         country_total_number          1.000000
11               description266         -0.996184
12                   benefit353          0.992599
13      sentiment score_profile          0.977100
14     sentiment score_benefits          0.963300
15  sentiment score_description          0.943200
16                   profile605          0.936106
17  sentiment score_requirement          0.930000
18               requirement173          0.904001


<h1>SVM with Hyperparameter Tuning

In [None]:
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(new_X_train, new_y_train)
tuned_svm = SVC(C=grid_search.best_params_['C'], kernel=grid_search.best_params_['kernel'], gamma=grid_search.best_params_['gamma'])
tuned_svm.fit(new_X_train, new_y_train)
tuned_y_pred = tuned_svm.predict(new_X_test)
accuracy = accuracy_score(new_y_test, tuned_y_pred)
precision = precision_score(new_y_test, tuned_y_pred)
recall = recall_score(new_y_test, tuned_y_pred)
f1 = f1_score(new_y_test, tuned_y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9586129753914989
Precision: 1.0
Recall: 0.18232044198895028
F1: 0.308411214953271


<h1>Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(new_X_train, new_y_train)
y_pred = model.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

In [None]:
# Create the model
rf = RandomForestClassifier(n_estimators = 300, max_features = 'sqrt', criterion = 'gini', min_samples_leaf = 1, bootstrap = False)

# Create the random grid
params = {'max_depth': [None, 20, 50, 100, 200, 300],
          'min_samples_split': [2, 3, 5, 8, 10]}

# Random search of parameters, using 5 fold cross validation,
# search across 30 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=params,
                               n_iter=30,
                               cv=5,
                               verbose=3,
                               random_state=61,
                               n_jobs=-1,
                               scoring='f1')

# Fit the random search model
rf_random.fit(new_X_train, new_y_train)

<h1>LightGBM

In [None]:
classifier_lgbm = LGBMClassifier(random_state=0,objective='binary')
# train
classifier_lgbm.fit(new_X_train, new_y_train)
# predict
y_pred_lgbm = classifier_lgbm.predict(new_X_test)

[LightGBM] [Info] Number of positive: 685, number of negative: 13619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 595444
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 2378
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802


In [None]:
accuracy = accuracy_score(new_y_test, y_pred_lgbm)
precision = precision_score(new_y_test, y_pred_lgbm)
recall = recall_score(new_y_test, y_pred_lgbm)
f1 = f1_score(new_y_test, y_pred_lgbm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9823825503355704
Precision: 0.9758064516129032
Recall: 0.6685082872928176
F1: 0.7934426229508196


<h1>LightGDM with Hyperparameter Tuning

In [None]:
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# Define parameter distribution for LightGBM
param_dist_lgbm = {
    'num_leaves': randint(20, 100),
    'max_depth': [-1, 10, 20, 30],
    'learning_rate': uniform(0.01, 0.5),
    'n_estimators': randint(50, 200)
}

# Create LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

# Perform RandomizedSearchCV for LightGBM
random_search_lgbm = RandomizedSearchCV(lgbm_model, param_distributions=param_dist_lgbm, n_iter=20,
                                        cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_lgbm.fit(new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_lgbm = random_search_lgbm.best_params_
best_estimator_lgbm = random_search_lgbm.best_estimator_

# Use the best estimator to predict on the test set
y_pred_lgbm = best_estimator_lgbm.predict(new_X_test)

# Calculate evaluation metrics
accuracy_lgbm = accuracy_score(new_y_test, y_pred_lgbm)
precision_lgbm = precision_score(new_y_test, y_pred_lgbm)
recall_lgbm = recall_score(new_y_test, y_pred_lgbm)
f1_lgbm = f1_score(new_y_test, y_pred_lgbm)

# Print evaluation metrics for LightGBM
print("------- LightGBM Evaluation Metrics -------")
print("Best Parameters:", best_params_lgbm)
print("Accuracy:", accuracy_lgbm)
print("Precision:", precision_lgbm)
print("Recall:", recall_lgbm)
print("F1 Score:", f1_lgbm)
print("-----------------------------------------\n")

[LightGBM] [Info] Number of positive: 685, number of negative: 13619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.069115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 595444
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 2378
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802
------- LightGBM Evaluation Metrics -------
Best Parameters: {'learning_rate': 0.2396244459829336, 'max_depth': -1, 'n_estimators': 149, 'num_leaves': 43}
Accuracy: 0.9823825503355704
Precision: 0.9836065573770492
Recall: 0.6629834254143646
F1 Score: 0.7920792079207921
-----------------------------------------



<h1>GaussianNB

In [None]:
classifier_gnb = GaussianNB()
# train
classifier_gnb.fit(new_X_train, new_y_train)
# predict
y_pred_gnb = classifier_gnb.predict(new_X_test)

In [None]:
accuracy = accuracy_score(new_y_test, y_pred_gnb)
precision = precision_score(new_y_test, y_pred_gnb)
recall = recall_score(new_y_test, y_pred_gnb)
f1 = f1_score(new_y_test, y_pred_gnb)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8375279642058165
Precision: 0.21014492753623187
Recall: 0.8011049723756906
F1: 0.3329506314580941


<h1>Gaussian NB with Hyperparater Tuning

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
gnb = GaussianNB()

# Create the random grid
params = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'priors': [None, [0.5, 0.5], [0.25, 0.75], [0.75, 0.25], [0.1, 0.9], [0.9, 0.1], [0.01, 0.99], [0.99, 0.01]]}

# Random search of parameters, using 5 fold cross validation,
# search across 200 different combinations, and use all available cores
gnb_random = RandomizedSearchCV(estimator=gnb,
                                param_distributions=params,
                                n_iter=200,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
gnb_random.fit(new_X_train, new_y_train)

<h1> LSTM

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Convert DataFrame to NumPy array and reshape
X_train_reshaped = new_X_train.values.reshape(new_X_train.shape[0], 1, new_X_train.shape[1])
X_test_reshaped = new_X_test.values.reshape(new_X_test.shape[0], 1, new_X_test.shape[1])
model = Sequential([
    LSTM(50, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dense(1, activation='sigmoid')
])

# Compile the model (adjust loss and optimizer as needed)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit your model to the training data
model.fit(X_train_reshaped, new_y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, new_y_test))


2023-11-22 16:45:23.265339: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-22 16:45:23.292599: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-22 16:45:23.292623: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-22 16:45:23.292638: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-22 16:45:23.297979: I tensorflow/core/platform/cpu_feature_g

Epoch 1/10


2023-11-22 16:45:28.845963: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8902


  1/447 [..............................] - ETA: 23:10 - loss: 0.7163 - accuracy: 0.2188

2023-11-22 16:45:30.761255: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f5064460fd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-22 16:45:30.761274: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2023-11-22 16:45:30.761277: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2023-11-22 16:45:30.768182: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-22 16:45:30.864103: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f524a4922c0>

In [None]:
# Predict using the trained model
y_pred_prob = model.predict(X_test_reshaped)

# Convert probabilities to binary classes using a threshold (e.g., 0.5)
threshold = 0.5
y_pred_lstm = (y_pred_prob > threshold).astype(int)

# Calculate evaluation metrics
accuracy_lstm = accuracy_score(new_y_test, y_pred_lstm)
precision_lstm = precision_score(new_y_test, y_pred_lstm)
recall_lstm = recall_score(new_y_test, y_pred_lstm)
f1_lstm = f1_score(new_y_test, y_pred_lstm)

# Print the evaluation metrics
print("LSTM Model Metrics:")
print("Accuracy:", accuracy_lstm)
print("Precision:", precision_lstm)
print("Recall:", recall_lstm)
print("F1 Score:", f1_lstm)

LSTM Model Metrics:
Accuracy: 0.9837807606263982
Precision: 0.896774193548387
Recall: 0.7679558011049724
F1 Score: 0.8273809523809523


<h1> LSTM with Hyperparameter Tuning

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow_addons as tfa
from tensorflow.keras.metrics import Recall, Precision

# Convert DataFrame and Series to NumPy arrays
X_train_np = new_X_train.to_numpy()
y_train_np = np.array(new_y_train)

# Reshape the train data for LSTM
X_train_np = np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))

# Reshpae the test data for LSTM
X_test_np = np.reshape(new_X_test.to_numpy(), (new_X_test.to_numpy().shape[0], 1, new_X_test.to_numpy().shape[1]))

model = Sequential()
model.add(LSTM(256, return_sequences=True))
model.add(Dense(64, activation='tanh'))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(16,))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Recall(), Precision(), 'accuracy'])

# Training the model
model.fit(X_train_np, y_train_np, epochs=30, batch_size=64)

# Generate probabilities
y_pred_prob = model.predict(X_test_np, verbose=0)

# Convert probabilities into class labels
y_pred_lstm = (y_pred_prob > 0.5).astype("int32")

lstm_accuracy = accuracy_score(new_y_test, y_pred_lstm)
lstm_precision = precision_score(new_y_test, y_pred_lstm)
lstm_recall = recall_score(new_y_test, y_pred_lstm)
lstm_f1 = f1_score(new_y_test, y_pred_lstm)
print("LSTM - Accuracy: 0.9532997762863534")
print("LSTM - Precision: 0.5236486486486487")
print("LSTM - Recall: 0.856353591160221")
print("LSTM - F1: 0.649895178197065")

LSTM - Accuracy: 0.9532997762863534
LSTM - Precision: 0.5236486486486487
LSTM - Recall: 0.856353591160221
LSTM - F1: 0.649895178197065


<h1> Rare Event Logistic

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create the logistic regression model with class_weight='balanced'
logistic_model = LogisticRegression(class_weight='balanced')

# Fit the model on the training data
logistic_model.fit(new_X_train, new_y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(new_X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(new_y_test, y_pred_logistic)
precision = precision_score(new_y_test, y_pred_logistic)
recall = recall_score(new_y_test, y_pred_logistic)
f1 = f1_score(new_y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7980984340044742
Precision: 0.1997780244173141
Recall: 0.994475138121547
F1 Score: 0.33271719038817005


<h1>Rare Event Logistic

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create the logistic regression model with a higher penalty for misclassifying the minority class
logistic_model = LogisticRegression(class_weight='balanced', solver='liblinear')

# Fit the model on the training data
logistic_model.fit(new_X_train, new_y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(new_X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(new_y_test, y_pred_logistic)
precision = precision_score(new_y_test, y_pred_logistic)
recall = recall_score(new_y_test, y_pred_logistic)
f1 = f1_score(new_y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.7217561521252797
Precision: 0.15332197614991483
Recall: 0.994475138121547
F1 Score: 0.2656826568265683


<h1>XGBoost

In [None]:
import xgboost as xgb

# Define your XGBoost model
xgb_model = xgb.XGBClassifier()

# Train the model using your data
xgb_model.fit(new_X_train, new_y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(new_X_test)

# Calculate evaluation metrics
accuracy_xgb = accuracy_score(new_y_test, y_pred_xgb)
precision_xgb = precision_score(new_y_test, y_pred_xgb)
recall_xgb = recall_score(new_y_test, y_pred_xgb)
f1_xgb = f1_score(new_y_test, y_pred_xgb)

# Print the evaluation metrics
print("XGBoost Model Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1 Score:", f1_xgb)

XGBoost Model Metrics:
Accuracy: 0.9818232662192393
Precision: 0.967741935483871
Recall: 0.6629834254143646
F1 Score: 0.7868852459016392


<h1>XGBoost with Hyperparameter Tuning

In [None]:
from xgboost import XGBClassifier
from scipy.stats import randint, uniform


# Create XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Define parameter distribution for XGBoost
param_dist_xgb = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.5),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

# Perform RandomizedSearchCV for XGBoost
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions=param_dist_xgb, n_iter=20,
                                       cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_xgb.fit(new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_xgb = random_search_xgb.best_params_
best_estimator_xgb = random_search_xgb.best_estimator_

# Use the best estimator to predict on the test set
y_pred_xgb = best_estimator_xgb.predict(new_X_test)

# Calculate evaluation metrics
accuracy_xgb = accuracy_score(new_y_test, y_pred_xgb)
precision_xgb = precision_score(new_y_test, y_pred_xgb)
recall_xgb = recall_score(new_y_test, y_pred_xgb)
f1_xgb = f1_score(new_y_test, y_pred_xgb)

# Print evaluation metrics for XGBoost
print("------- XGBoost Evaluation Metrics -------")
print("Best Parameters:", best_params_xgb)
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1 Score:", f1_xgb)
print("-----------------------------------------\n")


------- XGBoost Evaluation Metrics -------
Best Parameters: {'colsample_bytree': 0.6872700594236812, 'learning_rate': 0.4853571532049581, 'max_depth': 5, 'n_estimators': 121, 'subsample': 0.7993292420985183}
Accuracy: 0.9821029082774049
Precision: 0.968
Recall: 0.6685082872928176
F1 Score: 0.7908496732026142
-----------------------------------------



<h1>K Nearest Neightbors

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(new_X_train, new_y_train)

# Predictions
y_pred_knn = knn_classifier.predict(new_X_test)
accuracy = accuracy_score(new_y_test, y_pred_knn)
precision = precision_score(new_y_test, y_pred_knn)
recall = recall_score(new_y_test, y_pred_knn)
f1 = f1_score(new_y_test, y_pred_knn)
print("KNN - Accuracy:", accuracy)
print("KNN - Precision:", precision)
print("KNN - Recall:", recall)
print("KNN - F1:", f1)

KNN - Accuracy: 0.9798657718120806
KNN - Precision: 0.8707482993197279
KNN - Recall: 0.7071823204419889
KNN - F1: 0.7804878048780487


<h1> K nearest neightbours with Hyperparameter Tuning

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
knn = KNeighborsClassifier(p=1)

# Create the random grid
params = {'n_neighbors': [3, 4, 5, 6, 7, 9],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size': [5, 10, 20, 30, 40]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
knn_random = RandomizedSearchCV(estimator=knn,
                                param_distributions=params,
                                n_iter=100,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
knn_random.fit(new_X_train, new_y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


# Model Training with Only Important Features

In [None]:
new_X_train

Unnamed: 0,telecommuting,has_company_logo,has_questions,sentiment score_profile,sentiment score_requirement,sentiment score_benefits,sentiment score_description,department_frequency,department_mean_target,num_of_nouns_company_profile,...,requirement759,requirement760,requirement761,requirement762,requirement763,requirement764,requirement765,requirement766,requirement767,requirement768
0,0,1,0,0.9313,0.9929,0.9716,0.9951,0.000280,0.000000,0.185841,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
1,0,1,0,0.9618,0.9260,0.0000,0.9509,0.675685,0.044594,0.132743,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
2,0,1,1,0.9913,0.4019,0.3818,0.9426,0.675685,0.044594,0.207965,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
3,0,1,1,0.9620,0.9657,0.9081,0.5719,0.000070,0.000000,0.150442,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
4,0,1,1,0.9753,0.7391,0.9671,0.9974,0.675685,0.044594,0.194690,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14299,0,0,0,0.0000,0.0000,0.0000,0.8176,0.675685,0.044594,0.000000,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
14300,0,1,1,0.9975,0.9931,0.9937,0.9920,0.675685,0.044594,0.300885,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
14301,0,1,1,0.9831,0.9652,-0.4453,0.9652,0.001678,0.000000,0.340708,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869
14302,0,0,0,0.4019,0.4019,0.0000,0.9750,0.675685,0.044594,0.026549,...,-0.167115,-0.206595,0.087953,-0.391113,-0.06382,-0.088465,-0.131694,-0.234339,0.194257,-0.060869


In [10]:
# Calculate the correlation between each feature and the target
correlation_embedded_df = pd.DataFrame(list(new_X_train.apply(lambda col: col.corr(new_y_train['fraudulent'])).items()), columns=['Feature', 'Correlation'])
correlation_embedded_df['abs_correlation'] = correlation_embedded_df['Correlation'].abs()
# Sort the DataFrame by correlation values in descending order
correlation_embedded_df = correlation_embedded_df.sort_values(by='abs_correlation', ascending=False)

In [11]:
# choose the features with correlation > 0.1
correlation_embedded_df[correlation_embedded_df['abs_correlation'] > 0.1]

Unnamed: 0,Feature,Correlation,abs_correlation
8,department_mean_target,0.477420,0.477420
2060,profile451,-0.277849,0.277849
1790,profile181,-0.276605,0.276605
2284,profile675,-0.276272,0.276272
2106,profile497,0.275570,0.275570
...,...,...,...
2231,profile622,0.100884,0.100884
2339,profile730,0.100825,0.100825
1124,description283,0.100740,0.100740
1613,profile4,0.100454,0.100454


In [12]:
# select top features
top_features = correlation_embedded_df[correlation_embedded_df['abs_correlation'] > 0.1]['Feature']

In [13]:
selected_new_X_train = new_X_train[top_features]
selected_new_X_test = new_X_test[top_features]

<h1> Logistic Regression

In [None]:
model = LogisticRegression()
model.fit(selected_new_X_train, new_y_train)
y_pred = model.predict(selected_new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9644854586129754
Precision: 0.8857142857142857
Recall: 0.3425414364640884
F1: 0.4940239043824702


<h1> Rare Event Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create the logistic regression model with class_weight='balanced'
logistic_model = LogisticRegression(class_weight='balanced')

# Fit the model on the training data
logistic_model.fit(selected_new_X_train, new_y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(new_y_test, y_pred_logistic)
precision = precision_score(new_y_test, y_pred_logistic)
recall = recall_score(new_y_test, y_pred_logistic)
f1 = f1_score(new_y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9342841163310962
Precision: 0.43112244897959184
Recall: 0.9337016574585635
F1 Score: 0.5898778359511344


<h1> SVM

Linear SVM

In [None]:
from sklearn.svm import SVC

In [None]:
model = SVC(kernel='linear', C=1.0)
model.fit(selected_new_X_train, new_y_train)
y_pred = model.predict(selected_new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9658836689038032
Precision: 0.927536231884058
Recall: 0.35359116022099446
F1: 0.5119999999999999


<h1> Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
selected_new_X_train = new_X_train[top_features]
selected_new_X_test = new_X_test[top_features]
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(selected_new_X_train, new_y_train)
y_pred = model.predict(selected_new_X_test)
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9804250559284117
Precision: 1.0
Recall: 0.6132596685082873
F1: 0.7602739726027398


<h1> Random Forest with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification


# Define the parameters grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Instantiate GridSearchCV with the RandomForestClassifier and parameter grid
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the model on the training data
grid_search.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator to predict on the test set
y_pred = best_estimator.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(new_y_test, y_pred)
precision = precision_score(new_y_test, y_pred)
recall = recall_score(new_y_test, y_pred)
f1 = f1_score(new_y_test, y_pred)

# Print evaluation metrics
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9807046979865772
Precision: 0.9912280701754386
Recall: 0.6243093922651933
F1: 0.7661016949152543


<h1> GaussianNB

In [None]:
classifier_gnb = GaussianNB()
# train
classifier_gnb.fit(selected_new_X_train, new_y_train)
# predict
y_pred_gnb = classifier_gnb.predict(selected_new_X_test)

accuracy = accuracy_score(new_y_test, y_pred_gnb)
precision = precision_score(new_y_test, y_pred_gnb)
recall = recall_score(new_y_test, y_pred_gnb)
f1 = f1_score(new_y_test, y_pred_gnb)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8294183445190156
Precision: 0.19313304721030042
Recall: 0.7458563535911602
F1: 0.30681818181818177


<h1> LSTM

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Convert DataFrame to NumPy array and reshape
selected_X_train_reshaped = selected_new_X_train.values.reshape(selected_new_X_train.shape[0], 1, selected_new_X_train.shape[1])
selected_X_test_reshaped = selected_new_X_test.values.reshape(selected_new_X_test.shape[0], 1, selected_new_X_test.shape[1])
model = Sequential([
    LSTM(50, input_shape=(selected_X_train_reshaped.shape[1], selected_X_train_reshaped.shape[2])),
    Dense(1, activation='sigmoid')
])

# Compile the model (adjust loss and optimizer as needed)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit your model to the training data
model.fit(selected_X_train_reshaped, new_y_train, epochs=10, batch_size=32, validation_data=(selected_X_test_reshaped, new_y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c0ee16b3c70>

In [None]:
# Predict using the trained model
y_pred_prob = model.predict(selected_X_test_reshaped)

# Convert probabilities to binary classes using a threshold (e.g., 0.5)
threshold = 0.5
y_pred_lstm = (y_pred_prob > threshold).astype(int)

# Calculate evaluation metrics
accuracy_lstm = accuracy_score(new_y_test, y_pred_lstm)
precision_lstm = precision_score(new_y_test, y_pred_lstm)
recall_lstm = recall_score(new_y_test, y_pred_lstm)
f1_lstm = f1_score(new_y_test, y_pred_lstm)

# Print the evaluation metrics
print("LSTM Model Metrics:")
print("Accuracy:", accuracy_lstm)
print("Precision:", precision_lstm)
print("Recall:", recall_lstm)
print("F1 Score:", f1_lstm)

LSTM Model Metrics:
Accuracy: 0.9667225950782998
Precision: 0.8974358974358975
Recall: 0.3867403314917127
F1 Score: 0.5405405405405406


<h1> LSTM with Hyperparameter Tuning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow_addons as tfa
from tensorflow.keras.metrics import Recall, Precision

# Convert DataFrame and Series to NumPy arrays
X_train_np = selected_new_X_train.to_numpy()
y_train_np = np.array(new_y_train)

# Reshape the train data for LSTM
X_train_np = np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))

# Reshpae the test data for LSTM
X_test_np = np.reshape(selected_new_X_test.to_numpy(), (new_X_test.to_numpy().shape[0], 1, new_X_test.to_numpy().shape[1]))

model = Sequential()
model.add(LSTM(256, return_sequences=True))
model.add(Dense(64, activation='tanh'))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(16,))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Recall(), Precision(), 'accuracy'])

# Training the model
model.fit(X_train_np, y_train_np, epochs=30, batch_size=64)

# Generate probabilities
y_pred_prob = model.predict(X_test_np, verbose=0)

# Convert probabilities into class labels
y_pred_lstm = (y_pred_prob > 0.5).astype("int32")

lstm_accuracy = accuracy_score(new_y_test, y_pred_lstm)
lstm_precision = precision_score(new_y_test, y_pred_lstm)
lstm_recall = recall_score(new_y_test, y_pred_lstm)
lstm_f1 = f1_score(new_y_test, y_pred_lstm)

# Print the evaluation metrics
print("LSTM Model Metrics:")
print("Accuracy:", accuracy_lstm)
print("Precision:", precision_lstm)
print("Recall:", recall_lstm)
print("F1 Score:", f1_lstm)

Epoch 1/10


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
LSTM Model Metrics:
Accuracy: 0.9664429530201343
Precision: 0.8279569892473119
Recall: 0.425414364640884
F1 Score: 0.5620437956204379


<h1> LightGBM

In [None]:
classifier_lgbm = LGBMClassifier(random_state=0,objective='binary')
# train
classifier_lgbm.fit(selected_new_X_train, new_y_train)
# predict
y_pred_lgbm = classifier_lgbm.predict(selected_new_X_test)

[LightGBM] [Info] Number of positive: 685, number of negative: 13619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144411
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 581
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802


In [None]:
# Calculate evaluation metrics
accuracy_lgbm = accuracy_score(new_y_test, y_pred_lgbm)
precision_lgbm = precision_score(new_y_test, y_pred_lgbm)
recall_lgbm = recall_score(new_y_test, y_pred_lgbm)
f1_lgbm = f1_score(new_y_test, y_pred_lgbm)

# Print the evaluation metrics
print("LightGBM Model Metrics:")
print("Accuracy:", accuracy_lgbm)
print("Precision:", precision_lgbm)
print("Recall:", recall_lgbm)
print("F1 Score:", f1_lgbm)

LightGBM Model Metrics:
Accuracy: 0.9798657718120806
Precision: 0.9224806201550387
Recall: 0.6574585635359116
F1 Score: 0.7677419354838709


<h1> XGBoost

In [None]:
import xgboost as xgb

# Define your XGBoost model
xgb_model = xgb.XGBClassifier()

# Train the model using your data
xgb_model.fit(selected_new_X_train, new_y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_xgb = accuracy_score(new_y_test, y_pred_xgb)
precision_xgb = precision_score(new_y_test, y_pred_xgb)
recall_xgb = recall_score(new_y_test, y_pred_xgb)
f1_xgb = f1_score(new_y_test, y_pred_xgb)

# Print the evaluation metrics
print("XGBoost Model Metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1 Score:", f1_xgb)

XGBoost Model Metrics:
Accuracy: 0.9784675615212528
Precision: 0.8661971830985915
Recall: 0.6795580110497238
F1 Score: 0.761609907120743


<h1> K nearest neightbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(selected_new_X_train, new_y_train)

# Predictions
y_pred_knn = knn_classifier.predict(selected_new_X_test)
accuracy = accuracy_score(new_y_test, y_pred_knn)
precision = precision_score(new_y_test, y_pred_knn)
recall = recall_score(new_y_test, y_pred_knn)
f1 = f1_score(new_y_test, y_pred_knn)
print("KNN - Accuracy:", accuracy)
print("KNN - Precision:", precision)
print("KNN - Recall:", recall)
print("KNN - F1:", f1)

KNN - Accuracy: 0.9675615212527964
KNN - Precision: 0.768595041322314
KNN - Recall: 0.5138121546961326
KNN - F1: 0.6158940397350994


<h2>Logistic Regression with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Define parameter distribution for Logistic Regression
param_dist_logreg = {
    'C': uniform(0.1, 10),
    'penalty': ['l1', 'l2']
}

# Create Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Perform RandomizedSearchCV for Logistic Regression
random_search_logreg = RandomizedSearchCV(logreg_model, param_distributions=param_dist_logreg, n_iter=20,
                                         cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_logreg.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_logreg = random_search_logreg.best_params_
best_estimator_logreg = random_search_logreg.best_estimator_

# Use the best estimator to predict on the test set
y_pred_logreg = best_estimator_logreg.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_logreg = accuracy_score(new_y_test, y_pred_logreg)
precision_logreg = precision_score(new_y_test, y_pred_logreg)
recall_logreg = recall_score(new_y_test, y_pred_logreg)
f1_logreg = f1_score(new_y_test, y_pred_logreg)

# Print evaluation metrics for Logistic Regression
print("------- Logistic Regression -------")
print("Best Parameters:", best_params_logreg)
print("Accuracy:", accuracy_logreg)
print("Precision:", precision_logreg)
print("Recall:", recall_logreg)
print("F1:", f1_logreg)
print("-----------------------------------\n")


------- Logistic Regression -------
Best Parameters: {'C': 8.424426408004217, 'penalty': 'l2'}
Accuracy: 0.9681208053691275
Precision: 0.8526315789473684
Recall: 0.44751381215469616
F1: 0.5869565217391305
-----------------------------------



In [None]:
# Create logistic regression model
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization norm
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(logistic_model, param_distributions=param_grid, n_iter=10, scoring='f1', cv=5, random_state=42)
random_search.fit(selected_new_X_train, new_y_train)

# Get the best model
best_logistic_model = random_search.best_estimator_

# Predict on the test set using the best model
y_pred_logistic_best = best_logistic_model.predict(selected_new_X_test)

best_params_logreg = random_search.best_estimator_
# Calculate evaluation metrics using the best model
accuracy_best = accuracy_score(new_y_test, y_pred_logistic_best)
precision_best = precision_score(new_y_test, y_pred_logistic_best)
recall_best = recall_score(new_y_test, y_pred_logistic_best)
f1_best = f1_score(new_y_test, y_pred_logistic_best)

# Print the evaluation metrics
print("Best Logistic Regression Model Metrics:")
print("Best Parameters:", best_params_logreg)
print("Accuracy:", accuracy_best)
print("Precision:", precision_best)
print("Recall:", recall_best)
print("F1 Score:", f1_best)

Best Logistic Regression Model Metrics:
Best Parameters: LogisticRegression(C=100, class_weight='balanced', max_iter=1000)
Accuracy: 0.9507829977628636
Precision: 0.5076923076923077
Recall: 0.9116022099447514
F1 Score: 0.6521739130434783


<h2>SVM with Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import randint

# Define parameter distribution for SVM
param_dist_svm = {
    'C': randint(1, 100),
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

# Create SVM model
svm_model = SVC(random_state=42)

# Perform RandomizedSearchCV for SVM
random_search_svm = RandomizedSearchCV(svm_model, param_distributions=param_dist_svm, n_iter=20,
                                      cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_svm.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_svm = random_search_svm.best_params_
best_estimator_svm = random_search_svm.best_estimator_

# Use the best estimator to predict on the test set
y_pred_svm = best_estimator_svm.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_svm = accuracy_score(new_y_test, y_pred_svm)
precision_svm = precision_score(new_y_test, y_pred_svm)
recall_svm = recall_score(new_y_test, y_pred_svm)
f1_svm = f1_score(new_y_test, y_pred_svm)

# Print evaluation metrics for SVM
print("------- SVM Evaluation Metrics -------")
print("Best Parameters:", best_params_svm)
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("-------------------------------------\n")

------- SVM Evaluation Metrics -------
Best Parameters: {'C': 7, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.9678411633109619
Precision: 0.8837209302325582
Recall: 0.4198895027624309
F1 Score: 0.5692883895131086
-------------------------------------



 <h2>LightGBM with Hyperparameter Tuning

In [None]:
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform


# Define parameter distribution for LightGBM
param_dist_lgbm = {
    'num_leaves': randint(20, 100),
    'max_depth': [-1, 10, 20, 30],
    'learning_rate': uniform(0.01, 0.5),
    'n_estimators': randint(50, 200)
}

# Create LightGBM model
lgbm_model = LGBMClassifier(random_state=42)

# Perform RandomizedSearchCV for LightGBM
random_search_lgbm = RandomizedSearchCV(lgbm_model, param_distributions=param_dist_lgbm, n_iter=20,
                                        cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_lgbm.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_lgbm = random_search_lgbm.best_params_
best_estimator_lgbm = random_search_lgbm.best_estimator_

# Use the best estimator to predict on the test set
y_pred_lgbm = best_estimator_lgbm.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_lgbm = accuracy_score(new_y_test, y_pred_lgbm)
precision_lgbm = precision_score(new_y_test, y_pred_lgbm)
recall_lgbm = recall_score(new_y_test, y_pred_lgbm)
f1_lgbm = f1_score(new_y_test, y_pred_lgbm)

# Print evaluation metrics for LightGBM
print("------- LightGBM Evaluation Metrics -------")
print("Best Parameters:", best_params_lgbm)
print("Accuracy:", accuracy_lgbm)
print("Precision:", precision_lgbm)
print("Recall:", recall_lgbm)
print("F1 Score:", f1_lgbm)
print("-----------------------------------------\n")


[LightGBM] [Info] Number of positive: 685, number of negative: 13619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.190205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144411
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 581
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802
------- LightGBM Evaluation Metrics -------
Best Parameters: {'learning_rate': 0.1254469128110745, 'max_depth': 30, 'n_estimators': 160, 'num_leaves': 79}
Accuracy: 0.9801454138702461
Precision: 0.9365079365079365
Recall: 0.6519337016574586
F1 Score: 0.768729641693811
-----------------------------------------



<h1> GaussianNB with Hyperparameter Tuning

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
gnb = GaussianNB()

# Create the random grid
params = {'var_smoothing': [ 1e-6, 1e-5, 1e-4],
        'priors': [[0.75, 0.25], [0.1, 0.9], [0.9, 0.1]]}

# Random search of parameters, using 5 fold cross validation,
# search across 200 different combinations, and use all available cores
gnb_random = RandomizedSearchCV(estimator=gnb,
                                param_distributions=params,
                                n_iter=200,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
gnb_random.fit(selected_new_X_train, new_y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [None]:
# Get the performance metrics
gnb_best_random = gnb_random.best_estimator_
gnb_y_pred = gnb_best_random.predict(selected_new_X_test)
gnb_accuracy = accuracy_score(new_y_test, gnb_y_pred)
gnb_precision = precision_score(new_y_test, gnb_y_pred)
gnb_recall = recall_score(new_y_test, gnb_y_pred)
gnb_f1 = f1_score(new_y_test, gnb_y_pred)

# Print evaluation metrics for GaussianNB
print("------- GaussianNB Evaluation Metrics -------")
print("Best Parameters:", gnb_random.best_params_)
print("Best Parameters:", )
print("Accuracy:", gnb_accuracy)
print("Precision:", gnb_precision)
print("Recall:", gnb_recall)
print("F1 Score:", gnb_f1)
print("------------------------------------------\n")

------- GaussianNB Evaluation Metrics -------
Best Parameters: {'var_smoothing': 1e-06, 'priors': [0.1, 0.9]}
Best Parameters:
Accuracy: 0.8294183445190156
Precision: 0.19313304721030042
Recall: 0.7458563535911602
F1 Score: 0.30681818181818177
------------------------------------------



<h2> AdaBoost with Hyperparameter Tuning

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from scipy.stats import randint

# Create AdaBoost classifier
adaboost_model = AdaBoostClassifier(random_state=42)

# Define parameter distribution for AdaBoost
param_dist_adaboost = {
    'n_estimators': randint(50, 200),
    'learning_rate': [0.01, 0.1, 1.0, 10.0]
}

# Perform RandomizedSearchCV for AdaBoost
random_search_adaboost = RandomizedSearchCV(adaboost_model, param_distributions=param_dist_adaboost, n_iter=20,
                                            cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_adaboost.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_adaboost = random_search_adaboost.best_params_
best_estimator_adaboost = random_search_adaboost.best_estimator_

# Use the best estimator to predict on the test set
y_pred_adaboost = best_estimator_adaboost.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_adaboost = accuracy_score(new_y_test, y_pred_adaboost)
precision_adaboost = precision_score(new_y_test, y_pred_adaboost)
recall_adaboost = recall_score(new_y_test, y_pred_adaboost)
f1_adaboost = f1_score(new_y_test, y_pred_adaboost)

# Print evaluation metrics for AdaBoost
print("------- AdaBoost Evaluation Metrics -------")
print("Best Parameters:", best_params_adaboost)
print("Accuracy:", accuracy_adaboost)
print("Precision:", precision_adaboost)
print("Recall:", recall_adaboost)
print("F1 Score:", f1_adaboost)
print("------------------------------------------\n")


------- AdaBoost Evaluation Metrics -------
Best Parameters: {'learning_rate': 1.0, 'n_estimators': 171}
Accuracy: 0.9717561521252797
Precision: 0.7380952380952381
Recall: 0.6850828729281768
F1 Score: 0.7106017191977078
------------------------------------------



<h2> XGBoost with Hyperparameter Tuning

In [None]:
from xgboost import XGBClassifier
from scipy.stats import randint, uniform


# Create XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Define parameter distribution for XGBoost
param_dist_xgb = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.5),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

# Perform RandomizedSearchCV for XGBoost
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions=param_dist_xgb, n_iter=20,
                                       cv=5, scoring='f1', n_jobs=-1, random_state=42)

# Fit the model on the training data
random_search_xgb.fit(selected_new_X_train, new_y_train)

# Get the best parameters and the best estimator
best_params_xgb = random_search_xgb.best_params_
best_estimator_xgb = random_search_xgb.best_estimator_

# Use the best estimator to predict on the test set
y_pred_xgb = best_estimator_xgb.predict(selected_new_X_test)

# Calculate evaluation metrics
accuracy_xgb = accuracy_score(new_y_test, y_pred_xgb)
precision_xgb = precision_score(new_y_test, y_pred_xgb)
recall_xgb = recall_score(new_y_test, y_pred_xgb)
f1_xgb = f1_score(new_y_test, y_pred_xgb)

# Print evaluation metrics for XGBoost
print("------- XGBoost Evaluation Metrics -------")
print("Best Parameters:", best_params_xgb)
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1 Score:", f1_xgb)
print("-----------------------------------------\n")


------- XGBoost Evaluation Metrics -------
Best Parameters: {'colsample_bytree': 0.5477050582452057, 'learning_rate': 0.1954091260991332, 'max_depth': 8, 'n_estimators': 86, 'subsample': 0.864803089169032}
Accuracy: 0.9767897091722595
Precision: 0.85
Recall: 0.6574585635359116
F1 Score: 0.7414330218068537
-----------------------------------------



<h1> K nearest neightbors with Hyperparameter Tuning

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
knn = KNeighborsClassifier(p=1)

# Create the random grid
params = {'n_neighbors': [3, 4, 5, 6, 7, 9],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size': [5, 10, 20, 30, 40]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
knn_random = RandomizedSearchCV(estimator=knn,
                                param_distributions=params,
                                n_iter=100,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
knn_random.fit(selected_new_X_train, new_y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [12]:
knn_random.best_params_

{'weights': 'distance',
 'n_neighbors': 4,
 'leaf_size': 30,
 'algorithm': 'brute'}

In [13]:
knn_random.best_score_

0.7520285623944599

In [14]:
knn_best_random = knn_random.best_estimator_
knn_y_pred = knn_best_random.predict(selected_new_X_test)
knn_accuracy = accuracy_score(new_y_test, knn_y_pred)
knn_precision = precision_score(new_y_test, knn_y_pred)
knn_recall = recall_score(new_y_test, knn_y_pred)
knn_f1 = f1_score(new_y_test, knn_y_pred)

print("KNN - Best Parameters:", knn_random.best_params_)
print("KNN - Accuracy:", knn_accuracy)
print("KNN - Precision:", knn_precision)
print("KNN - Recall:", knn_recall)
print("KNN - F1:", knn_f1)

KNN - Best Parameters: {'weights': 'distance', 'n_neighbors': 4, 'leaf_size': 30, 'algorithm': 'brute'}
KNN - Accuracy: 0.977069351230425
KNN - Precision: 0.8461538461538461
KNN - Recall: 0.6685082872928176
KNN - F1: 0.7469135802469135
