In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q tqdm

In [None]:
import pandas as pd
import numpy as np
import tqdm
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/embedded_features_non_split.csv', index_col=None)

In [None]:
cleaned_data = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/cleaned_data.csv', index_col=None)

In [None]:
# Selecting the columns to be added from cleaned_data
text_columns = ['job_id', 'title', 'company_profile', 'description', 'requirements', 'benefits']

# Merging the datasets on 'job_id'
data = pd.merge(data, cleaned_data[text_columns], on='job_id', how='left')


In [None]:
# Save the merged dataset
data.to_csv('merged_dataset.csv', index=False)


###TF-IDF###

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# If you are first using NLTK, uncomment the following lines to download the required packages
# import nltk
# nltk.download()

In [None]:
# Concatenate textual columns for TF-IDF
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
data['combined_text'] = data[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Drop the text columns
data.drop(text_columns, axis=1, inplace=True)

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = ''.join([c for c in text if c not in string.punctuation])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize the words
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    return text

# Apply the preprocessing to the combined_text column
data['combined_text'] = data['combined_text'].apply(preprocess_text)

# Initialize and apply TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed
X_tfidf = vectorizer.fit_transform(data['combined_text'])


In [None]:
# Convert TF-IDF output to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Get the columns of TF-IDF dataframe
tfidf_columns = list(tfidf_df.columns)

In [None]:
# Add all the columns from data except combined_text
for col in data.columns:
    if col != 'combined_text' and col != 'job_id':
        tfidf_df[col] = data[col]

In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
y = tfidf_df['fraudulent']
X = tfidf_df.drop('fraudulent', axis=1)

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# If you want to check the shape of the split data
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Train set shape: (14304, 1074)
Test set shape: (3576, 1074)


In [None]:
# Get the column names that have negative values
print(X_train.columns[(X_train < 0).any()])
print(X_test.columns[(X_test < 0).any()])

Index(['sentiment score_profile', 'sentiment score_requirement',
       'sentiment score_benefits', 'sentiment score_description'],
      dtype='object')
Index(['sentiment score_profile', 'sentiment score_requirement',
       'sentiment score_benefits', 'sentiment score_description'],
      dtype='object')


In [None]:
# Standardize all the columns except the tfidf_columns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[X_train.columns.difference(tfidf_columns)] = scaler.fit_transform(X_train[X_train.columns.difference(tfidf_columns)])
X_test[X_test.columns.difference(tfidf_columns)] = scaler.transform(X_test[X_test.columns.difference(tfidf_columns)])

In [None]:
# Save the transformed train and test DataFrames to CSV files
X_train.to_csv('train_with_tfidf.csv', index=False)
X_test.to_csv('test_with_tfidf.csv', index=False)


In [None]:
from imblearn.over_sampling import SMOTE  # For synthetic data generation

sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# If you want to check the shape of the split data
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Train set shape: (27182, 1074)
Test set shape: (3576, 1074)


###Logistic Regression###

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logistic Regression Classifier
logreg_classifier = LogisticRegression(solver='saga', penalty=None, max_iter=300, C=0.01)
logreg_classifier.fit(X_train, y_train)

# Predictions
y_pred_logreg = logreg_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_logreg)
precision = precision_score(y_test, y_pred_logreg)
recall = recall_score(y_test, y_pred_logreg)
f1 = f1_score(y_test, y_pred_logreg)
print("Logistic Regression - Accuracy:", accuracy)
print("Logistic Regression - Precision:", precision)
print("Logistic Regression - Recall:", recall)
print("Logistic Regression - F1:", f1)

Logistic Regression - Accuracy: 0.9781879194630873
Logistic Regression - Precision: 0.6963350785340314
Logistic Regression - Recall: 0.869281045751634
Logistic Regression - F1: 0.7732558139534884


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
lr = LogisticRegression()

# Create the random grid
params = {'penalty': ['l1', 'l2', 'elasticnet', None],
          'C': [0.01, 0.1, 1, 10, 100],
          'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
          'max_iter': [100, 200, 300, 400, 500]}

# Random search of parameters, using 5 fold cross validation,
# search across 10 different combinations, and use all available cores
lr_random = RandomizedSearchCV(estimator=lr,
                               param_distributions=params,
                               n_iter=100,
                               cv=5,
                               verbose=2,
                               random_state=61,
                               n_jobs=-1,
                               scoring='f1')

# Fit the random search model
lr_random.fit(X_train, y_train)

In [None]:
# Get the best parameters
lr_random.best_params_

{'solver': 'saga', 'penalty': None, 'max_iter': 300, 'C': 0.01}

In [None]:
# Get the best train performance
lr_random.best_score_

0.7731668289732605

In [None]:
# Get the performance metrics
lr_best_random = lr_random.best_estimator_
lr_y_pred = lr_best_random.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_precision = precision_score(y_test, lr_y_pred)
lr_recall = recall_score(y_test, lr_y_pred)
lr_f1 = f1_score(y_test, lr_y_pred)
print("Logistic Regression - Best Parameters:", lr_random.best_params_)
print("Logistic Regression - Accuracy:", lr_accuracy)
print("Logistic Regression - Precision:", lr_precision)
print("Logistic Regression - Recall:", lr_recall)
print("Logistic Regression - F1:", lr_f1)

Logistic Regression - Best Parameters: {'solver': 'saga', 'penalty': None, 'max_iter': 300, 'C': 0.01}
Logistic Regression - Accuracy: 0.9832214765100671
Logistic Regression - Precision: 0.803921568627451
Logistic Regression - Recall: 0.803921568627451
Logistic Regression - F1: 0.8039215686274509


###Support Vector Machines (SVM)###

In [None]:
from sklearn.svm import SVC

# SVM Classifier
svm_classifier = SVC(kernel='poly', gamma=1, C=0.1, degree=4)
svm_classifier.fit(X_train, y_train)

# Predictions
y_pred_svm = svm_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm)
recall = recall_score(y_test, y_pred_svm)
f1 = f1_score(y_test, y_pred_svm)
print("SVM - Accuracy:", accuracy)
print("SVM - Precision:", precision)
print("SVM - Recall:", recall)
print("SVM - F1:", f1)

SVM - Accuracy: 0.9890939597315436
SVM - Precision: 0.9191176470588235
SVM - Recall: 0.8169934640522876
SVM - F1: 0.8650519031141868


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
svm = SVC(kernel = 'poly', gamma=1, C=0.1)

# Create the random grid
params = {'degree': [1, 2, 3, 4, 6, 8, 10]}

# Random search of parameters, using 5 fold cross validation,
# search across 7 different combinations, and use all available cores
svm_random = RandomizedSearchCV(estimator=svm,
                                param_distributions=params,
                                n_iter=7,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
svm_random.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 5/5] END ..........................degree=1;, score=0.503 total time=  15.6s
[CV 1/5] END ..........................degree=1;, score=0.515 total time=  15.8s
[CV 2/5] END ..........................degree=1;, score=0.476 total time=  15.8s
[CV 3/5] END ..........................degree=1;, score=0.471 total time=  15.8s
[CV 4/5] END ..........................degree=1;, score=0.571 total time=  15.9s
[CV 2/5] END ..........................degree=2;, score=0.798 total time=  16.2s
[CV 1/5] END ..........................degree=2;, score=0.806 total time=  16.4s
[CV 4/5] END ..........................degree=2;, score=0.815 total time=  16.8s
[CV 3/5] END ..........................degree=2;, score=0.858 total time=  16.9s
[CV 5/5] END ..........................degree=2;, score=0.847 total time=  17.3s
[CV 4/5] END ..........................degree=3;, score=0.829 total time=  18.4s
[CV 2/5] END ..........................degree=3;,

In [None]:
# Get the best parameters
svm_random.best_params_

{'degree': 4}

In [None]:
# Get the best train performance
svm_random.best_score_

0.8381078679136902

In [None]:
# Get the performance metrics
svm_best_random = svm_random.best_estimator_
svm_y_pred = svm_best_random.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_precision = precision_score(y_test, svm_y_pred)
svm_recall = recall_score(y_test, svm_y_pred)
svm_f1 = f1_score(y_test, svm_y_pred)

print("SVM - Best Parameters:", svm_random.best_params_)
print("SVM - Accuracy:", svm_accuracy)
print("SVM - Precision:", svm_precision)
print("SVM - Recall:", svm_recall)
print("SVM - F1:", svm_f1)

SVM - Best Parameters: {'degree': 4}
SVM - Accuracy: 0.9902125279642058
SVM - Precision: 0.9402985074626866
SVM - Recall: 0.8235294117647058
SVM - F1: 0.8780487804878049


###Naive Bayes###

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_classifier.predict(X_test)


In [None]:
accuracy = accuracy_score(y_test, y_pred_nb)
precision = precision_score(y_test, y_pred_nb)
recall = recall_score(y_test, y_pred_nb)
f1 = f1_score(y_test, y_pred_nb)
print("Naive Bayes - Accuracy:", accuracy)
print("Naive Bayes - Precision:", precision)
print("Naive Bayes - Recall:", recall)
print("Naive Bayes - F1:", f1)

Naive Bayes - Accuracy: 0.8763982102908278
Naive Bayes - Precision: 0.23486238532110093
Naive Bayes - Recall: 0.8366013071895425
Naive Bayes - F1: 0.36676217765042973


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
nb = MultinomialNB()

# Create the random grid
params = {'alpha': [0.01, 0.1, 1, 10, 100],
          'fit_prior': [True, False],
          'class_prior': [None, [0.5, 0.5], [0.25, 0.75], [0.75, 0.25], [0.1, 0.9], [0.9, 0.1], [0.01, 0.99], [0.99, 0.01]],
          'force_alpha': [True, False]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
nb_random = RandomizedSearchCV(estimator=nb,
                               param_distributions=params,
                               n_iter=100,
                               cv=5,
                               verbose=3,
                               random_state=61,
                               n_jobs=-1,
                               scoring='f1')

# Fit the random search model
nb_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 3/5] END alpha=10, class_prior=[0.5, 0.5], fit_prior=False, force_alpha=False;, score=0.236 total time=   0.1s
[CV 4/5] END alpha=10, class_prior=[0.5, 0.5], fit_prior=False, force_alpha=False;, score=0.328 total time=   0.2s
[CV 1/5] END alpha=10, class_prior=[0.5, 0.5], fit_prior=False, force_alpha=False;, score=0.311 total time=   0.2s
[CV 1/5] END alpha=1, class_prior=[0.25, 0.75], fit_prior=True, force_alpha=False;, score=0.319 total time=   0.1s
[CV 2/5] END alpha=10, class_prior=[0.5, 0.5], fit_prior=False, force_alpha=False;, score=0.220 total time=   0.2s
[CV 3/5] END alpha=1, class_prior=[0.25, 0.75], fit_prior=True, force_alpha=False;, score=0.300 total time=   0.1s
[CV 4/5] END alpha=1, class_prior=[0.25, 0.75], fit_prior=True, force_alpha=False;, score=0.305 total time=   0.1s
[CV 5/5] END alpha=10, class_prior=[0.5, 0.5], fit_prior=False, force_alpha=False;, score=0.234 total time=   0.2s
[CV 2/5] END alph

In [None]:
# Get the best parameters
nb_random.best_params_

{'force_alpha': False, 'fit_prior': True, 'class_prior': None, 'alpha': 0.01}

In [None]:
# Get the best train performance
nb_random.best_score_

0.5123881017576462

In [None]:
# Get the performance metrics
nb_best_random = nb_random.best_estimator_
nb_y_pred = nb_best_random.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_precision = precision_score(y_test, nb_y_pred)
nb_recall = recall_score(y_test, nb_y_pred)
nb_f1 = f1_score(y_test, nb_y_pred)

print("Naive Bayes - Best Parameters:", nb_random.best_params_)
print("Naive Bayes - Accuracy:", nb_accuracy)
print("Naive Bayes - Precision:", nb_precision)
print("Naive Bayes - Recall:", nb_recall)
print("Naive Bayes - F1:", nb_f1)

Naive Bayes - Best Parameters: {'force_alpha': False, 'fit_prior': True, 'class_prior': None, 'alpha': 0.01}
Naive Bayes - Accuracy: 0.939317673378076
Naive Bayes - Precision: 0.3644067796610169
Naive Bayes - Recall: 0.5620915032679739
Naive Bayes - F1: 0.44215938303341906


###Random Forest###

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators = 300,
                                       max_depth = 50,
                                       min_samples_split = 2,
                                       max_features = 'sqrt',
                                       criterion = 'gini',
                                       min_samples_leaf = 1,
                                       bootstrap = False)
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_classifier.predict(X_test)



In [None]:
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
print("Random Forest - Accuracy:", accuracy)
print("Random Forest - Precision:", precision)
print("Random Forest - Recall:", recall)
print("Random Forest - F1:", f1)

Random Forest - Accuracy: 0.9868568232662193
Random Forest - Precision: 0.9344262295081968
Random Forest - Recall: 0.7450980392156863
Random Forest - F1: 0.8290909090909092


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
rf = RandomForestClassifier(n_estimators = 300, max_features = 'sqrt', criterion = 'gini', min_samples_leaf = 1, bootstrap = False)

# Create the random grid
params = {'max_depth': [None, 20, 50, 100, 200, 300],
          'min_samples_split': [2, 3, 5, 8, 10]}

# Random search of parameters, using 5 fold cross validation,
# search across 30 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=params,
                               n_iter=30,
                               cv=5,
                               verbose=3,
                               random_state=61,
                               n_jobs=-1,
                               scoring='f1')

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 5/5] END max_depth=None, min_samples_split=2;, score=0.791 total time=  43.2s
[CV 2/5] END max_depth=None, min_samples_split=3;, score=0.748 total time=  43.5s
[CV 2/5] END max_depth=None, min_samples_split=2;, score=0.750 total time=  43.8s
[CV 3/5] END max_depth=None, min_samples_split=2;, score=0.757 total time=  44.3s
[CV 4/5] END max_depth=None, min_samples_split=3;, score=0.769 total time=  44.3s
[CV 4/5] END max_depth=None, min_samples_split=2;, score=0.759 total time=  44.6s
[CV 3/5] END max_depth=None, min_samples_split=3;, score=0.759 total time= 1.0min
[CV 5/5] END max_depth=None, min_samples_split=3;, score=0.786 total time= 1.2min
[CV 1/5] END max_depth=None, min_samples_split=3;, score=0.773 total time= 1.2min
[CV 1/5] END max_depth=None, min_samples_split=2;, score=0.773 total time= 1.2min
[CV 1/5] END max_depth=None, min_samples_split=5;, score=0.773 total time=  44.5s
[CV 1/5] END max_depth=None, min_sam

In [None]:
# Get the best parameters
rf_random.best_params_

{'min_samples_split': 2, 'max_depth': 50}

In [None]:
# Get the best train performance
rf_random.best_score_

0.7732789808837659

In [None]:
# Get the performance metrics
rf_best_random = rf_random.best_estimator_
rf_y_pred = rf_best_random.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

print("Random Forest - Best Parameters:", rf_random.best_params_)
print("Random Forest - Accuracy:", rf_accuracy)
print("Random Forest - Precision:", rf_precision)
print("Random Forest - Recall:", rf_recall)
print("Random Forest - F1:", rf_f1)

Random Forest - Best Parameters: {'min_samples_split': 2, 'max_depth': 50}
Random Forest - Accuracy: 0.985738255033557
Random Forest - Precision: 1.0
Random Forest - Recall: 0.6666666666666666
Random Forest - F1: 0.8


###K-Nearest Neighbors (KNN)###

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN Classifier
knn_classifier = KNeighborsClassifier(weights='distance',
                                      n_neighbors=4,
                                      leaf_size=30,
                                      algorithm='brute',
                                      p=1)
knn_classifier.fit(X_train, y_train)

# Predictions
y_pred_knn = knn_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn)
recall = recall_score(y_test, y_pred_knn)
f1 = f1_score(y_test, y_pred_knn)
print("KNN - Accuracy:", accuracy)
print("KNN - Precision:", precision)
print("KNN - Recall:", recall)
print("KNN - F1:", f1)


KNN - Accuracy: 0.9664429530201343
KNN - Precision: 0.5684647302904564
KNN - Recall: 0.8954248366013072
KNN - F1: 0.6954314720812181


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
knn = KNeighborsClassifier(p=1)

# Create the random grid
params = {'n_neighbors': [3, 4, 5, 6, 7, 9],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size': [5, 10, 20, 30, 40]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
knn_random = RandomizedSearchCV(estimator=knn,
                                param_distributions=params,
                                n_iter=100,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
knn_random.fit(X_train, y_train)

In [None]:
# Get the best parameters
knn_random.best_params_

{'weights': 'distance',
 'p': 1,
 'n_neighbors': 5,
 'leaf_size': 20,
 'algorithm': 'auto'}

In [None]:
# Get the best train performance
knn_random.best_score_

0.8027288156904578

In [None]:
# Get the performance metrics
knn_best_random = knn_random.best_estimator_
knn_y_pred = knn_best_random.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_precision = precision_score(y_test, knn_y_pred)
knn_recall = recall_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred)

print("KNN - Best Parameters:", knn_random.best_params_)
print("KNN - Accuracy:", knn_accuracy)
print("KNN - Precision:", knn_precision)
print("KNN - Recall:", knn_recall)
print("KNN - F1:", knn_f1)

KNN - Best Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'leaf_size': 20, 'algorithm': 'auto'}
KNN - Accuracy: 0.9854586129753915
KNN - Precision: 0.8796992481203008
KNN - Recall: 0.7647058823529411
KNN - F1: 0.8181818181818182


###Gradient Boosting (XGBoost)###

In [None]:
from xgboost import XGBClassifier

# XGBoost Classifier
xgb_classifier = XGBClassifier(subsample=0.9,
                               scale_pos_weight=4,
                               reg_lambda=0.2,
                               reg_alpha=0.6,
                               n_estimators=500,
                               min_child_weight=2,
                               max_depth=300,
                               learning_rate=0.1,
                               gamma=0.5,
                               colsample_bytree=0.5)
xgb_classifier.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_classifier.predict(X_test)


In [None]:
accuracy = accuracy_score(y_test, y_pred_xgb)
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)
print("XGBoost - Accuracy:", accuracy)
print("XGBoost - Precision:", precision)
print("XGBoost - Recall:", recall)
print("XGBoost - F1:", f1)


XGBoost - Accuracy: 0.9879753914988815
XGBoost - Precision: 0.8525641025641025
XGBoost - Recall: 0.869281045751634
XGBoost - F1: 0.8608414239482199


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
xgb = XGBClassifier()

# Create the random grid

params = {'max_depth': [3, 5, 7, 10, 15, 20, 50, 100, 200, 300],
          'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5, 1],
          'n_estimators': [300, 400, 500, 600, 700],
          'gamma': [0, 0.001, 0.01, 0.1, 0.5, 0.7, 1, 5, 10],
          'min_child_weight': [0.5, 1, 2, 3, 4],
          'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
          'colsample_bytree': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
          'reg_alpha': [0, 0.1, 0.2, 0.4, 0.6, 0.8],
          'reg_lambda': [0, 0.1, 0.2, 0.4, 0.6, 0.8],
          'scale_pos_weight': [0.1, 0.5, 1, 2, 3, 4]}

# Random search of parameters, using 5 fold cross validation,
# search across 200 different combinations, and use all available cores

xgb_random = RandomizedSearchCV(estimator=xgb,
                                param_distributions=params,
                                n_iter=200,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
xgb_random.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END colsample_bytree=0.7, gamma=0.5, learning_rate=0.5, max_depth=15, min_child_weight=0.5, n_estimators=300, reg_alpha=0.2, reg_lambda=0.6, scale_pos_weight=0.1, subsample=0.8;, score=0.759 total time=  24.0s
[CV 2/5] END colsample_bytree=0.7, gamma=0.5, learning_rate=0.5, max_depth=15, min_child_weight=0.5, n_estimators=300, reg_alpha=0.2, reg_lambda=0.6, scale_pos_weight=0.1, subsample=0.8;, score=0.719 total time=  24.0s
[CV 4/5] END colsample_bytree=0.7, gamma=0.5, learning_rate=0.5, max_depth=15, min_child_weight=0.5, n_estimators=300, reg_alpha=0.2, reg_lambda=0.6, scale_pos_weight=0.1, subsample=0.8;, score=0.722 total time=  23.9s
[CV 3/5] END colsample_bytree=0.7, gamma=0.5, learning_rate=0.5, max_depth=15, min_child_weight=0.5, n_estimators=300, reg_alpha=0.2, reg_lambda=0.6, scale_pos_weight=0.1, subsample=0.8;, score=0.685 total time=  24.4s
[CV 5/5] END colsample_bytree=0.7, gamma=0.5, learning_rate=

In [None]:
# Get the best parameters
xgb_random.best_params_

{'subsample': 0.9,
 'scale_pos_weight': 4,
 'reg_lambda': 0.2,
 'reg_alpha': 0.6,
 'n_estimators': 500,
 'min_child_weight': 2,
 'max_depth': 300,
 'learning_rate': 0.1,
 'gamma': 0.5,
 'colsample_bytree': 0.5}

In [None]:
# Get the best train performance
xgb_random.best_score_

0.848474759614924

In [None]:
# Get the performance metrics
xgb_best_random = xgb_random.best_estimator_
xgb_y_pred = xgb_best_random.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_precision = precision_score(y_test, xgb_y_pred)
xgb_recall = recall_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred)

print("XGBoost - Best Parameters:", xgb_random.best_params_)
print("XGBoost - Accuracy:", xgb_accuracy)
print("XGBoost - Precision:", xgb_precision)
print("XGBoost - Recall:", xgb_recall)
print("XGBoost - F1:", xgb_f1)

XGBoost - Best Parameters: {'subsample': 0.9, 'scale_pos_weight': 4, 'reg_lambda': 0.2, 'reg_alpha': 0.6, 'n_estimators': 500, 'min_child_weight': 2, 'max_depth': 300, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.5}
XGBoost - Accuracy: 0.9890939597315436
XGBoost - Precision: 0.9384615384615385
XGBoost - Recall: 0.7973856209150327
XGBoost - F1: 0.8621908127208481


###LightGBM###

In [None]:
classifier_lgbm = LGBMClassifier(random_state=0,
                                 objective='binary',
                                 verbose=-1,
                                 subsample=0.9,
                                 scale_pos_weight=4,
                                 reg_lambda=0,
                                 reg_alpha=0.2,
                                 num_leaves=20,
                                 n_estimators=700,
                                 min_child_samples=3,
                                 max_depth=20,
                                 learning_rate=0.1,
                                 colsample_bytree=0.7)

# train
classifier_lgbm.fit(X_train, y_train)
# predict
y_pred_lgbm = classifier_lgbm.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_lgbm)
precision = precision_score(y_test, y_pred_lgbm)
recall = recall_score(y_test, y_pred_lgbm)
f1 = f1_score(y_test, y_pred_lgbm)
print("LightGBM - Accuracy:", accuracy)
print("LightGBM - Precision:", precision)
print("LightGBM - Recall:", recall)
print("LightGBM - F1:", f1)


LightGBM - Accuracy: 0.9902125279642058
LightGBM - Precision: 0.9097222222222222
LightGBM - Recall: 0.8562091503267973
LightGBM - F1: 0.8821548821548821


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
lgbm = LGBMClassifier(random_state=0,objective='binary', verbose=-1)

# Create the random grid
params = {'max_depth': [3, 5, 7, 10, 15, 20, 50, 100, 200, 300],
          'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5, 1],
          'n_estimators': [300, 400, 500, 600, 700],
          'num_leaves': [5, 10, 20, 30, 40, 50, 70, 90, 100, 150, 200, 300],
          'min_child_samples': [1, 2, 3, 4, 5],
          'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
          'colsample_bytree': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
          'reg_alpha': [0, 0.1, 0.2, 0.4, 0.6, 0.8],
          'reg_lambda': [0, 0.1, 0.2, 0.4, 0.6, 0.8],
          'scale_pos_weight': [0.1, 0.5, 1, 2, 3, 4]}

# Random search of parameters, using 5 fold cross validation,
# search across 300 different combinations, and use all available cores
lgbm_random = RandomizedSearchCV(estimator=lgbm,
                                    param_distributions=params,
                                    n_iter=300,
                                    cv=5,
                                    verbose=3,
                                    random_state=61,
                                    n_jobs=-1,
                                    scoring='f1')

# Fit the random search model
lgbm_random.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=100, min_child_samples=4, n_estimators=400, num_leaves=5, reg_alpha=0.4, reg_lambda=0.8, scale_pos_weight=0.1, subsample=0.5;, score=0.201 total time=  11.2s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=100, min_child_samples=4, n_estimators=400, num_leaves=5, reg_alpha=0.4, reg_lambda=0.8, scale_pos_weight=0.1, subsample=0.5;, score=0.268 total time=  11.2s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=100, min_child_samples=4, n_estimators=400, num_leaves=5, reg_alpha=0.4, reg_lambda=0.8, scale_pos_weight=0.1, subsample=0.5;, score=0.190 total time=  13.6s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=100, min_child_samples=4, n_estimators=400, num_leaves=5, reg_alpha=0.4, reg_lambda=0.8, scale_pos_weight=0.1, subsample=0.5;, score=0.178 total time=  14.3s
[CV 5/5] END colsample_bytree=0.8, learning_

In [None]:
# Get the best parameters
lgbm_random.best_params_

{'subsample': 0.9,
 'scale_pos_weight': 4,
 'reg_lambda': 0,
 'reg_alpha': 0.2,
 'num_leaves': 20,
 'n_estimators': 700,
 'min_child_samples': 3,
 'max_depth': 20,
 'learning_rate': 0.1,
 'colsample_bytree': 0.7}

In [None]:
# Get the best train performance
lgbm_random.best_score_

0.8543665409942223

In [None]:
# Get the performance metrics
lgbm_best_random = lgbm_random.best_estimator_
lgbm_y_pred = lgbm_best_random.predict(X_test)
lgbm_accuracy = accuracy_score(y_test, lgbm_y_pred)
lgbm_precision = precision_score(y_test, lgbm_y_pred)
lgbm_recall = recall_score(y_test, lgbm_y_pred)
lgbm_f1 = f1_score(y_test, lgbm_y_pred)

print("LightGBM - Best Parameters:", lgbm_random.best_params_)
print("LightGBM - Accuracy:", lgbm_accuracy)
print("LightGBM - Precision:", lgbm_precision)
print("LightGBM - Recall:", lgbm_recall)
print("LightGBM - F1:", lgbm_f1)

LightGBM - Best Parameters: {'subsample': 0.9, 'scale_pos_weight': 4, 'reg_lambda': 0, 'reg_alpha': 0.2, 'num_leaves': 20, 'n_estimators': 700, 'min_child_samples': 3, 'max_depth': 20, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
LightGBM - Accuracy: 0.9910514541387024
LightGBM - Precision: 0.9548872180451128
LightGBM - Recall: 0.8300653594771242
LightGBM - F1: 0.8881118881118881


###GaussianNB###

In [None]:
# Gaussian Naive Bayes classifier
gnb_classifier = GaussianNB(var_smoothing=1e-05, priors=[0.99, 0.01])
gnb_classifier.fit(X_train, y_train)  # Converting sparse matrix to dense matrix

# Predictions
y_pred_gnb = gnb_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_gnb)
precision = precision_score(y_test, y_pred_gnb)
recall = recall_score(y_test, y_pred_gnb)
f1 = f1_score(y_test, y_pred_gnb)
print("GaussianNB - Accuracy:", accuracy)
print("GaussianNB - Precision:", precision)
print("GaussianNB - Recall:", recall)
print("GaussianNB - F1:", f1)


GaussianNB - Accuracy: 0.918903803131991
GaussianNB - Precision: 0.32020997375328086
GaussianNB - Recall: 0.7973856209150327
GaussianNB - F1: 0.45692883895131087


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Randomized search for best hyperparameters

# Create the model
gnb = GaussianNB()

# Create the random grid
params = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'priors': [None, [0.5, 0.5], [0.25, 0.75], [0.75, 0.25], [0.1, 0.9], [0.9, 0.1], [0.01, 0.99], [0.99, 0.01]]}

# Random search of parameters, using 5 fold cross validation,
# search across 200 different combinations, and use all available cores
gnb_random = RandomizedSearchCV(estimator=gnb,
                                param_distributions=params,
                                n_iter=200,
                                cv=5,
                                verbose=3,
                                random_state=61,
                                n_jobs=-1,
                                scoring='f1')

# Fit the random search model
gnb_random.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END ..priors=None, var_smoothing=1e-09;, score=0.391 total time=   0.2s
[CV 2/5] END ..priors=None, var_smoothing=1e-09;, score=0.372 total time=   0.2s
[CV 3/5] END ..priors=None, var_smoothing=1e-09;, score=0.363 total time=   0.2s
[CV 4/5] END ..priors=None, var_smoothing=1e-09;, score=0.404 total time=   0.2s
[CV 4/5] END ..priors=None, var_smoothing=1e-08;, score=0.411 total time=   0.2s
[CV 5/5] END ..priors=None, var_smoothing=1e-08;, score=0.393 total time=   0.2s
[CV 2/5] END ..priors=None, var_smoothing=1e-08;, score=0.375 total time=   0.4s
[CV 3/5] END ..priors=None, var_smoothing=1e-08;, score=0.372 total time=   0.4s
[CV 2/5] END ..priors=None, var_smoothing=1e-07;, score=0.385 total time=   0.2s
[CV 3/5] END ..priors=None, var_smoothing=1e-07;, score=0.383 total time=   0.2s
[CV 4/5] END ..priors=None, var_smoothing=1e-07;, score=0.423 total time=   0.3s
[CV 5/5] END ..priors=None, var_smoothing=1e-07

In [None]:
# Get the best parameters
gnb_random.best_params_

{'var_smoothing': 1e-05, 'priors': [0.99, 0.01]}

In [None]:
# Get the best train performance
gnb_random.best_score_

0.41917721208351405

In [None]:
# Get the performance metrics
gnb_best_random = gnb_random.best_estimator_
gnb_y_pred = gnb_best_random.predict(X_test)
gnb_accuracy = accuracy_score(y_test, gnb_y_pred)
gnb_precision = precision_score(y_test, gnb_y_pred)
gnb_recall = recall_score(y_test, gnb_y_pred)
gnb_f1 = f1_score(y_test, gnb_y_pred)

print("GaussianNB - Best Parameters:", gnb_random.best_params_)
print("GaussianNB - Accuracy:", gnb_accuracy)
print("GaussianNB - Precision:", gnb_precision)
print("GaussianNB - Recall:", gnb_recall)
print("GaussianNB - F1:", gnb_f1)

GaussianNB - Best Parameters: {'var_smoothing': 1e-05, 'priors': [0.99, 0.01]}
GaussianNB - Accuracy: 0.8822706935123042
GaussianNB - Precision: 0.24716981132075472
GaussianNB - Recall: 0.8562091503267973
GaussianNB - F1: 0.383601756954612


###LSTM###

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow_addons as tfa
from tensorflow.keras.metrics import Recall, Precision

# Convert DataFrame and Series to NumPy arrays
X_train_np = X_train.to_numpy()
y_train_np = np.array(y_train)

# Reshape the train data for LSTM
X_train_np = np.reshape(X_train_np, (X_train_np.shape[0], 1, X_train_np.shape[1]))

# Reshpae the test data for LSTM
X_test_np = np.reshape(X_test.to_numpy(), (X_test.to_numpy().shape[0], 1, X_test.to_numpy().shape[1]))

model = Sequential()
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(Dense(32, activation='tanh'))
model.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(LSTM(16, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Recall(), Precision(), 'accuracy'])

# Training the model
model.fit(X_train_np, y_train_np, epochs=105, batch_size=64, verbose=0)

# Generate probabilities
y_pred_prob = model.predict(X_test_np, verbose=0)

# Convert probabilities into class labels
y_pred_lstm = (y_pred_prob > 0.5).astype("int32")

lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
lstm_precision = precision_score(y_test, y_pred_lstm)
lstm_recall = recall_score(y_test, y_pred_lstm)
lstm_f1 = f1_score(y_test, y_pred_lstm)

In [None]:
# Get the performance metrics
print("LSTM - Accuracy:", lstm_accuracy)
print("LSTM - Precision:", lstm_precision)
print("LSTM - Recall:", lstm_recall)
print("LSTM - F1:", lstm_f1)

LSTM - Accuracy: 0.9916107382550335
LSTM - Precision: 0.9624060150375939
LSTM - Recall: 0.8366013071895425
LSTM - F1: 0.895104895104895


###Rare Event Logistic###

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE  # For synthetic data generation

# Use SMOTE to generate synthetic data
sm = SMOTE(random_state=61)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Create the logistic regression model with class_weight='balanced'
logistic_model = LogisticRegression(class_weight='balanced',
                                    solver='newton-cg',
                                    penalty=None,
                                    max_iter=400,
                                    C=1)

# Fit the model on the training data
logistic_model.fit(X_train_res, y_train_res)

# Predict on the test set
y_pred_logistic = logistic_model.predict(X_test)


In [None]:
accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic)
recall = recall_score(y_test, y_pred_logistic)
f1 = f1_score(y_test, y_pred_logistic)

# Print the evaluation metrics
print("Rare Event Logistic Model Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Rare Event Logistic Model Metrics:
Accuracy: 0.9804250559284117
Precision: 0.7318435754189944
Recall: 0.8562091503267973
F1 Score: 0.789156626506024


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE  # For synthetic data generation

# Randomized search for best hyperparameters

# Use SMOTE to generate synthetic data
sm = SMOTE(random_state=61)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Create the model
logistic = LogisticRegression(class_weight='balanced')

# Create the random grid
params = {'penalty': ['l1', 'l2', 'elasticnet', None],
          'C': [0.01, 0.1, 1, 10, 100],
          'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
          'max_iter': [100, 200, 300, 400, 500]}

# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores
logistic_random = RandomizedSearchCV(estimator=logistic,
                                     param_distributions=params,
                                     n_iter=100,
                                     cv=5,
                                     verbose=3,
                                     random_state=61,
                                     n_jobs=-1,
                                     scoring='f1')

# Fit the random search model
logistic_random.fit(X_train_res, y_train_res)

In [None]:
# Get the best parameters
logistic_random.best_params_

{'solver': 'newton-cg', 'penalty': None, 'max_iter': 400, 'C': 1}

In [None]:
# Get the best train performance
logistic_random.best_score_

0.9895173181608292

In [None]:
# Get the performance metrics
logistic_best_random = logistic_random.best_estimator_
logistic_y_pred = logistic_best_random.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
logistic_precision = precision_score(y_test, logistic_y_pred)
logistic_recall = recall_score(y_test, logistic_y_pred)
logistic_f1 = f1_score(y_test, logistic_y_pred)

print("Rare Event Logistic - Best Parameters:", logistic_random.best_params_)
print("Rare Event Logistic - Accuracy:", logistic_accuracy)
print("Rare Event Logistic - Precision:", logistic_precision)
print("Rare Event Logistic - Recall:", logistic_recall)
print("Rare Event Logistic - F1:", logistic_f1)

Rare Event Logistic - Best Parameters: {'solver': 'newton-cg', 'penalty': None, 'max_iter': 400, 'C': 1}
Rare Event Logistic - Accuracy: 0.9807046979865772
Rare Event Logistic - Precision: 0.7359550561797753
Rare Event Logistic - Recall: 0.8562091503267973
Rare Event Logistic - F1: 0.7915407854984894


###DistilBERT###

In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch


In [None]:
# Concatenate all textual columns
textual_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
data['combined_text'] = data[textual_columns].fillna('').apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

texts = data['combined_text'].tolist()
labels = data['fraudulent'].tolist()

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Dataset class
class JobDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
dataset = JobDataset(texts, labels, tokenizer)

In [None]:
# Splitting the dataset into train and test set
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [None]:
# DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [None]:
# Train the model
trainer.train()