In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Data_set/fake_job_postings.csv')

In [3]:
# for text columns, we can replace missing values with missing
text_columns = ['company_profile', 'description', 'requirements', 'benefits']
df[text_columns] = df[text_columns].fillna('missing')

In [4]:
# for categorical columns, we can replace missing values with the relevant placeholders
df['employment_type'] = df['employment_type'].fillna('Not Specified')
df['required_experience'] = df['required_experience'].fillna('Not Specified')
df['required_education'] = df['required_education'].fillna('Not Specified')
df['industry'] = df['industry'].fillna('Not Specified')
df['function'] = df['function'].fillna('Not Specified')
df['location'] = df['location'].fillna('Unknown')
df['department'] = df['department'].fillna('Unknown')


In [5]:
df['salary_range'] = df['salary_range'].fillna('Not Specified')

In [6]:
# encode target labels
from sklearn.preprocessing import LabelEncoder
df['fraudulent'] = LabelEncoder().fit_transform(df['fraudulent'])

In [7]:
# combine text columns
df['text'] = df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits']

In [8]:
# train test split
X = df['text']
y = df['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['fraudulent'], test_size=0.2, random_state=0, stratify=y)

In [9]:
# Random Oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train.values.reshape(-1, 1), y_train)

In [10]:
# text preprocessing
import re

def preprocess_text(text):
    # remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # remove digits
    text = re.sub(r'\d+', '', text)
    # convert text to lowercase
    text = text.lower()
    return text

In [11]:
X_resampled = np.array([preprocess_text(text[0]) for text in X_resampled])

In [12]:
max_length = 100  # Set the maximum length for padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_resampled) 
X_sequences = tokenizer.texts_to_sequences(X_resampled)
X_padded = pad_sequences(X_sequences, maxlen=max_length)

In [13]:
# LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
def build_lstm_model(input_length):
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=input_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.5))  # Adjust dropout rate
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [14]:
# Training LSTM Model with Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
lstm_model = build_lstm_model(max_length)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, save_format='h5')

lstm_model.fit(X_padded, y_resampled, epochs=20, batch_size=32, validation_split=0.1,
                callbacks=[early_stopping, reduce_lr, model_checkpoint])

Epoch 1/20
Epoch 2/20


  saving_api.save_model(


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<keras.src.callbacks.History at 0x1768fcc4be0>

In [15]:
# test data for prediction
X_test_resampled = np.array([preprocess_text(text) for text in X_test])
X_test_sequences = tokenizer.texts_to_sequences(X_test_resampled)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length)

In [16]:
# evaluate the model
lstm_model.evaluate(X_test_padded, y_test)



[0.16127221286296844, 0.9725950956344604]

In [17]:
# classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
y_pred = lstm_model.predict(X_test_padded)
y_pred = np.where(y_pred > 0.5, 1, 0)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3403
           1       0.75      0.65      0.70       173

    accuracy                           0.97      3576
   macro avg       0.87      0.82      0.84      3576
weighted avg       0.97      0.97      0.97      3576

[[3366   37]
 [  61  112]]
