In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [9]:
# Load the job-postings CSV; the path is relative to the notebook's working directory.
csv_path = 'Data_set/fake_job_postings.csv'
df = pd.read_csv(csv_path)

In [13]:
# Free-text fields: replace missing entries with the literal token 'missing'
# so the later regex-based cleaning only ever receives strings.
text_columns = ['company_profile', 'description', 'requirements', 'benefits']
for col in text_columns:
    df[col] = df[col].fillna('missing')

In [14]:
# Categorical fields: fill missing values with a per-column placeholder.
categorical_placeholders = {
    'employment_type': 'Not Specified',
    'required_experience': 'Not Specified',
    'required_education': 'Not Specified',
    'industry': 'Not Specified',
    'function': 'Not Specified',
    'location': 'Unknown',
    'department': 'Unknown',
}
for column, placeholder in categorical_placeholders.items():
    df[column] = df[column].fillna(placeholder)


In [15]:
# salary_range: mark missing values with an explicit placeholder instead of NaN.
salary_range = df['salary_range']
df['salary_range'] = salary_range.fillna('Not Specified')

In [21]:
# text preprocessing
import re

def preprocess_text(text):
    """Clean a piece of free text ahead of tokenization.

    Keeps only ASCII letters and ASCII whitespace -- punctuation, digits,
    symbols and non-ASCII characters are removed -- and lower-cases what
    remains.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned, lower-cased text.
    """
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I|re.A` never set any flags and
    # instead capped the number of removals at 258, leaving punctuation in
    # long texts.
    # The character class also excludes digits, so no separate digit pass is
    # needed once every match is actually replaced.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    # convert text to lowercase
    return text.lower()

In [11]:
# Clean each free-text column element-wise with preprocess_text.
text_columns = ['company_profile', 'description', 'requirements', 'benefits']
for col in text_columns:
    df[col] = df[col].map(preprocess_text)

In [16]:
# Join the four cleaned text fields into one space-separated document per row.
combined_text = df['company_profile']
for part in ('description', 'requirements', 'benefits'):
    combined_text = combined_text + ' ' + df[part]
df['text'] = combined_text

In [17]:
# Encode the target column with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['fraudulent'] = label_encoder.fit_transform(df['fraudulent'])

In [26]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X, y = df['text'], df['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Balance the training set by randomly oversampling with a fixed seed.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# The training texts are passed as a single-column 2-D array.
train_texts_2d = X_train.values.reshape(-1, 1)
X_resampled, y_resampled = ros.fit_resample(train_texts_2d, y_train)

In [28]:
# Flatten the oversampled single-column array into a 1-D array of cleaned strings.
# The loop must run over X_resampled, not X_train: iterating X_train yields whole
# strings, so `text[0]` was only the first character of each posting, and the
# oversampled rows were discarded, leaving X_resampled out of step with y_resampled.
# ravel() accepts both the (n, 1) sampler output and an already-flattened array,
# so re-running this cell is harmless.
# dtype=object stores ordinary Python strings instead of a fixed-width unicode
# array that pads every element to the length of the longest text.
X_resampled = np.array(
    [preprocess_text(text) for text in np.asarray(X_resampled).ravel()],
    dtype=object,
)

In [29]:
# Fit the vocabulary on the training texts, convert each text to a sequence of
# integer word ids, then pad/truncate every sequence to a common length.
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_resampled)
X_train_seq = tokenizer.texts_to_sequences(X_resampled)
X_train_pad = pad_sequences(
    X_train_seq,
    maxlen=max_sequence_length,
)