In [None]:
import pandas as pd

# Load the job-postings dataset from the current working directory.
data = pd.read_csv('fake_job_postings.csv')

# A notebook cell only renders the value of its final expression, so the
# intermediate inspections go through display() to actually show up.
display(data.head())         # first rows
display(data.shape)          # (n_rows, n_columns)
display(data.columns)        # column labels
display(data.dtypes)         # per-column dtypes
display(data.isna().sum())   # missing values per column

# Class counts of the target column (rendered as the cell's output).
data['fraudulent'].value_counts()

Unnamed: 0_level_0,count
fraudulent,Unnamed: 1_level_1
0,17014
1,866


In [None]:
# Columns removed before the text features are assembled.
columns_to_drop = ['job_id', 'salary_range', 'telecommuting',
                   'has_company_logo', 'has_questions']

# errors='ignore' skips any listed column that is not present, so re-running
# this cell after the columns are gone changes nothing. Remaining missing
# values are replaced with a single space.
data = data.drop(columns=columns_to_drop, errors='ignore').fillna(' ')

In [None]:
# Fields that make up the combined document, in concatenation order.
TEXT_COLUMNS = [
    'title', 'location', 'company_profile', 'description', 'requirements',
    'benefits', 'industry', 'function', 'employment_type',
]

# Join the fields row by row, separated by single spaces, into one text
# document per posting.
first_column, *other_columns = TEXT_COLUMNS
data['text'] = data[first_column].str.cat(data[other_columns], sep=' ')


In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus and keep the English list as a set for fast
# membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return `text` with whitespace-separated tokens found in `stop_words` removed."""
    return ' '.join(token for token in text.split() if token not in stop_words)


# Lowercase first so tokens can match the stop-word entries, then filter.
data['text'] = data['text'].str.lower().apply(_drop_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from imblearn.under_sampling import RandomUnderSampler

X = data['text']
y = data['fraudulent']

# NOTE(review): the classes are balanced here, before the train/test split in
# a later cell, so the held-out set is drawn from resampled data as well —
# confirm this is intended.
under = RandomUnderSampler(random_state=42)

# The sampler is fed the text as a single-column 2-D array; the resampled
# features are flattened back to 1-D afterwards.
X_res, y_res = under.fit_resample(X.to_numpy().reshape(-1, 1), y)
X_res = X_res.ravel()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the resampled postings for evaluation. stratify=y_res keeps
# the class proportions the same in both partitions instead of leaving them
# to chance; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.3, random_state=42, stratify=y_res
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts capped at 5000 vocabulary terms. The vocabulary is
# learned from the training texts only; the test texts are encoded with it.
vect = CountVectorizer(max_features=5000)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


Model Training and Evaluation


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit logistic regression on the term-count matrix (iteration cap raised to
# 1000) and report accuracy on the held-out split.
lr = LogisticRegression(max_iter=1000).fit(X_train_dtm, y_train)
y_pred_lr = lr.predict(X_test_dtm)

print(accuracy_score(y_test, y_pred_lr))

0.8980769230769231


Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the term-count matrix, scored by accuracy on the
# held-out split.
nb = MultinomialNB()
y_pred_nb = nb.fit(X_train_dtm, y_train).predict(X_test_dtm)

print(accuracy_score(y_test, y_pred_nb))


0.8884615384615384


Linear SVM


In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the term-count matrix. The solver uses a random number
# generator, so random_state is pinned (same seed as the sampler and split)
# to make the reported accuracy repeatable across runs.
svm = LinearSVC(random_state=42)
svm.fit(X_train_dtm, y_train)

y_pred_svm = svm.predict(X_test_dtm)
print(accuracy_score(y_test, y_pred_svm))


0.8769230769230769




Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees on the term-count matrix. random_state is pinned
# (same seed as the sampler and split) so the bootstrap samples and feature
# draws, and therefore the fitted model and its accuracy, are repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_dtm, y_train)

y_pred_rf = rf.predict(X_test_dtm)
print(accuracy_score(y_test, y_pred_rf))


0.9076923076923077


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the term-count matrix. random_state is pinned (same
# seed as the sampler and split) so the tree's randomised feature ordering and
# tie-breaking at each split give the same tree on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_dtm, y_train)

y_pred_dt = dt.predict(X_test_dtm)
print(accuracy_score(y_test, y_pred_dt))


0.85


PREDICTION / DEPLOYMENT PREP

In [None]:
sample = ["Work from home job. Pay registration fee."]

# Apply the same lowercasing and stop-word filtering that was applied to the
# training text before vectorizing, so the sample's features are built the
# same way as the features the model was fitted on.
sample_clean = [
    ' '.join(word for word in text.lower().split() if word not in stop_words)
    for text in sample
]
sample_vec = vect.transform(sample_clean)

# Predicted class label for the sample posting.
rf.predict(sample_vec)


array([1])

Saving model

In [None]:
import pickle

# Persist the fitted classifier and vectorizer. The context managers close
# each file as soon as it is written, so no handle is left open and the
# pickles are fully flushed to disk.
# Text passed to the reloaded vectorizer needs the same lowercasing and
# stop-word filtering that the notebook applies before vectorizing.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vect, vectorizer_file)
