## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

In [2]:
# Configuration: everything a reader might want to tune lives in this cell.

# Input file and the column used as the DataFrame index.
FILENAME = "cleaned_fake_job_postings.csv"
INDEX = "job_id"

# Label column and the columns that get integer-encoded (the label is one of them).
TARGET_VARIABLE = "fraudulent"
CATEGORICAL_COLUMNS = [
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "employment_type",
    "required_experience",
    "required_education",
    "function",
    "fraudulent",
]

# Free-text column fed to the TF-IDF vectorizer, and the vectorizer settings.
CORPUS_COLUMN_NAME = "text"
TFIDF_MAX_FEATURES = 100
TFIDF_NGRAM_RANGE = (1, 1)

# Seed shared by the train/test split, the CV splitter and the resampler.
RANDOM_STATE = 12345

## Load data

In [3]:
# Read the cleaned postings and index them by job id in a single expression,
# so the cell yields the same frame no matter how often it is re-run.
df = pd.read_csv(FILENAME).set_index(INDEX)
df.head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,False,True,False,Other,Junior,,Business,False,marketing intern we're food52 we've created gr...
2,False,True,False,Full-time,,,Business,False,customer service cloud video production 90 se...
3,False,True,False,,,,,False,commissioning machinery assistant cma valor se...
4,False,True,False,Full-time,Standard,Undergraduate,Business,False,account executive washington dc passion impro...
5,False,True,True,Full-time,Standard,Undergraduate,Healthcare,False,bill review manager spotsource solution llc gl...


## Encode Categorical data

In [4]:
# Integer-encode every categorical column (including the target) and keep the
# fitted encoder per column so codes can be mapped back to the original labels.
# Missing values are not dropped: as the preview shows, they receive their own
# integer code.
encoders = {}

for col in CATEGORICAL_COLUMNS:
    le = preprocessing.LabelEncoder()
    # Assign with df[col] so the column is *replaced* and takes the encoder's
    # integer dtype. `df.loc[:, col] = ...` writes into the existing column in
    # place on pandas >= 2.0, which leaves the old object/bool dtype behind.
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,1,0,1,0,4,0,0,marketing intern we're food52 we've created gr...
2,0,1,0,0,3,4,0,0,customer service cloud video production 90 se...
3,0,1,0,3,3,4,10,0,commissioning machinery assistant cma valor se...
4,0,1,0,0,2,3,0,0,account executive washington dc passion impro...
5,0,1,1,0,2,3,5,0,bill review manager spotsource solution llc gl...


## Baseline Classifier

In [5]:
# Majority-class baseline: any real model has to beat this accuracy to be useful.
y = df[TARGET_VARIABLE]
X = df.drop(columns=[TARGET_VARIABLE])

# Stratified 75/25 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

# The dummy model ignores the features and always predicts the most frequent label.
bc = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
bc.score(X_test, y_test)

0.9516778523489933

## Random Forest Classification

### TF-IDF Feature Extraction

In [6]:
# Rebuild X/y from the encoded frame, then swap the raw text column for its
# TF-IDF representation (one "tfidf_<term>" column per retained term).
X = df.drop(TARGET_VARIABLE, axis=1)
y = df.loc[:, TARGET_VARIABLE]

# NOTE(review): the vectorizer is fit on the full corpus before any
# cross-validation split, so vocabulary and IDF weights are learned from rows
# that later act as held-out data.
tf_idf = TfidfVectorizer(stop_words="english", ngram_range=TFIDF_NGRAM_RANGE, max_features=TFIDF_MAX_FEATURES)
tf_idf_matrix = tf_idf.fit_transform(X[CORPUS_COLUMN_NAME])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(tf_idf, "get_feature_names_out"):
    vocabulary_terms = tf_idf.get_feature_names_out()
else:
    vocabulary_terms = tf_idf.get_feature_names()
tf_idf_feature_names = "tfidf_" + np.array(vocabulary_terms, dtype=object)

# Densify (at most TFIDF_MAX_FEATURES columns) and align on the job_id index before joining.
tf_idf_features = pd.DataFrame(tf_idf_matrix.toarray(), columns=tf_idf_feature_names, index=X.index)
X = X.drop(CORPUS_COLUMN_NAME, axis=1).join(tf_idf_features)

### Random Forest Classification without Synthetic Minority Oversampling Technique (SMOTE)

In [7]:
# 10-fold stratified cross-validation of a random forest on the original
# (imbalanced) data. Accuracy should be read against the majority-class baseline.
skf = StratifiedKFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True)
# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ on every run, so the scores are not reproducible.
rfc = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=RANDOM_STATE)
scores = cross_val_score(rfc, X, y, scoring='accuracy', cv=skf, n_jobs=-1)
scores

array([0.97035794, 0.9753915 , 0.97147651, 0.97762864, 0.97315436,
       0.9753915 , 0.97147651, 0.96979866, 0.97147651, 0.97706935])

In [8]:
# Average accuracy over the cross-validation folds.
np.mean(scores)

0.9733221476510068

### Random Forest Classification with SMOTE + Tomek Links (SMOTETomek)

In [9]:
smt = SMOTETomek(random_state=RANDOM_STATE)
X_res, y_res = smt.fit_resample(X, y)

In [10]:
skf = StratifiedKFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True)
rfc = RandomForestClassifier(n_estimators=100, criterion="entropy")
scores = cross_val_score(rfc, X_res, y_res, scoring='accuracy', cv=skf, n_jobs=-1)
scores

array([0.99471055, 0.99265354, 0.9929474 , 0.99559212, 0.99324126,
       0.99500441, 0.99559212, 0.99441669, 0.99323927, 0.99500294])

In [11]:
# Average accuracy over the cross-validation folds of the resampling experiment.
np.mean(scores)

0.9942400306262258