In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/fake_job_postings.csv')

In [3]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [4]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [5]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [6]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [7]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

In [8]:
data.columns

Index(['job_id', 'title', 'department', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'state', 'city', 'min_salary', 'max_salary'],
      dtype='object')

TF-IDF - Logistic Regression

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from matplotlib import pyplot


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pintusingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pintusingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/pintusingh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
# tokenizer define
ps = PorterStemmer()
stop = set(stopwords.words('english'))
def tokenizer (doc):
    sentences = sent_tokenize(doc)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word) for word in words]
        tokens+=words
    return [w.lower() for w in tokens if w not in stop]

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine text features
df1 = data.copy()
text_feature = df1[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

# TF-IDF without custom tokenizer (much faster)
tfidf = TfidfVectorizer(
    stop_words='english',   # automatically removes stopwords
    max_features=5000,      # optional: limit number of features to speed up
    lowercase=True,
    use_idf=True,
    norm='l2',
    smooth_idf=True
)

text_feature = tfidf.fit_transform(text_feature)
print("TF-IDF shape:", text_feature.shape)


TF-IDF shape: (17880, 5000)


In [40]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df1[col] = lb.fit_transform(df1[col])


In [41]:
# scale
label_feature = df1[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [42]:
# build model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.993895  0.966172  0.979837      1685
           1   0.571429  0.883721  0.694064        86

    accuracy                       0.962168      1771
   macro avg   0.782662  0.924947  0.836951      1771
weighted avg   0.973380  0.962168  0.965960      1771

0.924946518528742


In [43]:
# checking feature importance
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", tfidf_clf.coef_[0][-8:][i])


employment_type | 0.05953826736532636
required_experience | 0.3053465674503286
required_education | 0.22142551086187695
industry | -0.045865731389627386
function | -0.37745721371698776
country | 0.45750967491232875
state | 0.07470962007682601
city | 0.06434221931930184


In [44]:
# if only processed with text - lower accruacy and other scores
# if keep 'required_experience', 'required_education', 'function', 'country' - same result as using all varialbes
# label_feature = df1[['required_experience', 'required_education', 'function', 'country']]
# scaler = StandardScaler().fit(label_feature)

# label_feature = scaler.transform(label_feature)
# X = hstack((text_feature, label_feature))
X = text_feature
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.993309  0.969139  0.981075      1685
           1   0.590551  0.872093  0.704225        86

    accuracy                       0.964427      1771
   macro avg   0.791930  0.920616  0.842650      1771
weighted avg   0.973751  0.964427  0.967632      1771

0.9206162445655924


In [45]:
# tunning
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=10, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992216  0.983383  0.987779      1685
           1   0.722772  0.848837  0.780749        86

    accuracy                       0.976849      1771
   macro avg   0.857494  0.916110  0.884264      1771
weighted avg   0.979131  0.976849  0.977726      1771

0.9161099993099164


In [46]:
# tunning
tfidf_clf = LogisticRegression(C=0.1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992268  0.913947  0.951498      1685
           1   0.337900  0.860465  0.485246        86

    accuracy                       0.911350      1771
   macro avg   0.665084  0.887206  0.718372      1771
weighted avg   0.960492  0.911350  0.928857      1771

0.8872058519080809


In [47]:
confusion_matrix(y_val, tfidf_clf.predict(X_val))

array([[1540,  145],
       [  12,   74]])

In [48]:
# final model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)


tfidf_clf = LogisticRegression(C=1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
#print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
#print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

In [49]:
# Apply on test set
print(classification_report(y_test, tfidf_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, tfidf_clf.predict(X_test), average='macro'))


              precision    recall  f1-score   support

           0   0.995154  0.965902  0.980310      1701
           1   0.576642  0.908046  0.705357        87

    accuracy                       0.963087      1788
   macro avg   0.785898  0.936974  0.842834      1788
weighted avg   0.974791  0.963087  0.966932      1788

0.9369741936791744


In [50]:
confusion_matrix(y_test, tfidf_clf.predict(X_test), labels=[1,0]) 

array([[  79,    8],
       [  58, 1643]])

In [51]:
# combine text features and vectorize
df2 = data.copy()
text_feature = df2[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

bow = CountVectorizer(tokenizer=tokenizer)

text_feature = bow.fit_transform(text_feature)



In [52]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df2[col] = lb.fit_transform(df2[col])

In [53]:
# scale
label_feature = df2[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [54]:
# build model
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.991652  0.986944  0.989292      1685
           1   0.765957  0.837209  0.800000        86

    accuracy                       0.979673      1771
   macro avg   0.878805  0.912076  0.894646      1771
weighted avg   0.980692  0.979673  0.980100      1771

0.9120764612518115


In [55]:
# feature importance
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", bow_clf.coef_[0][-8:][i])


employment_type | 0.3155537439707987
required_experience | 0.4365414474994748
required_education | 0.4252372401814759
industry | -0.5649960248300391
function | -0.6783747326971861
country | 0.31108501845867276
state | -0.05559409285537898
city | 0.161398950885278


In [56]:
# adjusted model
label_feature = df2[['industry', 'function']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992262  0.989318  0.990788      1685
           1   0.802198  0.848837  0.824859        86

    accuracy                       0.982496      1771
   macro avg   0.897230  0.919077  0.907823      1771
weighted avg   0.983032  0.982496  0.982730      1771

0.9190773583603615


In [57]:
# tuning
bow_clf = LogisticRegression(C=5, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.991087  0.989911  0.990499      1685
           1   0.806818  0.825581  0.816092        86

    accuracy                       0.981931      1771
   macro avg   0.898953  0.907746  0.903295      1771
weighted avg   0.982139  0.981931  0.982030      1771

0.9077461872886621


In [58]:
# final model
label_feature = df2[['industry', 'function']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)
X = hstack((text_feature, label_feature))
# X = text_feature
y = df2['fraudulent']

bow_clf = LogisticRegression(C=5, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)

In [59]:
# apply on test set
print(classification_report(y_test, bow_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, bow_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.992962  0.995297  0.994128      1701
           1   0.903614  0.862069  0.882353        87

    accuracy                       0.988814      1788
   macro avg   0.948288  0.928683  0.938240      1788
weighted avg   0.988614  0.988814  0.988689      1788

0.9286829248515073


In [60]:
confusion_matrix(y_test, bow_clf.predict(X_test), labels=[1,0])

array([[  75,   12],
       [   8, 1693]])