In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/fake_job_postings.csv')


In [3]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [4]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [5]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [6]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [7]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

In [10]:
data.columns

Index(['job_id', 'title', 'department', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'state', 'city', 'min_salary', 'max_salary'],
      dtype='object')

In [11]:
pd.crosstab(data.fraudulent, data.min_salary=='N/A')

min_salary,False,True
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2645,14369
1,223,643


 TF-IDF - XGBoost

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from matplotlib import pyplot


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pintusingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pintusingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# tokenizer define
ps = PorterStemmer()
stop = set(stopwords.words('english'))
def tokenizer (doc):
    sentences = sent_tokenize(doc)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word) for word in words]
        tokens+=words
    return [w.lower() for w in tokens if w not in stop]

In [15]:
# combine text features and vectorize
df1 = data.copy()
text_feature = df1[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=tokenizer,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

text_feature = tfidf.fit_transform(text_feature)




In [16]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df1[col] = lb.fit_transform(df1[col])


In [17]:
# scale
label_feature = df1[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [19]:
# build model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.994565  0.977448  0.985932      1685
           1   0.669565  0.895349  0.766169        86

    accuracy                       0.973461      1771
   macro avg   0.832065  0.936398  0.876051      1771
weighted avg   0.978783  0.973461  0.975261      1771

0.9363984542129599


In [20]:
pred_val = tfidf_clf.predict(X_val)
print(classification_report(y_val, pred_val, digits= 6))

              precision    recall  f1-score   support

           0   0.994565  0.977448  0.985932      1685
           1   0.669565  0.895349  0.766169        86

    accuracy                       0.973461      1771
   macro avg   0.832065  0.936398  0.876051      1771
weighted avg   0.978783  0.973461  0.975261      1771



In [21]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, pred_val)
metrics.auc(fpr, tpr)

NameError: name 'metrics' is not defined

In [None]:
pred_test = tfidf_clf.predict(X_test)
print(classification_report(y_test, pred_test, digits= 6))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_test)
metrics.auc(fpr, tpr)

BOW - XGBoost

In [None]:
# combine text features and vectorize
df2 = data.copy()
text_feature = df2[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

bow = CountVectorizer(tokenizer=tokenizer)

text_feature = bow.fit_transform(text_feature)

In [None]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df2[col] = lb.fit_transform(df2[col])


In [None]:
# scale
label_feature = df2[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [None]:
# build model
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = XGBClassifier(use_label_encoder=False, learning_rate=0.02, n_estimators=300, objective='binary:logistic', subsample= 1.0, min_child_weight= 5, max_depth=5, gamma=5, scale_pos_weight= 20).fit(X_train, y_train)
#pred_val = bow_clf.predict(X_val)
#print(classification_report(y_val, pred_val))

In [None]:
pred_val = bow_clf.predict(X_val)
print(classification_report(y_val, pred_val, digits= 6))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, pred_val)
metrics.auc(fpr, tpr)

In [None]:
pred_test = bow_clf.predict(X_test)
print(classification_report(y_test, pred_test, digits= 6))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_test)
metrics.auc(fpr, tpr)