TODO:
- Do some lemmitization or stemming
- Find the best model
- Find the most important words used in fake and non-fake job postings
- clean text data before tf-idf

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
FILENAME = "cleaned_fake_job_postings.csv"
INDEX = "job_id"
TARGET_VARIABLE = "fraudulent"
RANDOM_STATE = 12345

## Load data

In [3]:
df = pd.read_csv(FILENAME)
df.set_index(INDEX, inplace=True)
df.head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,False,True,False,Other,Junior,,Business,False,"Marketing Intern We're Food52, and we've creat..."
2,False,True,False,Full-time,,,Business,False,Customer Service - Cloud Video Production 90 S...
3,False,True,False,,,,,False,Commissioning Machinery Assistant (CMA) Valor ...
4,False,True,False,Full-time,Standard,Undergraduate,Business,False,Account Executive - Washington DC Our passion ...
5,False,True,True,Full-time,Standard,Undergraduate,Healthcare,False,Bill Review Manager SpotSource Solutions LLC i...


## Encode Categorical data

In [4]:
categorical_columns = ["telecommuting", "has_company_logo", "has_questions", "employment_type", "required_experience", "required_education", "function", "fraudulent"]
encoders = {}

for col in categorical_columns:    
    le = preprocessing.LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,1,0,1,0,4,0,0,"Marketing Intern We're Food52, and we've creat..."
2,0,1,0,0,3,4,0,0,Customer Service - Cloud Video Production 90 S...
3,0,1,0,3,3,4,10,0,Commissioning Machinery Assistant (CMA) Valor ...
4,0,1,0,0,2,3,0,0,Account Executive - Washington DC Our passion ...
5,0,1,1,0,2,3,5,0,Bill Review Manager SpotSource Solutions LLC i...


## Baseline Classifier

In [5]:
X = df.drop("fraudulent", axis=1)
y = df.loc[:, "fraudulent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE)

bc = DummyClassifier(strategy="most_frequent")
bc.fit(X_train, y_train)
bc.score(X_test, y_test)

0.9516778523489933

## Random Forest Classification using TF-IDF

In [6]:
TFIDF_MAX_FEATURES = 100
TFIDF_NGRAM_RANGE = (1,1)
CORPUS_COLUMN_NAME = "text"

def random_forest_classify(X_train, y_train, X_test, y_test):
    tf_idf = TfidfVectorizer(stop_words="english", ngram_range=TFIDF_NGRAM_RANGE, max_features=TFIDF_MAX_FEATURES)
    tf_idf.fit(X_train[CORPUS_COLUMN_NAME])
    tf_idf_feature_names = "tfidf_" + np.array(tf_idf.get_feature_names(), dtype=object)
    tf_idf_train = pd.DataFrame(tf_idf.transform(X_train[CORPUS_COLUMN_NAME]).toarray(), columns=tf_idf_feature_names, index=X_train.index)
    tf_idf_test = pd.DataFrame(tf_idf.transform(X_test[CORPUS_COLUMN_NAME]).toarray(), columns=tf_idf_feature_names, index=X_test.index)

    X_train = X_train.drop(CORPUS_COLUMN_NAME, axis=1).join(tf_idf_train)
    X_test = X_test.drop(CORPUS_COLUMN_NAME, axis=1).join(tf_idf_test)
    
    rfc = RandomForestClassifier(n_estimators=100, criterion="entropy")
    rfc.fit(X_train, y_train)
    return rfc.score(X_test, y_test)
    

In [7]:
X = df.drop("fraudulent", axis=1)
y = df.loc[:, "fraudulent"]
skf = StratifiedKFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True)
accuracies = []
i = 1
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]
    
    accuracy = random_forest_classify(X_train, y_train, X_test, y_test)
    accuracies.append(accuracy)
    print("k-fold =",i,"complete")
    i += 1

accuracies

k-fold = 1 complete
k-fold = 2 complete
k-fold = 3 complete
k-fold = 4 complete
k-fold = 5 complete
k-fold = 6 complete
k-fold = 7 complete
k-fold = 8 complete
k-fold = 9 complete
k-fold = 10 complete


[0.9692393736017897,
 0.9742729306487695,
 0.9714765100671141,
 0.9776286353467561,
 0.9737136465324385,
 0.9759507829977628,
 0.9697986577181208,
 0.9697986577181208,
 0.9737136465324385,
 0.9781879194630873]

In [8]:
np.array(accuracies).mean()

0.9733780760626398

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("fraudulent", axis=1), df.loc[:, "fraudulent"], test_size=0.25, random_state=RANDOM_STATE)

In [10]:
MAX_FEATURES = 100
NGRAM_RANGE = (1,1)

corpus_train = X_train.loc[:, text_columns] \
    .fillna("") \
    .apply(lambda row: " ".join([row[col] for col in text_columns]), axis=1)

corpus_test = X_test.loc[:, text_columns] \
    .fillna("") \
    .apply(lambda row: " ".join([row[col] for col in text_columns]), axis=1)

tf_idf = TfidfVectorizer(stop_words="english", ngram_range=NGRAM_RANGE, max_features=MAX_FEATURES)

tf_idf.fit(corpus_train)
tf_idf_feature_names = "tfidf_" + np.array(tf_idf.get_feature_names(), dtype=object)
tf_idf_train = pd.DataFrame(tf_idf.transform(corpus_train).toarray(), columns=tf_idf_feature_names, index=corpus_train.index)
tf_idf_test = pd.DataFrame(tf_idf.transform(corpus_test).toarray(), columns=tf_idf_feature_names, index=corpus_test.index)

X_train = X_train.drop(text_columns, axis=1).join(tf_idf_train)
X_test = X_test.drop(text_columns, axis=1).join(tf_idf_test)
X_train.head()

NameError: name 'text_columns' is not defined