TODO:
- Use baseline classifier
- Do some lemmitization or stemming
- Use KFold stratified sampling
- Find the best model
- Find the most important words used in fake and non-fake job postings

### Import Libraries

In [7]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score 

In [2]:
FILENAME = "cleaned_fake_job_postings.csv"
INDEX = "job_id"
TARGET_VARIABLE = "fraudulent"
RANDOM_STATE = 12345

### Load data

In [3]:
df = pd.read_csv(FILENAME)
df.set_index(INDEX, inplace=True)
df.head()

Unnamed: 0_level_0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,False,True,False,Other,Junior,,Business,False
2,Customer Service - Cloud Video Production,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,False,True,False,Full-time,,,Business,False
3,Commissioning Machinery Assistant (CMA),Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,False,True,False,,,,,False
4,Account Executive - Washington DC,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,False,True,False,Full-time,Standard,Undergraduate,Business,False
5,Bill Review Manager,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,False,True,True,Full-time,Standard,Undergraduate,Healthcare,False


### Encode Categorical data

In [4]:
categorical_columns = ["telecommuting", "has_company_logo", "has_questions", "employment_type", "required_experience", "required_education", "function", "fraudulent"]
encoders = {}

for col in categorical_columns:    
    le = preprocessing.LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])
    encoders[col] = le

df[categorical_columns].head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,1,0,1,0,4,0,0
2,0,1,0,0,3,4,0,0
3,0,1,0,3,3,4,10,0
4,0,1,0,0,2,3,0,0
5,0,1,1,0,2,3,5,0


### Split Data into Training and Testing Sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("fraudulent", axis=1), df.loc[:, "fraudulent"], test_size=0.25, random_state=RANDOM_STATE)

### TF-IDF Feature Extraction from Text Data

In [6]:
text_columns = ["title", "company_profile", "description", "requirements", "benefits"]
MAX_FEATURES = 100
NGRAM_RANGE = (1,1)

corpus_train = X_train.loc[:, text_columns] \
    .fillna("") \
    .apply(lambda row: " ".join([row[col] for col in text_columns]), axis=1)

corpus_test = X_test.loc[:, text_columns] \
    .fillna("") \
    .apply(lambda row: " ".join([row[col] for col in text_columns]), axis=1)

tf_idf = TfidfVectorizer(stop_words="english", ngram_range=NGRAM_RANGE, max_features=MAX_FEATURES)

tf_idf.fit(corpus_train)
tf_idf_feature_names = "tfidf_" + np.array(tf_idf.get_feature_names(), dtype=object)
tf_idf_train = pd.DataFrame(tf_idf.transform(corpus_train).toarray(), columns=tf_idf_feature_names, index=corpus_train.index)
tf_idf_test = pd.DataFrame(tf_idf.transform(corpus_test).toarray(), columns=tf_idf_feature_names, index=corpus_test.index)

X_train = X_train.drop(text_columns, axis=1).join(tf_idf_train)
X_test = X_test.drop(text_columns, axis=1).join(tf_idf_test)
X_train.head()

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,tfidf_ability,tfidf_able,tfidf_amp,...,tfidf_training,tfidf_understanding,tfidf_using,tfidf_want,tfidf_web,tfidf_work,tfidf_working,tfidf_world,tfidf_year,tfidf_years
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3124,0,1,0,2,0,0,0,0.085491,0.0,0.0,...,0.106298,0.0,0.0,0.0,0.0,0.285153,0.0,0.097496,0.0,0.072501
3220,0,1,1,0,2,2,6,0.0,0.0,0.0,...,0.0,0.0,0.216067,0.0,0.104085,0.056574,0.0,0.0,0.0,0.215762
10175,0,1,1,3,3,0,10,0.0,0.0,0.0,...,0.152189,0.0,0.0,0.0,0.0,0.163303,0.217048,0.0,0.325957,0.0
3563,0,0,1,3,3,4,10,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.228416,0.124153,0.0,0.0,0.0,0.0
13763,0,1,0,3,3,4,10,0.08043,0.0,0.153998,...,0.100006,0.0,0.0,0.0,0.197427,0.053655,0.0,0.0,0.0,0.068209


### Baseline Classifier

In [10]:
baseline_classifier = DummyClassifier(strategy="most_frequent")
baseline_classifier.fit(X_train, y_train)
baseline_classifier.score(X_test, y_test)

0.9550335570469799