In [3]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [4]:
# Load the job-postings dataset; the path is resolved relative to the
# current working directory.
DATA_PATH = "fake_job_postings.csv"
fake_real_data = pd.read_csv(DATA_PATH)

In [5]:
# Count the missing values of every column, print the count, and remember
# which columns have at least one missing value.
columns = fake_real_data.columns.tolist()
null_columns = {}        # column name -> number of missing values (only when > 0)
type_null_columns = []   # one entry per key of null_columns, in the same order
for values in columns:
    column = fake_real_data[values]
    null_values = column.isna().sum()
    has_missing = null_values > 0
    if has_missing:
        null_columns[values] = null_values
        # NOTE(review): only the entry labelled 0 is inspected; if that entry
        # is itself missing the recorded type is presumably float (NaN) rather
        # than the column's real value type -- confirm before relying on it.
        type_null_columns.append(type(column[0]))
    print("Number of nan values in %s is %d"%(values,null_values))

Number of nan values in job_id is 0
Number of nan values in title is 0
Number of nan values in location is 346
Number of nan values in department is 11547
Number of nan values in salary_range is 15012
Number of nan values in company_profile is 3308
Number of nan values in description is 1
Number of nan values in requirements is 2695
Number of nan values in benefits is 7210
Number of nan values in telecommuting is 0
Number of nan values in has_company_logo is 0
Number of nan values in has_questions is 0
Number of nan values in employment_type is 3471
Number of nan values in required_experience is 7050
Number of nan values in required_education is 8105
Number of nan values in industry is 4903
Number of nan values in function is 6455
Number of nan values in fraudulent is 0


In [6]:
# Collect (and print) the columns whose missing-value count is greater than
# 10% of the number of rows; these are the candidates for removal.
length_of_dataset = len(fake_real_data)
max_missing = int(length_of_dataset * 10 /100)
drop_list = []
for key, value in null_columns.items():
    if not value > max_missing:
        continue
    print(key,":",value)
    drop_list.append(key)

department : 11547
salary_range : 15012
company_profile : 3308
requirements : 2695
benefits : 7210
employment_type : 3471
required_experience : 7050
required_education : 8105
industry : 4903
function : 6455


In [7]:
# Remove all columns listed in drop_list with a single call instead of
# copying the frame once per column.
# errors="ignore" skips labels that are no longer present, so re-running this
# cell after the columns are already gone does not raise a KeyError.
fake_real_data = fake_real_data.drop(columns=drop_list, errors="ignore")

In [8]:
# Preview the first five rows of the frame at this stage.
fake_real_data.head(n=5)

Unnamed: 0,job_id,title,location,description,telecommuting,has_company_logo,has_questions,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",0,1,0,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,0,1,0,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",0,1,0,0
3,4,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,0,1,0,0
4,5,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,0,1,1,0


In [9]:
# Discard every row that still contains at least one missing value.
fake_real_data = fake_real_data.dropna(axis=0, how="any")

In [10]:
# Sanity check: per-column count of missing values that remain.
fake_real_data.isnull().sum()

job_id              0
title               0
location            0
description         0
telecommuting       0
has_company_logo    0
has_questions       0
fraudulent          0
dtype: int64

In [11]:
# Label-encode the job titles: each distinct title is replaced by an integer
# code, numbered in the order the titles are returned by unique().
title_levels = fake_real_data['title'].unique()
mapping = dict(zip(title_levels, range(len(title_levels))))
fake_real_data['title'] = fake_real_data['title'].map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,description,telecommuting,has_company_logo,has_questions,fraudulent
0,1,0,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",0,1,0,0
1,2,1,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,0,1,0,0
2,3,2,"US, IA, Wever","Our client, located in Houston, is actively se...",0,1,0,0
3,4,3,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,0,1,0,0
4,5,4,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,0,1,1,0


In [12]:
# Label-encode the locations: each distinct location string is replaced by an
# integer code, numbered in the order the values are returned by unique().
location_levels = fake_real_data['location'].unique()
mapping = dict(zip(location_levels, range(len(location_levels))))
fake_real_data['location'] = fake_real_data['location'].map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,description,telecommuting,has_company_logo,has_questions,fraudulent
0,1,0,0,"Food52, a fast-growing, James Beard Award-winn...",0,1,0,0
1,2,1,1,Organised - Focused - Vibrant - Awesome!Do you...,0,1,0,0
2,3,2,2,"Our client, located in Houston, is actively se...",0,1,0,0
3,4,3,3,THE COMPANY: ESRI – Environmental Systems Rese...,0,1,0,0
4,5,4,4,JOB TITLE: Itemization Review ManagerLOCATION:...,0,1,1,0


In [13]:
# The free-text description column is removed before the feature matrix is built.
fake_real_data = fake_real_data.drop(columns=['description'])

In [14]:
# Preview the first five rows of the fully numeric frame.
fake_real_data.head(n=5)

Unnamed: 0,job_id,title,location,telecommuting,has_company_logo,has_questions,fraudulent
0,1,0,0,0,1,0,0
1,2,1,1,0,1,0,0
2,3,2,2,0,1,0,0
3,4,3,3,0,1,0,0
4,5,4,4,0,1,1,0


In [18]:
# split data into X and y
X = fake_real_data.iloc[:,0:5]
Y = fake_real_data.iloc[:,6]

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
seed, test_size = 7, 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [21]:
# fit model on training data
# The classifier is created with its default hyper-parameters. The fit call is
# the last expression of the cell, so the notebook displays its return value.
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [23]:
# Predict on the held-out rows, then round every prediction to the nearest
# integer so the list can be compared with the true labels.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [24]:
# Share of held-out rows whose predicted label matches the true label,
# reported as a percentage.
# NOTE(review): if the fraudulent class is rare, accuracy alone can look high
# even for a model that misses most frauds -- consider also checking
# precision/recall; confirm the class balance first.
accuracy = accuracy_score(y_test, predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 97.78%
