In [58]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [59]:
fake_real_data = pd.read_csv("fake_job_postings.csv")

In [60]:
columns = list(fake_real_data.columns)
null_columns = dict()
type_null_columns = list()
#Find all null values in the columns
for values in columns:
    null_values = fake_real_data[values].isnull().sum()
    #Add the to a dictionary for further processing
    if null_values > 0:
        null_columns[values] = null_values
        type_null_columns.append(type(fake_real_data[values][0]))
    print("Number of nan values in %s is %d"%(values,null_values))

Number of nan values in job_id is 0
Number of nan values in title is 0
Number of nan values in location is 346
Number of nan values in department is 11547
Number of nan values in salary_range is 15012
Number of nan values in company_profile is 3308
Number of nan values in description is 1
Number of nan values in requirements is 2695
Number of nan values in benefits is 7210
Number of nan values in telecommuting is 0
Number of nan values in has_company_logo is 0
Number of nan values in has_questions is 0
Number of nan values in employment_type is 3471
Number of nan values in required_experience is 7050
Number of nan values in required_education is 8105
Number of nan values in industry is 4903
Number of nan values in function is 6455
Number of nan values in fraudulent is 0


In [61]:
length_of_dataset = len(fake_real_data)
drop_list = list()
#Drop the list with atleast 30% null values
for key,value in null_columns.items():
    if value > int(length_of_dataset * 30 /100):
        print(key,":",value)
        drop_list.append(key)

department : 11547
salary_range : 15012
benefits : 7210
required_experience : 7050
required_education : 8105
function : 6455


In [62]:
for values in drop_list:
    fake_real_data = fake_real_data.drop(values,axis = 1)

In [63]:
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
0,1,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Marketing and Advertising,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Computer Software,0
4,5,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Hospital & Health Care,0


In [64]:
#Drop all the null values from rest of columns
fake_real_data = fake_real_data.dropna()

In [65]:
fake_real_data.isna().sum()

job_id              0
title               0
location            0
company_profile     0
description         0
requirements        0
telecommuting       0
has_company_logo    0
has_questions       0
employment_type     0
industry            0
fraudulent          0
dtype: int64

In [66]:
#Encoding non numerical attributes to numerical attributes
mapping = {k: v for v, k in enumerate(fake_real_data.title.unique())}
fake_real_data['title'] = fake_real_data.title.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
1,2,0,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Marketing and Advertising,0
3,4,1,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Computer Software,0
4,5,2,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Hospital & Health Care,0
6,7,3,"DE, BE, Berlin","Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,Full-time,Online Media,0
8,9,4,"US, FL, Pensacola",Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,0,1,1,Full-time,Information Technology and Services,0


In [67]:
mapping = {k: v for v, k in enumerate(fake_real_data.location.unique())}
fake_real_data['location'] = fake_real_data.location.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
1,2,0,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Marketing and Advertising,0
3,4,1,1,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Computer Software,0
4,5,2,2,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Hospital & Health Care,0
6,7,3,3,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,Full-time,Online Media,0
8,9,4,4,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,0,1,1,Full-time,Information Technology and Services,0


In [68]:
mapping = {k: v for v, k in enumerate(fake_real_data.employment_type.unique())}
fake_real_data['employment_type'] = fake_real_data.employment_type.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
1,2,0,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0,Marketing and Advertising,0
3,4,1,1,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0,Computer Software,0
4,5,2,2,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0,Hospital & Health Care,0
6,7,3,3,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,0,Online Media,0
8,9,4,4,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,0,1,1,0,Information Technology and Services,0


In [69]:
mapping = {k: v for v, k in enumerate(fake_real_data.industry.unique())}
fake_real_data['industry'] = fake_real_data.industry.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
1,2,0,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0,0,0
3,4,1,1,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0,1,0
4,5,2,2,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0,2,0
6,7,3,3,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,0,3,0
8,9,4,4,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,0,1,1,0,4,0


In [70]:
#Drop the descriptive columns to improve prediction
fake_real_data = fake_real_data.drop('description',axis = 1).drop('company_profile',axis=1).drop('requirements',axis=1)

In [71]:
fake_real_data.head()

Unnamed: 0,job_id,title,location,telecommuting,has_company_logo,has_questions,employment_type,industry,fraudulent
1,2,0,0,0,1,0,0,0,0
3,4,1,1,0,1,0,0,1,0
4,5,2,2,0,1,1,0,2,0
6,7,3,3,0,1,1,0,3,0
8,9,4,4,0,1,1,0,4,0


In [72]:
# split data into X and y
X = fake_real_data.iloc[:,0:5]
Y = fake_real_data.iloc[:,6]

In [73]:
#Splitting X and Y to training and testing dataset with 80% training sample and 20% testing sample
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [74]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [75]:
#Predict the model
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [76]:
#Get the Accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy for columns with less than 30% null values: %.2f%%" % (accuracy * 100.0))

Accuracy: 88.46%


In [77]:
#For Columns with atleast 20% Null Values
fake_real_data = pd.read_csv("fake_job_postings.csv")
columns = list(fake_real_data.columns)
null_columns = dict()
type_null_columns = list()
for values in columns:
    null_values = fake_real_data[values].isnull().sum()
    if null_values > 0:
        null_columns[values] = null_values
        type_null_columns.append(type(fake_real_data[values][0]))
length_of_dataset = len(fake_real_data)
drop_list = list()
for key,value in null_columns.items():
    if value > int(length_of_dataset * 20 /100):
        print(key,":",value)
        drop_list.append(key)
for values in drop_list:
    fake_real_data = fake_real_data.drop(values,axis = 1)

department : 11547
salary_range : 15012
benefits : 7210
required_experience : 7050
required_education : 8105
industry : 4903
function : 6455


In [78]:
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,fraudulent
0,1,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,Full-time,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,0
4,5,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,0


In [79]:
#Drop the rows with null values
fake_real_data = fake_real_data.dropna()

In [80]:
#Encode the Non Numerical attributes to numerical attributes
mapping = {k: v for v, k in enumerate(fake_real_data.title.unique())}
fake_real_data['title'] = fake_real_data.title.map(mapping)
mapping = {k: v for v, k in enumerate(fake_real_data.location.unique())}
fake_real_data['location'] = fake_real_data.location.map(mapping)
mapping = {k: v for v, k in enumerate(fake_real_data.employment_type.unique())}
fake_real_data['employment_type'] = fake_real_data.employment_type.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,fraudulent
0,1,0,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0,0
1,2,1,1,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,1,0
3,4,2,2,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,1,0
4,5,3,3,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,1,0
6,7,4,4,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,1,0


In [81]:
#Drop Descriptive attributes
fake_real_data = fake_real_data.drop('description',axis = 1).drop('company_profile',axis=1).drop('requirements',axis=1)

In [82]:
#Fit the Model
X = fake_real_data.iloc[:,0:5]
Y = fake_real_data.iloc[:,6]
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [83]:
#Predict the Accuracy
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy for columns with less than 20% null values: %.2f%%" % (accuracy * 100.0))

Accuracy: 86.55%


In [84]:
#For dropping columns with atleast 10% of Null values
fake_real_data = pd.read_csv("fake_job_postings.csv")
columns = list(fake_real_data.columns)
null_columns = dict()
type_null_columns = list()
for values in columns:
    null_values = fake_real_data[values].isnull().sum()
    if null_values > 0:
        null_columns[values] = null_values
        type_null_columns.append(type(fake_real_data[values][0]))
length_of_dataset = len(fake_real_data)
drop_list = list()
for key,value in null_columns.items():
    if value > int(length_of_dataset * 10 /100):
        print(key,":",value)
        drop_list.append(key)
for values in drop_list:
    fake_real_data = fake_real_data.drop(values,axis = 1)

department : 11547
salary_range : 15012
company_profile : 3308
requirements : 2695
benefits : 7210
employment_type : 3471
required_experience : 7050
required_education : 8105
industry : 4903
function : 6455


In [85]:
fake_real_data.head()

Unnamed: 0,job_id,title,location,description,telecommuting,has_company_logo,has_questions,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",0,1,0,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,0,1,0,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",0,1,0,0
3,4,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,0,1,0,0
4,5,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,0,1,1,0


In [86]:
#Drop the rows with Null values
fake_real_data = fake_real_data.dropna()

In [88]:
#Encode non numerical attributes to Numerical attributes
mapping = {k: v for v, k in enumerate(fake_real_data.title.unique())}
fake_real_data['title'] = fake_real_data.title.map(mapping)
mapping = {k: v for v, k in enumerate(fake_real_data.location.unique())}
fake_real_data['location'] = fake_real_data.location.map(mapping)
fake_real_data.head()

Unnamed: 0,job_id,title,location,description,telecommuting,has_company_logo,has_questions,fraudulent
0,1,0,0,"Food52, a fast-growing, James Beard Award-winn...",0,1,0,0
1,2,1,1,Organised - Focused - Vibrant - Awesome!Do you...,0,1,0,0
2,3,2,2,"Our client, located in Houston, is actively se...",0,1,0,0
3,4,3,3,THE COMPANY: ESRI – Environmental Systems Rese...,0,1,0,0
4,5,4,4,JOB TITLE: Itemization Review ManagerLOCATION:...,0,1,1,0


In [89]:
#Drop Descriptive attributes
fake_real_data = fake_real_data.drop('description',axis = 1)

In [90]:
#Fit the model
X = fake_real_data.iloc[:,0:5]
Y = fake_real_data.iloc[:,6]
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [91]:
#Predict the Accuracy
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy for columns with less than 10% null values: %.2f%%" % (accuracy * 100.0))

Accuracy: 97.78%
