In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd


In [4]:
df_job=pd.read_csv(r'fake_job_postings.csv')
df_job.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [5]:
#Check columns 
df_job.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [6]:
df_job.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [9]:
df_job.dtypes

job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

In [10]:
df_job['department'] = df_job['department'].fillna(df_job['department'].mode()[0])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import feature_selection
from sklearn.impute import SimpleImputer

In [12]:
data_cat_imp=SimpleImputer(strategy="constant",fill_value="Missing")
cat_imp_feature=["title","location","department","salary_range","company_profile","description","requirements","benefits",
                 "employment_type","required_experience","required_education","industry","function"]

# Filling the Numerical values through existing value
data_num_imp=SimpleImputer(strategy="constant",fill_value=None)
num_imp_feature =["job_id","telecommuting","has_company_logo","has_questions","fraudulent"]

# Transforming into column
data_imp_trans=ColumnTransformer([("data_cat_imp",data_cat_imp,cat_imp_feature),
                                 ("data_num_imp",data_num_imp,num_imp_feature)])

# Transforming and assigning the data
transformed_data=data_imp_trans.fit_transform(df_job)
transformed_data

array([['Marketing Intern', 'US, NY, New York', 'Marketing', ..., 1, 0,
        0],
       ['Customer Service - Cloud Video Production', 'NZ, , Auckland',
        'Success', ..., 1, 0, 0],
       ['Commissioning Machinery Assistant (CMA)', 'US, IA, Wever',
        'Sales', ..., 1, 0, 0],
       ...,
       ['Project Cost Control Staff Engineer - Cost Control Exp - TX',
        'US, TX, Houston', 'Sales', ..., 0, 0, 0],
       ['Graphic Designer', 'NG, LA, Lagos', 'Sales', ..., 0, 1, 0],
       ['Web Application Developers', 'NZ, N, Wellington', 'Engineering',
        ..., 1, 1, 0]], dtype=object)

In [13]:
#Transforming the data into data frame
df_job_transformed_data=pd.DataFrame(transformed_data,
                         columns=["title","location","department","salary_range","company_profile","description",
                                  "requirements","benefits", "employment_type","required_experience","required_education",
                                  "industry","function","job_id","telecommuting","has_company_logo","has_questions",
                                  "fraudulent"])

In [14]:
df_job_transformed_data.head(2)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,Missing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Missing,Other,Internship,Missing,Missing,Marketing,1,0,1,0,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Missing,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,Missing,Marketing and Advertising,Customer Service,2,0,1,0,0


In [15]:
df_job_transformed_data.isna().sum()

title                  0
location               0
department             0
salary_range           0
company_profile        0
description            0
requirements           0
benefits               0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
job_id                 0
telecommuting          0
has_company_logo       0
has_questions          0
fraudulent             0
dtype: int64

In [16]:
X_trans = df_job_transformed_data.drop("fraudulent",axis=1)
y_trans = df_job_transformed_data.fraudulent
y_trans=y_trans.astype('int')

#shape(row,column) of features and label
X_trans.shape, y_trans.shape,X_trans.columns

((17880, 17),
 (17880,),
 Index(['title', 'location', 'department', 'salary_range', 'company_profile',
        'description', 'requirements', 'benefits', 'employment_type',
        'required_experience', 'required_education', 'industry', 'function',
        'job_id', 'telecommuting', 'has_company_logo', 'has_questions'],
       dtype='object'))

In [17]:
X_trans

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions
0,Marketing Intern,"US, NY, New York",Marketing,Missing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Missing,Other,Internship,Missing,Missing,Marketing,1,0,1,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Missing,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,Missing,Marketing and Advertising,Customer Service,2,0,1,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Sales,Missing,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Missing,Missing,Missing,Missing,Missing,Missing,3,0,1,0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Missing,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,4,0,1,0
4,Bill Review Manager,"US, FL, Fort Worth",Sales,Missing,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,5,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Sales,Missing,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,Full-time,Mid-Senior level,Missing,Computer Software,Sales,17876,0,1,1
17876,Payroll Accountant,"US, PA, Philadelphia",Accounting,Missing,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,17877,0,1,1
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",Sales,Missing,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,Missing,Full-time,Missing,Missing,Missing,Missing,17878,0,0,0
17878,Graphic Designer,"NG, LA, Lagos",Sales,Missing,Missing,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,Contract,Not Applicable,Professional,Graphic Design,Design,17879,0,0,1


In [18]:
# Instantation of One Hot Encoder for categorical data tarnsformatio into Numeric 
one_hot=OneHotEncoder()
clf_trans=ColumnTransformer([("one_hot",one_hot,cat_imp_feature)],remainder="passthrough")
X_trans_fin=clf_trans.fit_transform(X_trans)
np.array(X_trans_fin)

array(<17880x51438 sparse matrix of type '<class 'numpy.float64'>'
	with 274099 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [19]:
#splitting the data into train and test with 23% reserved for testing and 77% for training
X_train,X_test,y_train,y_test=train_test_split(X_trans_fin,y_trans,test_size=0.23, random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((13767, 51438), (4113, 51438), (13767,), (4113,))

In [20]:
model_rfm=RandomForestClassifier()

#fitting the data into model
model_rfm.fit(X_train,y_train)

RandomForestClassifier()

In [21]:
print(f"Fake Job Random Forest Model Accuracy : {model_rfm.score(X_test,y_test)*100:.2f}%")

Fake Job Random Forest Model Accuracy : 98.47%


In [22]:
y_pred_rfm=model_rfm.predict(X_test)
y_pred_rfm

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
from sklearn.metrics import classification_report

In [24]:
#classification report
print(classification_report(y_test,y_pred_rfm))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3909
           1       1.00      0.69      0.82       204

    accuracy                           0.98      4113
   macro avg       0.99      0.85      0.90      4113
weighted avg       0.98      0.98      0.98      4113

