In [1]:
#I have imported the below libraries as needed, seaborn for visualization, matlob to create various kinds of charts and graphs, numpy to perform numerical operations in python, pandas for data manipilation and analysis.
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd

In [2]:
#load the dataset 
df_job=pd.read_csv(r'fake_job_postings.csv')
df_job.head(2)

Unnamed: 0,job_id,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0.0,1.0,0.0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0.0,1.0,0.0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [3]:
#Check columns 
df_job.columns

Index(['job_id', 'title', 'location', 'department', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [4]:
#I am using the below function to count the number of missing values (NaN or null values) in each column of a DataFrame.
df_job.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
company_profile         3308
description                2
requirements            2697
benefits                7213
telecommuting              1
has_company_logo           1
has_questions              1
employment_type         3472
required_experience     7051
required_education      8106
industry                4904
function                6456
fraudulent                 0
dtype: int64

In [5]:
#I have used the below function to handle missing values in a DataFrame (df) by replacing them with 0. 
df_job.dtypes


job_id                   int64
title                   object
location                object
department              object
company_profile         object
description             object
requirements            object
benefits                object
telecommuting          float64
has_company_logo       float64
has_questions          float64
employment_type         object
required_experience     object
required_education      object
industry                object
function                object
fraudulent               int64
dtype: object

In [6]:
#I've used the below function to retrieve the data types of each column in the dataframe
df_job.dtypes

job_id                   int64
title                   object
location                object
department              object
company_profile         object
description             object
requirements            object
benefits                object
telecommuting          float64
has_company_logo       float64
has_questions          float64
employment_type         object
required_experience     object
required_education      object
industry                object
function                object
fraudulent               int64
dtype: object

In [7]:
#I've used the below function to clean and preprocess text data in the 'description' column of a DataFrame (df) such that any character that is not a letter (either uppercase or lowercase) or a whitespace character to nothing.
df_job['department'] = df_job['department'].fillna(df_job['department'].mode()[0])
df_job.head(2)

Unnamed: 0,job_id,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0.0,1.0,0.0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0.0,1.0,0.0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [8]:
#importing the required packages
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import feature_selection
from sklearn.impute import SimpleImputer

In [12]:
#selecting feature engineering
data_cat_imp=SimpleImputer(strategy="constant",fill_value="missing")
cat_imp_feature=["title","location","department","company_profile","description","requirements","benefits",
                 "employment_type","required_experience","required_education","industry","function"]

In [13]:
# Filling the Numerical values through existing value
data_num_imp=SimpleImputer(strategy="constant",fill_value=0)
num_imp_feature =["job_id","telecommuting","has_company_logo","has_questions","fraudulent"]

In [14]:
# Transforming into column
data_imp_trans=ColumnTransformer([("data_cat_imp",data_cat_imp,cat_imp_feature),
                                 ("data_num_imp",data_num_imp,num_imp_feature)])

# Transforming and assigning the data
transformed_data=data_imp_trans.fit_transform(df_job)
transformed_data

array([['Marketing Intern', 'US, NY, New York', 'Marketing', ..., 1.0,
        0.0, 0.0],
       ['Customer Service - Cloud Video Production', 'NZ, , Auckland',
        'Success', ..., 1.0, 0.0, 0.0],
       ['Commissioning Machinery Assistant (CMA)', 'US, IA, Wever',
        'Sales', ..., 1.0, 0.0, 0.0],
       ...,
       ['Graphic Designer', 'NG, LA, Lagos', 'Sales', ..., 0.0, 1.0, 0.0],
       ['Web Application Developers', 'NZ, N, Wellington', 'Engineering',
        ..., 1.0, 1.0, 0.0],
       ['Data Analyst', 'US,TX, Houston', 'Engineering', ..., 0.0, 0.0,
        0.0]], dtype=object)

In [16]:
#Transforming the data into data frame
df_job_transformed_data=pd.DataFrame(transformed_data,
                         columns=["title","location","department","company_profile","description",
                                  "requirements","benefits", "employment_type","required_experience","required_education",
                                  "industry","function","job_id","telecommuting","has_company_logo","has_questions",
                                  "fraudulent"])

In [17]:
df_job_transformed_data.head(2)

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,missing,Other,Internship,missing,missing,Marketing,1.0,0.0,1.0,0.0,0.0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,missing,Marketing and Advertising,Customer Service,2.0,0.0,1.0,0.0,0.0


In [18]:
df_job_transformed_data.isna().sum()

title                  0
location               0
department             0
company_profile        0
description            0
requirements           0
benefits               0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
job_id                 0
telecommuting          0
has_company_logo       0
has_questions          0
fraudulent             0
dtype: int64

In [19]:
#part of a feature engineering process, specifically dealing with imputation of missing values in categorical features
X_trans = df_job_transformed_data.drop("fraudulent",axis=1)
y_trans = df_job_transformed_data.fraudulent
y_trans=y_trans.astype('int')

#shape(row,column) of features and label
X_trans.shape, y_trans.shape,X_trans.columns

((17881, 16),
 (17881,),
 Index(['title', 'location', 'department', 'company_profile', 'description',
        'requirements', 'benefits', 'employment_type', 'required_experience',
        'required_education', 'industry', 'function', 'job_id', 'telecommuting',
        'has_company_logo', 'has_questions'],
       dtype='object'))

In [20]:
X_trans

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,job_id,telecommuting,has_company_logo,has_questions
0,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,missing,Other,Internship,missing,missing,Marketing,1.0,0.0,1.0,0.0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,Full-time,Not Applicable,missing,Marketing and Advertising,Customer Service,2.0,0.0,1.0,0.0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Sales,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,missing,missing,missing,missing,missing,missing,3.0,0.0,1.0,0.0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,4.0,0.0,1.0,0.0
4,Bill Review Manager,"US, FL, Fort Worth",Sales,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,5.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17876,Payroll Accountant,"US, PA, Philadelphia",Accounting,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,17877.0,0.0,1.0,1.0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",Sales,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,missing,Full-time,missing,missing,missing,missing,17878.0,0.0,0.0,0.0
17878,Graphic Designer,"NG, LA, Lagos",Sales,missing,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,Contract,Not Applicable,Professional,Graphic Design,Design,17879.0,0.0,0.0,1.0
17879,Web Application Developers,"NZ, N, Wellington",Engineering,Vend is looking for some awesome new talent to...,Who are we?Vend is an award winning web based ...,We want to hear from you if:You have an in-dep...,missing,Full-time,Mid-Senior level,missing,Computer Software,Engineering,17880.0,0.0,1.0,1.0


In [21]:
# Instantation of One Hot Encoder for categorical data tarnsformatio into Numeric 
one_hot=OneHotEncoder()
clf_trans=ColumnTransformer([("one_hot",one_hot,cat_imp_feature)],remainder="passthrough")
X_trans_fin=clf_trans.fit_transform(X_trans)
np.array(X_trans_fin)

array(<17881x50560 sparse matrix of type '<class 'numpy.float64'>'
	with 256232 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [22]:
#splitting the data into train and test with 23% reserved for testing and 77% for training
X_train,X_test,y_train,y_test=train_test_split(X_trans_fin,y_trans,test_size=0.23, random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((13768, 50560), (4113, 50560), (13768,), (4113,))

In [23]:
model_rfm=RandomForestClassifier()

#fitting the data into model
model_rfm.fit(X_train,y_train)

In [21]:
print(f"Fake Job Random Forest Model Accuracy : {model_rfm.score(X_test,y_test)*100:.2f}%")

Fake Job Random Forest Model Accuracy : 98.57%


In [22]:
y_pred_rfm=model_rfm.predict(X_test)
y_pred_rfm

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
from sklearn.metrics import classification_report
#classification report
print(classification_report(y_test,y_pred_rfm))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3916
           1       1.00      0.70      0.82       197

    accuracy                           0.99      4113
   macro avg       0.99      0.85      0.91      4113
weighted avg       0.99      0.99      0.98      4113



In [24]:
# Splitting the data into train and test with 23% reserved for testing and 77% for training
X_train, X_test, y_train, y_test = train_test_split(X_trans_fin, y_trans, test_size=0.23, random_state=42)

In [25]:
# Logistic Regression model
model_lr = LogisticRegression()


In [26]:
# Fitting the data into the model
model_lr.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Model accuracy
print(f"Logistic Regression Model Accuracy: {model_lr.score(X_test, y_test) * 100:.2f}%")


Logistic Regression Model Accuracy: 95.87%


In [28]:
# Predictions
y_pred_lr = model_lr.predict(X_test)
y_pred_lr

# Classification report
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3916
           1       0.64      0.31      0.42       197

    accuracy                           0.96      4113
   macro avg       0.80      0.65      0.70      4113
weighted avg       0.95      0.96      0.95      4113



In [31]:
from sklearn.metrics import confusion_matrix

# Assuming y_true contains the true labels and y_pred contains the predicted labels
conf_matrix = confusion_matrix(y_test, y_pred_lr)

print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[3881   35]
 [ 135   62]]
