In [1]:
# Pandas for csv processing
import pandas as pd

# Natural Language Toolkit (NLTK) is a state-of-the-art solution to handle text data
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# We'll vectorize our input (turn textual data into numerical form) with
# TF-IDF model implemented in Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing Numpy just to find average CV score
import numpy as np

# We'll use Random Forest as classifier and F1 along with ROC AUC as model quality score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer, roc_auc_score

In [5]:
# Reading and viewing our data
# Most of the features are textual, but Telecomunication and Comnpany_Logo are boolean
data = pd.read_csv('/content/Job_Frauds.csv', encoding='latin1')
len_data = len(data)
data

Unnamed: 0,Job Title,Job Location,Department,Range_of_Salary,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,Other,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,,,,,,0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI â Environmental Systems Re...,"EDUCATION:Â Bachelorâs or Masterâs in GIS,...",Our culture is anything but corporateâwe hav...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time youâve v...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting-Â Desire to have ...,Health &amp; WellnessMedical planPrescription ...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,Full-time,,,,,0
17878,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [6]:
# Our dataset is highly imbalanced as we have the ratio
# of legal and fraudulent jobs ~17:1
data.Fraudulent.value_counts()

Fraudulent
0    17014
1      866
Name: count, dtype: int64

In [7]:
# Data description says that we have many missing values among the data,
# so we'll drop the features that have 50% or more values missing,
# others' NAs will be filled with mode
na_cols = []

for col in data.columns:
    na_rate = data[col].isna().sum() / len_data
    print(f"Column {col} has {round(na_rate, 2) * 100}% of missing values")
    if na_rate > 0.5:
        na_cols.append(col)

print(f"Columns {na_cols} have more than 50% of missing values and will be dropped")
data = data.drop(columns=na_cols)

for col in data.drop(columns=['Fraudulent']).columns:
    data[col] = data[col].fillna(data[col].mode().iloc[0])

Column Job Title has 0.0% of missing values
Column Job Location has 2.0% of missing values
Column Department has 65.0% of missing values
Column Range_of_Salary has 84.0% of missing values
Column Profile has 19.0% of missing values
Column Job_Description has 0.0% of missing values
Column Requirements has 15.0% of missing values
Column Job_Benefits has 40.0% of missing values
Column Telecomunication has 0.0% of missing values
Column Comnpany_Logo has 0.0% of missing values
Column Type_of_Employment has 19.0% of missing values
Column Experience has 39.0% of missing values
Column Qualification has 45.0% of missing values
Column Type_of_Industry has 27.0% of missing values
Column Operations has 36.0% of missing values
Column Fraudulent has 0.0% of missing values
Columns ['Department', 'Range_of_Salary'] have more than 50% of missing values and will be dropped


In [8]:
# Final look of our data; two features are dropped as they have too many NAs,
# others are filled with their modes
data

Unnamed: 0,Job Title,Job Location,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent
0,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,See job description,0,1,Other,Internship,Bachelor's Degree,Information Technology and Services,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,Full-time,Not Applicable,Bachelor's Degree,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,See job description,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Information Technology,0
3,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI â Environmental Systems Re...,"EDUCATION:Â Bachelorâs or Masterâs in GIS,...",Our culture is anything but corporateâwe hav...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Vend is looking for some awesome new talent to...,Just in case this is the first time youâve v...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
17876,Payroll Accountant,"US, PA, Philadelphia",WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting-Â Desire to have ...,Health &amp; WellnessMedical planPrescription ...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,See job description,0,0,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Information Technology,0
17878,Graphic Designer,"NG, LA, Lagos",We help teachers get safe &amp; secure jobs ab...,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [11]:
# Let's perform the most important task - the text preprocessing
data_inline = pd.DataFrame()

import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')

stop_words = stopwords.words('english') # We define a set of stopwords...
sbs = SnowballStemmer(language='english') # ...and a stemmer to delete word endings

descripts = [] # List of final lines of job descriptions

# Each entry in dataset will be represented as a single line of concatenated
# textual features and Telecomunication and Comnpany_Logo flags
for i in range(len(data)):
    line = ''
    for col in data.drop(columns=['Telecomunication', 'Comnpany_Logo', 'Fraudulent']).columns:
        clear_text = ''
        cell_text = str(data.iloc[i][col])

        # Tokenizing, stopwords filtering and stemming
        cell_tokens = word_tokenize(cell_text)
        for token in cell_tokens:
            if token.lower() not in stop_words:
                clear_text += sbs.stem(token) + " "

        line += clear_text + " "

    descripts.append(line)

data_inline['Description'] = descripts
data_inline['Telecomunication'] = data.Telecomunication
data_inline['Conmpany_Logo'] = data.Comnpany_Logo
data_inline['Fraudulent'] = data.Fraudulent

data_inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Description,Telecomunication,Conmpany_Logo,Fraudulent
0,"market intern us , ny , new york re food52 ,...",0,1,0
1,"custom servic - cloud video product nz , , au...",0,1,0
2,"commiss machineri assist ( cma ) us , ia , we...",0,1,0
3,"account execut - washington dc us , dc , wash...",0,1,0
4,"bill review manag us , fl , fort worth spots...",0,1,0
...,...,...,...,...
17875,"account director - distribut ca , , toronto ...",0,1,0
17876,"payrol account us , pa , philadelphia weblin...",0,1,0
17877,project cost control staff engin - cost contro...,0,0,0
17878,"graphic design ng , la , lago help teacher g...",0,0,0


In [12]:
# Performing TF-IDF vectoring to represent text as a matrix of coefficients
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_inline.Description.values)
Y = data_inline.Fraudulent

In [13]:
# We have quite big number of columns as there are many unique words in the dataset;
# but hopefully it won't stop our classifier from being efficient
X.shape

(17880, 98853)

In [14]:
# Training and cross-validating of random forest with balanced weights that'll help us cope with class imbalance
rfc = RandomForestClassifier(random_state=42, class_weight='balanced')

cv_rfc = cross_validate(rfc, X, Y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=['f1_weighted', 'roc_auc'],
                       n_jobs=-1)
print(f"Average Random Forest F1 on CV is {round(np.mean(cv_rfc['test_f1_weighted']), 4)}")
print(f"Average Random Forest AUC on CV is {round(np.mean(cv_rfc['test_roc_auc']), 4)}")

Average Random Forest F1 on CV is 0.9726
Average Random Forest AUC on CV is 0.9864


We have reached almost 1.0 of F1 and AUC, but I bet this solution can be further enhanced. We can play around with vectoring and classification methods and even use recurrent neural networks that are proven to be outstanding in NLP tasks.