# BT4012 Without Text Analysis

### Setting up the environment

In [1]:
# libraries importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
# setting up
# import packages here
# from google.colab import drive
# drive.mount('/content/drive')
# url = '/content/drive/MyDrive/fake_job_postings.csv'

# import from github repo
url = 'https://raw.githubusercontent.com/LordZhiHao/BT4012_Fraud_Analytics_Project/main/fake_job_postings.csv'

# read data
data = pd.read_csv(url)
df = data.copy()
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


### Handling Null Values

EDA plz do refer other scripts

Will dive straight into handling the null values and outliers

In [3]:
# keep track of what columns to keep and drop
cols_to_keep = []
cols_to_drop = []

In [4]:
# check for NA values
df.isna().any()

job_id                 False
title                  False
location                True
department              True
salary_range            True
company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type         True
required_experience     True
required_education      True
industry                True
function                True
fraudulent             False
dtype: bool

In [5]:
# check for num of na values
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [6]:
# handle location and description first since easier

# handle location - split into nation and city and fillna with unknown
df['location'] = df['location'].fillna('NA, Unknown')
df['country'] = df['location'].apply(lambda x : x.strip()[:2])
df['city'] = df['location'].apply(lambda x : x.split(',')[-1])

# a lot of sparse values noted for countries, may lead to unexpected results - to handle - keep countries with counts >10 only and put unknown for the rest
ls_of_countries = [country if df[df['country']==country]['country'].count() >= 10 else 'NA' for country in df['country'].unique()]
df['country'] = df['country'].apply(lambda x : x if x in ls_of_countries else 'NA')

ls_of_cities = [city if df[df['city']==city]['city'].count() >= 10 else 'Unknown' for city in df['city'].unique()]
df['city'] = df['city'].apply(lambda x : x if x in ls_of_cities else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('country')
cols_to_keep.append('city')
cols_to_drop.append('location')

In [7]:
# handle description - convert to binary - with or without
df['has_description'] = df['description'].apply(lambda x: 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_description')
cols_to_drop.append('description')

In [8]:
# handle department - q a lot of departments - keep only those with high count and take everything else as unknown
ls_of_dept = [dept if df[df['department']==dept]['department'].count() >= 10 else 'Unknown' for dept in df['department'].unique()]
df['has_department'] = df['department'].apply(lambda x : x if x in ls_of_dept else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_department')

In [9]:
# handle salary_range - q a lot of ranges - keep as binary - has_salary or not
df['has_salary'] = df['salary_range'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_salary')
cols_to_drop.append('salary_range')

In [10]:
cols_to_drop # handled these colummns

['location', 'description', 'salary_range']

In [11]:
# handle company_profile, requirements and benefits next - for simplicity - keep as binary - has or not
df['has_company_profile'] = df['company_profile'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_requirements'] = df['requirements'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_benefits'] = df['benefits'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_company_profile')
cols_to_keep.append('has_requirements')
cols_to_keep.append('has_benefits')

In [12]:
# handle employment_type - keep na values as unknown
df['employment_type'] = df['employment_type'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('employment_type')

In [13]:
df['required_experience'].unique()

array(['Internship', 'Not Applicable', nan, 'Mid-Senior level',
       'Associate', 'Entry level', 'Executive', 'Director'], dtype=object)

In [14]:
# handle required_experience - keep na values as unknown
df['required_experience'] = df['required_experience'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_experience')

In [15]:
df['required_education'].unique()

array([nan, "Bachelor's Degree", "Master's Degree",
       'High School or equivalent', 'Unspecified',
       'Some College Coursework Completed', 'Vocational', 'Certification',
       'Associate Degree', 'Professional', 'Doctorate',
       'Some High School Coursework', 'Vocational - Degree',
       'Vocational - HS Diploma'], dtype=object)

In [16]:
# handle required_education - keep na values as unknown
df['required_education'] = df['required_education'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_education')

In [17]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education']

In [18]:
# handle industry - keep the industries - fillna with unknown
df['industry'] = df['industry'].fillna('Unknown')

# a lot of sparse values noted for industries, may lead to unexpected results - to handle - keep countries with counts >10 only and put unknown for the rest
ls_of_industries = [industry if df[df['industry']==industry]['industry'].count() >= 10 else 'NA' for industry in df['industry'].unique()]
df['industry'] = df['industry'].apply(lambda x : x if x in ls_of_industries else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('industry')

In [19]:
# handle function - keep the functions - fillna with unknown
df['function'] = df['function'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('function')

In [20]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function']

In [21]:
cols_to_drop

['location', 'description', 'salary_range']

In [22]:
# fill up the null values with - no available data 
df[['company_profile' , 'description','requirements','benefits']] = df[['company_profile' , 'description','requirements','benefits']].fillna('no available data')

In [23]:
# create new feature - job_field - based on the title, dept, industry and function
df['job_field'] = df['title'] +' '+ df['department'] +' '+ df['industry'] +' '+ df['function']
df['job_field'] = df['job_field'].fillna('No available data')
df.drop(['title','department'] , axis = 1 , inplace = True )

In [24]:
df.columns

Index(['job_id', 'location', 'salary_range', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'city', 'has_description', 'has_department', 'has_salary',
       'has_company_profile', 'has_requirements', 'has_benefits', 'job_field'],
      dtype='object')

## Next we look into the non null columns to extract what we wanna keep

In [25]:
# drop the cols inside cols_to_drop as it is not useful anymore
df = df.drop(cols_to_drop, axis=1)

In [26]:
cols_to_drop = []

In [27]:
df.isna().sum()

job_id                 0
company_profile        0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
city                   0
has_description        0
has_department         0
has_salary             0
has_company_profile    0
has_requirements       0
has_benefits           0
job_field              0
dtype: int64

In [28]:
# handle job_id - it is unique for all - so not much value - remove the col
df = df.drop('job_id', axis=1)

In [29]:
# handle telecommuting, has_company_logo, has_questions - all are binary - keep as features
# keep track in cols_to_keep
cols_to_keep.append('telecommuting')
cols_to_keep.append('has_company_logo')
cols_to_keep.append('has_questions')

And with that all the columns are processed accordingly, except the text features that are going to be used for text processing

In [30]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function',
 'telecommuting',
 'has_company_logo',
 'has_questions']

## Text Processing - including the text information into the model

This is referred from an implementation by AbulRahman, from the github repo - https://github.com/AbdulrahmenSalem/Text-Preprocessing

Details can be found on the github page, together with the README file explaining the methodologies.

In [31]:
# !pip install nltk # install package if still not installed

In [32]:
# import relevant packages
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import re
from nltk.corpus import stopwords 
import string 
from nltk.stem import WordNetLemmatizer
punc = string.punctuation
lmt = WordNetLemmatizer()
s_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
# taken from the github repo
class TextPreprocessing() :
    def __init__(self, df : pd.DataFrame = pd.DataFrame) :
        self.df = df
        
        
    def Clean(self , df) :
        self.df = df
        df_copy = df.copy(deep = True)
        text_cols = list(df_copy.select_dtypes(include = ['object']).columns)
        for col in text_cols :
            for idx, text in enumerate(df_copy[col]) :
                te = []
                word = re.sub(r'(@|#)\w+' , '' , text)
                word = re.sub("[,.]", "", word)
                word = re.sub(r'https?://\S+' , '' , word)
                word = re.sub(r'(\?|!)+' , '' , word)
                word = re.sub(r"\(|\)", "", word)
                word = re.sub(r'(^\s+)' , '' , word)
                word = re.sub(r'(\s+$)' , '' , word)
                word = re.sub(r'\d+' , '' , word)
                word = word.split()
                for i in word :
                    if (i not in s_words) & (i not in punc) :
                        i = i.lower()
                        i = lmt.lemmatize(i , 'v')
                        te.append(i)
                df_copy.at[idx , col] = te
        return df_copy
    
    def Vactorize (self, df , target_name) :
        self.df = df
        self.target_name = target_name
        df_cleaned = df.copy(deep = True)
        text_cols = list(df_cleaned.select_dtypes(include = ['object']).columns)
        pos_word = {}
        neg_word = {}

        pos_df = df_cleaned[df_cleaned[target_name] == 1].reset_index(drop=True)
        neg_df = df_cleaned[df_cleaned[target_name] == 0].reset_index(drop=True)
        
        for col in text_cols :

            pos_word[col] = [word for sublist in pos_df[col] for word in sublist]
            neg_word[col] = [word for sublist in neg_df[col] for word in sublist]



        pos_freq = {}
        neg_freq = {}
        for key in pos_word.keys() :
            positive_dict = {}
            for word in pos_word[key] :
                positive_dict[word] = positive_dict.get(word , 0) + 1

            pos_freq[key] = positive_dict


        for key in neg_word.keys() :
            negative_dict = {}
            for word in neg_word[key] :
                negative_dict[word] = negative_dict.get(word , 0) + 1

            neg_freq[key] = negative_dict
            
        return pos_freq , neg_freq

        
        
        
    def Vactorization (self , df , target_name) :
        self.df = df
        self.target_name = target_name
        df_cleaned = df.copy(deep = True)
        text_cols = list(df_cleaned.select_dtypes(include = ['object']).columns)
        pos_freq , neg_freq = TextPreprocessing().Vactorize(df_cleaned , target_name)

        for col in text_cols :
            df_cleaned['{}_pos'.format(col)] = 0
            df_cleaned['{}_neg'.format(col)] = 0
            for idx, List in enumerate(df_cleaned[col]) :
                pos_frequent = 0
                neg_frequent = 0
                for word in List :
                    pos_frequent += pos_freq[col].get(word , 0)
                    neg_frequent += neg_freq[col].get(word , 0)



                df_cleaned.at[idx ,'{}_pos'.format(col)] = pos_frequent
                df_cleaned.at[idx ,'{}_neg'.format(col)] = neg_frequent
            df_cleaned.drop([col] , axis = 1 , inplace = True)
        return df_cleaned
    
    
    def fit_transform(self , df , target_name) :
        self.df = df
        self.target_name = target_name
        
        df_cleaned = TextPreprocessing().Clean(df)
        df_vact = TextPreprocessing().Vactorization(df_cleaned , target_name)
        
        return df_vact
    
    
    def Naive_Bayes(self , df, target_name) :
        self.df = df
        self.target_name = target_name
        df_naive = TextPreprocessing().Clean(df)
        pos_freq , neg_freq = TextPreprocessing().Vactorize(df_naive , target_name)
        text_cols = list(df_naive.select_dtypes(include = ['object']).columns)
        
        v_n_pos , v_n_neg = {} , {}

        for key in pos_freq.keys() :
            v_n_pos[key] = len(pos_freq[key])
            n = 0
            for word in pos_freq[key] :
                n += pos_freq[key].get(word , 0)
            v_n_pos[key]+=n
        for key in neg_freq.keys() :
            v_n_neg[key] = len(neg_freq[key])
            n = 0
            for word in neg_freq[key] :
                n += neg_freq[key].get(word , 0)
            v_n_neg[key]+=n
            
            
        prob_pos_dict = {}
        for key in pos_freq.keys():
            positive_dict = {}
            for word in pos_freq[key] :
                positive_dict[word] = (pos_freq[key].get(word , 0) + 1) / (v_n_pos[key])
            prob_pos_dict[key] = positive_dict




        prob_neg_dict = {}
        for key in neg_freq.keys():
            negative_dict = {}
            for word in neg_freq[key] :
                negative_dict[word] = (neg_freq[key].get(word , 0) + 1) / (v_n_neg[key])
            prob_neg_dict[key] = negative_dict
            
            
            
        for col in text_cols :
            df_naive['{}_probs'.format(col)] = 0
            for idx, List in enumerate(df_naive[col]) :
                score = 0
                for word in List :
                    try :
                        # b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))
                        if (prob_neg_dict[col].get(word , 0)) != 0:
                            b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))  
                        else:
                            # Handle the case where neg_word_prob is zero
                            # # You can set a default value or handle it based on your use case
                            b = np.log((prob_pos_dict[col].get(word , 0))) if (prob_pos_dict[col].get(word , 0)) != 0 else 0
                        if b == -float('inf') :
                            pass
                        else :
                            score +=b
                    except :
                        pass

                df_naive['{}_probs'.format(col)][idx] = score
            df_naive.drop([col] , axis = 1 , inplace = True)
            
            
        return {'probs_pos':prob_pos_dict ,'probs_neg':prob_neg_dict } , df_naive  

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   company_profile      17880 non-null  object
 1   requirements         17880 non-null  object
 2   benefits             17880 non-null  object
 3   telecommuting        17880 non-null  int64 
 4   has_company_logo     17880 non-null  int64 
 5   has_questions        17880 non-null  int64 
 6   employment_type      17880 non-null  object
 7   required_experience  17880 non-null  object
 8   required_education   17880 non-null  object
 9   industry             17880 non-null  object
 10  function             17880 non-null  object
 11  fraudulent           17880 non-null  int64 
 12  country              17880 non-null  object
 13  city                 17880 non-null  object
 14  has_description      17880 non-null  int64 
 15  has_department       17880 non-null  object
 16  has_

In [35]:
# cleaning text - check the performance of the function
pre = TextPreprocessing()
df_cleaned = pre.Clean(df)
df_cleaned.head()

Unnamed: 0,company_profile,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,...,fraudulent,country,city,has_description,has_department,has_salary,has_company_profile,has_requirements,has_benefits,job_field
0,"[we're, food, we've, create, groundbreaking, a...","[experience, content, management, systems, maj...","[available, data]",0,1,0,[other],[internship],[unknown],[unknown],...,0,[us],"[new, york]",1,[market],0,1,1,0,"[market, intern, market, unknown, market]"
1,"[second, worlds, cloud, video, production, ser...","[what, expect, you:your, key, responsibility, ...","[what, get, usthrough, part, second, team, gai...",0,1,0,[full-time],"[not, applicable]",[unknown],"[market, advertise]",...,0,[nz],[auckland],1,[unknown],0,1,1,1,"[customer, service, cloud, video, production, ..."
2,"[valor, service, provide, workforce, solutions...","[implement, pre-commissioning, commission, pro...","[available, data]",0,1,0,[unknown],[unknown],[unknown],[unknown],...,0,[us],[unknown],1,[unknown],0,1,1,0,"[no, available, data]"
3,"[our, passion, improve, quality, life, geograp...","[education:, bachelor’s, master’s, gi, busines...","[our, culture, anything, corporate—we, collabo...",0,1,0,[full-time],"[mid-senior, level]","[bachelor's, degree]","[computer, software]",...,0,[us],[washington],1,[sales],0,1,1,1,"[account, executive, washington, dc, sales, co..."
4,"[spotsource, solutions, llc, global, human, ca...","[qualifications:rn, license, state, texasdiplo...","[full, benefit, offer]",0,1,1,[full-time],"[mid-senior, level]","[bachelor's, degree]","[hospital, health, care]",...,0,[us],"[fort, worth]",1,[unknown],0,1,1,1,"[no, available, data]"


## Train-test split and preprocessing before inputting into model

In [36]:
# seperate out the correctly formatted cols and those which still needs processing through ohe
binary_cols = ['has_description', 'has_salary', 'has_company_profile', 'has_requirements', 'has_benefits', 'telecommuting', 'has_company_logo', 'has_questions']

In [37]:
# select the fraudulent column as target, rest as features
nontext_features = df[binary_cols]
text_features = df.drop(binary_cols, axis=1)
target_var = df['fraudulent']

In [38]:
nontext_features # check the colummns if its in correct format

Unnamed: 0,has_description,has_salary,has_company_profile,has_requirements,has_benefits,telecommuting,has_company_logo,has_questions
0,1,0,1,1,0,0,1,0
1,1,0,1,1,1,0,1,0
2,1,0,1,1,0,0,1,0
3,1,0,1,1,1,0,1,0
4,1,0,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...
17875,1,0,1,1,1,0,1,1
17876,1,0,1,1,1,0,1,1
17877,1,0,1,1,0,0,0,0
17878,1,0,0,1,1,0,0,1


In [39]:
# use naive bayes to generate probabilities and values for text data
proba , df_naive = pre.Naive_Bayes(text_features , target_name='fraudulent')

  b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_naive['{}_probs'.format(col)][idx] = score
  b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))
  b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_naive['{}_probs'.format(col)][idx] = score
  b = np.log((prob_pos_dict[col].get(word , 0)) / (prob_neg_dict[col].get(word , 0)))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

In [40]:
proba # the probability of positive and negative based on the text data provided 

{'probs_pos': {'company_profile': {'staff': 0.006500705502922798,
   '&amp;': 0.004333803668615199,
   'recruit': 0.008113283612174964,
   'do': 0.001159040516024995,
   'right': 0.0011086474501108647,
   'oil': 0.003023583954847813,
   'energy': 0.0012094335819391251,
   'industryrepresented': 0.0011086474501108647,
   'candidates': 0.008818786534972787,
   'automatically': 0.0011086474501108647,
   'grant': 0.0011086474501108647,
   'follow': 0.0028724047571054225,
   'perks:': 0.0011086474501108647,
   'expert': 0.0012598266478532553,
   'negotiations': 0.0028724047571054225,
   'behalf': 0.0011086474501108647,
   'maximize': 0.004484982866357589,
   'compensation': 0.0028724047571054225,
   'package': 0.0031243700866760734,
   'implimenting': 0.0011086474501108647,
   'ongoing': 0.001159040516024995,
   'increase': 0.0024188671638782503,
   'significant': 0.0021669018343075993,
   'sign': 0.005694416448296715,
   'bonus': 0.007458173755291272,
   'refine': 0.004031445273130417,
   

In [41]:
len(proba)

2

In [42]:
df_naive.head()

Unnamed: 0,fraudulent,company_profile_probs,requirements_probs,benefits_probs,employment_type_probs,required_experience_probs,required_education_probs,industry_probs,function_probs,country_probs,city_probs,has_department_probs,job_field_probs
0,0,-17.654849,-3.853466,0.199053,0.382652,-0.536156,0.106159,0.128965,-1.27615,0.350714,-0.981632,-1.815775,-4.549965
1,0,-4.910586,-24.838182,3.975001,-0.14969,0.265053,0.106159,0.203923,0.394455,0.0,0.0,-0.091646,-0.299919
2,0,-1.837206,24.891364,0.199053,0.379746,0.265239,0.106159,0.128965,0.14037,0.350714,0.236872,-0.091646,-0.441021
3,0,0.291504,-11.949994,-15.339014,-0.14969,-0.563942,-1.761893,-3.63333,-0.489883,0.350714,-1.445653,-0.496718,-3.509396
4,0,-13.015182,-1.915117,0.733283,-0.14969,-0.563942,-1.761893,2.331096,-6.280263,0.350714,1.73943,-0.091646,-0.441021


In [46]:
# # train test split
# from sklearn.model_selection import train_test_split

# xtrain, xtest, ytrain, ytest = train_test_split(features_encoded, target_var, random_state=0, test_size=0.2)

In [43]:
# train test split v2 - with text features included
from sklearn.model_selection import train_test_split

# concat text features and ohe encoded features
text_df = df_naive.drop('fraudulent', axis=1)
features_concated = pd.concat([nontext_features, text_df], axis=1)

# train test split
xtrain, xtest, ytrain, ytest = train_test_split(features_concated, target_var, random_state=0, test_size=0.2)

## Model training - Random Forest, XGBoost, Logistic Regression, Support vector machines

In [44]:
# model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier

In [45]:
# model training - Random Forest
# train model
rfc = RandomForestClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
rfc.fit(xtrain, ytrain)

# predictions
xtest.columns = xtest.columns.astype(str) # keep column name
rfc_ypred = rfc.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(ytest, rfc_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest Classifier:
Accuracy: 0.988255033557047

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3423
           1       0.97      0.75      0.84       153

    accuracy                           0.99      3576
   macro avg       0.98      0.87      0.92      3576
weighted avg       0.99      0.99      0.99      3576

Confusion Matrix:
[[3420    3]
 [  39  114]]
Accuracy: 0.988255033557047
Precision: 0.9743589743589743
Recall: 0.7450980392156863
F1 Score: 0.8444444444444443


In [46]:
# model training - logistic regression
# train model
lr = LogisticRegression(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
lr.fit(xtrain, ytrain)

# predictions
xtest.columns = xtest.columns.astype(str) # keep column name
lr_ypred = lr.predict(xtest)

# Evaluate the logistic regression model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_ypred)
precision = precision_score(ytest, lr_ypred)
recall = recall_score(ytest, lr_ypred)
f1 = f1_score(ytest, lr_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Logistic Regression:
Accuracy: 0.9731543624161074

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      3423
           1       0.80      0.50      0.61       153

    accuracy                           0.97      3576
   macro avg       0.89      0.75      0.80      3576
weighted avg       0.97      0.97      0.97      3576

Confusion Matrix:
[[3404   19]
 [  77   76]]
Accuracy: 0.9731543624161074
Precision: 0.8
Recall: 0.49673202614379086
F1 Score: 0.6129032258064516


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# model training - xgboost
# Create an instance of XGBClassifier
xgb_classifier = XGBClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
xgb_classifier.fit(xtrain, ytrain)

# Make predictions on the test set
xtest.columns = xtest.columns.astype(str) # keep column name
xgb_ypred = xgb_classifier.predict(xtest)

# Evaluate the logistic regression model
print("XGBoost:")
print("Accuracy:", accuracy_score(ytest, xgb_ypred))
print("\nClassification Report:\n", classification_report(ytest, xgb_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, xgb_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, xgb_ypred)
precision = precision_score(ytest, xgb_ypred)
recall = recall_score(ytest, xgb_ypred)
f1 = f1_score(ytest, xgb_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

XGBoost:
Accuracy: 0.9888143176733781

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3423
           1       0.95      0.78      0.86       153

    accuracy                           0.99      3576
   macro avg       0.97      0.89      0.93      3576
weighted avg       0.99      0.99      0.99      3576

Confusion Matrix:
[[3417    6]
 [  34  119]]
Accuracy: 0.9888143176733781
Precision: 0.952
Recall: 0.7777777777777778
F1 Score: 0.8561151079136692


In [48]:
# import the models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() , 
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain, ytrain)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain
            pred = ytrain
            title = 'Train'
            
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        y_pred = model.predict(to_pred)
        acc = round(accuracy_score(pred , y_pred)*100)
        f1 = round(f1_score(pred , y_pred)*100)
        prec = round(precision_score(pred , y_pred)*100)
        recall = round(recall_score(pred , y_pred)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall]).reshape(1,4) 
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall'])  
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model , title) } ,inplace=True )
pd.options.display.max_rows = 15
classification_report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,Accuracy,F1_score,Precision,Recall
LogisticRegression() _ Train Details,97,61,82,49
LogisticRegression() _ Test Details,97,61,80,50
KNeighborsClassifier() _ Train Details,98,76,89,66
KNeighborsClassifier() _ Test Details,98,73,80,67
DecisionTreeClassifier() _ Train Details,100,99,100,97
DecisionTreeClassifier() _ Test Details,98,76,73,79
ExtraTreeClassifier() _ Train Details,100,99,100,97
ExtraTreeClassifier() _ Test Details,98,71,74,67
RandomForestClassifier() _ Train Details,100,99,100,97
RandomForestClassifier() _ Test Details,99,85,97,75


## Use SMOTE for oversampling

In [49]:
target_var.value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [50]:
# looking at the target distribution, lets resample to make it around 60:40 ratio
majority_class_samples = 13591
minority_class_samples = (majority_class_samples / 0.6) * 0.4
resampling_strategy = {0: majority_class_samples, 1: minority_class_samples}

In [51]:
# !pip install imblearn # install package if the package is not available

In [52]:
from imblearn.over_sampling import SMOTENC # smotenc is used as it can handle categorical variable
from imblearn.over_sampling import SMOTE

# oversampling
smote_nc = SMOTENC(categorical_features=[x for x in range(8)], random_state=0) # [x for x in range(544)]
xtrain_resampled, ytrain_resampled = smote_nc.fit_resample(xtrain, ytrain)

# smote = SMOTE(random_state=0)
# xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)

In [53]:
# model training - Random Forest
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

# train model
rfc = RandomForestClassifier(random_state=0)
rfc.fit(xtrain_resampled, ytrain_resampled)

# predictions
rfc_ypred = rfc.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest:")
print("Accuracy:", accuracy_score(ytest, rfc_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest:
Accuracy: 0.985738255033557

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3423
           1       0.85      0.81      0.83       153

    accuracy                           0.99      3576
   macro avg       0.92      0.90      0.91      3576
weighted avg       0.99      0.99      0.99      3576

Confusion Matrix:
[[3401   22]
 [  29  124]]
Accuracy: 0.985738255033557
Precision: 0.8493150684931506
Recall: 0.8104575163398693
F1 Score: 0.8294314381270903


In [54]:
# model training - Random Forest
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

# train model
lr = LogisticRegression(random_state=0)
lr.fit(xtrain_resampled, ytrain_resampled)

# predictions
lr_ypred = lr.predict(xtest)

# Evaluate the Random Forest model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_ypred)
precision = precision_score(ytest, lr_ypred)
recall = recall_score(ytest, lr_ypred)
f1 = f1_score(ytest, lr_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Logistic Regression:
Accuracy: 0.8973713646532439

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.90      0.94      3423
           1       0.28      0.86      0.42       153

    accuracy                           0.90      3576
   macro avg       0.63      0.88      0.68      3576
weighted avg       0.96      0.90      0.92      3576

Confusion Matrix:
[[3078  345]
 [  22  131]]
Accuracy: 0.8973713646532439
Precision: 0.27521008403361347
Recall: 0.8562091503267973
F1 Score: 0.4165341812400636


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# model training - xgboost
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

# Create an instance of XGBClassifier
xgb_classifier = XGBClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
xgb_classifier.fit(xtrain_resampled, ytrain_resampled)

# Make predictions on the test set
xtest.columns = xtest.columns.astype(str) # keep column name
xgb_ypred = xgb_classifier.predict(xtest)

# Evaluate the xgboost model
print("XGBoost:")
print("Accuracy:", accuracy_score(ytest, xgb_ypred))
print("\nClassification Report:\n", classification_report(ytest, xgb_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, xgb_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, xgb_ypred)
precision = precision_score(ytest, xgb_ypred)
recall = recall_score(ytest, xgb_ypred)
f1 = f1_score(ytest, xgb_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

XGBoost:
Accuracy: 0.9860178970917226

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3423
           1       0.82      0.86      0.84       153

    accuracy                           0.99      3576
   macro avg       0.91      0.92      0.92      3576
weighted avg       0.99      0.99      0.99      3576

Confusion Matrix:
[[3395   28]
 [  22  131]]
Accuracy: 0.9860178970917226
Precision: 0.8238993710691824
Recall: 0.8562091503267973
F1 Score: 0.8397435897435896


In [57]:
# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() , 
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain_resampled, ytrain_resampled)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain_resampled
            pred = ytrain_resampled
            title = 'Train'
            
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        y_pred = model.predict(to_pred)
        acc = round(accuracy_score(pred , y_pred)*100)
        f1 = round(f1_score(pred , y_pred)*100)
        prec = round(precision_score(pred , y_pred)*100)
        recall = round(recall_score(pred , y_pred)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall]).reshape(1,4) 
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall'])  
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model , title) } ,inplace=True )
pd.options.display.max_rows = 15
classification_report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,Accuracy,F1_score,Precision,Recall
LogisticRegression() _ Train Details,91,91,90,92
LogisticRegression() _ Test Details,90,42,28,86
KNeighborsClassifier() _ Train Details,98,98,97,100
KNeighborsClassifier() _ Test Details,95,59,45,87
DecisionTreeClassifier() _ Train Details,100,100,100,100
DecisionTreeClassifier() _ Test Details,97,72,65,80
ExtraTreeClassifier() _ Train Details,100,100,100,100
ExtraTreeClassifier() _ Test Details,97,66,56,80
RandomForestClassifier() _ Train Details,100,100,100,100
RandomForestClassifier() _ Test Details,99,82,86,78


## Search for best params using gridsearch

In [None]:
# for random forest - on the original data
rfc_grid = RandomForestClassifier()

# Define the grid of parameters to search through
param_grid = {
    'n_estimators': [100, 300, 500],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required at each leaf node
}

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=rfc_grid, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on your data
grid_search.fit(xtrain_resampled, ytrain_resampled)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Extracting the best model from the grid search
best_rf_model = grid_search.best_estimator_

# predictions
rfc_grid_ypred = best_rf_model.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(ytest, rfc_grid_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_grid_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_grid_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_grid_ypred)
precision = precision_score(ytest, rfc_grid_ypred)
recall = recall_score(ytest, rfc_grid_ypred)
f1 = f1_score(ytest, rfc_grid_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Create the Logistic Regression classifier
lr_grid = LogisticRegression()

# Define the grid of parameters to search through
param_grid = {
    'C': [0.001, 0.1, 1, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Penalty (L1 or L2 regularization)
    'solver': ['liblinear', 'saga', 'newton-cholesky']  # Algorithm to use in the optimization problem
}

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=lr_grid, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on your training data
grid_search.fit(xtrain, ytrain)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Use the best model to predict on the test set
best_logreg_model = grid_search.best_estimator_

# predictions
lr_grid_ypred = best_logreg_model.predict(xtest)

# Evaluate the Logistic Regression model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_grid_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_grid_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_grid_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_grid_ypred)
precision = precision_score(ytest, lr_grid_ypred)
recall = recall_score(ytest, lr_grid_ypred)
f1 = f1_score(ytest, lr_grid_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)