In [None]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, recall_score, precision_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
#loading the dataset 
df = pd.read_csv('/content/drive/MyDrive/fake_job_postings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [None]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [None]:
df = df[df['location'].notna()]

In [None]:
#drop unwanted columns 
columns_to_drop = ['description', 'requirements', 'benefits', 'company_profile', 'location', 'salary_range', 'job_id']
df_dropped = df.drop(columns_to_drop, axis = 1 )

In [None]:
#creating 2 columns country and state from location 
county_col = df["location"].str.split(pat = ",")
county_col.dropna()
country = []
state = []
for var in county_col:
  if var != None:
    if len(var) <= 3:
      if len(var) > 0:
        country.append(var[0])
      else:
        country.append(None)
      if len(var) > 1:
        state.append(var[1])
      else:
        state.append(None)
    else:
      country.append(None)
      state.append(None)

df_dropped["country"] = country
df_dropped["state"] = state
  
dataset = df_dropped[df_dropped['state'].notna()]

In [None]:
dataset.head()

Unnamed: 0,title,department,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state
0,Marketing Intern,Marketing,0,1,0,Other,Internship,,,Marketing,0,US,NY
1,Customer Service - Cloud Video Production,Success,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,
2,Commissioning Machinery Assistant (CMA),,0,1,0,,,,,,0,US,IA
3,Account Executive - Washington DC,Sales,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC
4,Bill Review Manager,,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL


In [None]:
#categorizing columns with NaN values and Not Na values 
non_na_columns = ['title', 'telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'country', 'state']
columns_to_predict = ['employment_type','function', 'required_experience', 'industry', 'required_education', 'department' ]



In [None]:
def labelencode(dataset, columns):
  le = LabelEncoder()
  for feature in columns:
      try:
          dataset.loc[:, feature] = le.fit_transform(dataset.loc[:, feature])
          #print(feature)
      except:
          print('Error encoding '+feature)

In [None]:
def onehotencode(dataset, columns):
  ohe = OneHotEncoder(sparse=True)
  for feature in columns:
      try:
          dataset.loc[:, feature] = ohe.fit_transform(dataset.loc[:, feature])
      except:
          print('Error encoding '+feature)

In [None]:
#random forest model function 
def RandomForest(dataset):
  x_data = dataset.copy(deep = True)
  y_data = x_data['fraudulent']
  x_data.drop('fraudulent', axis=1, inplace = True)
  
  x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.15, random_state = 0)
  
  classifier = RandomForestClassifier(n_estimators= 10, criterion="entropy")
  
  classifier.fit(x_train, y_train)
  
  y_predr = classifier.predict(x_test)
  print(confusion_matrix(y_test, y_predr))

  print("accuracy score ->", round(accuracy_score(y_test, y_predr)*100, 2),"%")
  print("recall score ->", round(recall_score(y_test, y_predr)*100, 2), "%")
  print("precision score ->",round(precision_score(y_test, y_predr)*100, 2), "%")
  print("f1_score ->",round(f1_score(y_test,y_predr)*100, 2), "%")

  from sklearn.metrics import roc_auc_score
  print("auc score ->" , round(roc_auc_score(y_test, y_predr)*100, 2), "%")

In [None]:
#logistic regression model function
def Logistic(dataset):
  x_data = dataset.copy(deep = True)
  y_data = x_data['fraudulent']
  x_data.drop('fraudulent', axis=1, inplace = True)
  
  x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.15, random_state = 0)
  
  classifier = LogisticRegression()
  
  classifier.fit(x_train, y_train)
  
  y_predr = classifier.predict(x_test)
  print(confusion_matrix(y_test, y_predr))

  print("accuracy score ->", round(accuracy_score(y_test, y_predr)*100, 2),"%")
  print("recall score ->", round(recall_score(y_test, y_predr)*100, 2), "%")
  print("precision score ->",round(precision_score(y_test, y_predr)*100, 2), "%")
  print("f1_score ->",round(f1_score(y_test,y_predr)*100, 2), "%")

  from sklearn.metrics import roc_auc_score
  print("auc score ->" , round(roc_auc_score(y_test, y_predr)*100, 2), "%")

In [None]:
label_encoding_columns = ['required_education', 'required_experience', 'employment_type','title', 'industry', 'function', 'country', 'state', 'department']
onehot_encoding_columns = []

non_na_columns = ['title', 'telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'country', 'state']
columns_to_predict = ['employment_type','function', 'required_experience', 'industry', 'required_education', 'department' ]

In [None]:
columnsToEncode = list(dataset.select_dtypes(include=['category','object']))
le = LabelEncoder()
for feature in columnsToEncode and non_na_columns:
    
    try:
        dataset[feature] = le.fit_transform(dataset[feature])
        #print(feature)
    except:
        print('Error encoding '+feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
dataset.head()

Unnamed: 0,title,department,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state
0,5835,Marketing,0,1,0,Other,Internship,,,Marketing,0,85,228
1,2106,Success,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,60,0
2,1701,,0,1,0,,,,,,0,85,156
3,289,Sales,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,85,105
4,948,,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,85,131


In [None]:
#selecting each column training the data on random forest and logistic regression model and predicting NAN values of each column

#set i=0 for random forest and i = 1 for logistic regression
i = 0
classify = [RandomForestClassifier(n_estimators= 10, criterion="entropy"), LogisticRegression()]

for column in columns_to_predict:
  
  data = dataset.copy(deep = True)

  data = data[data[column].notna()]

  target = le.fit_transform(data[column])
  
  dataset.loc[dataset[column].notna(),column] =  le.fit_transform(target)

  
  data.drop(columns_to_predict, axis = 1, inplace = True ) 

  dataset_to_predict = dataset.copy(deep = True)

  dataset_to_predict = dataset_to_predict[dataset_to_predict[column].isna()]
 
  dataset_to_predict.drop(columns_to_predict, axis = 1, inplace = True)

  x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.15, random_state = 0)

  classifier = classify[i].fit(x_train, y_train)

  y_predicted = classifier.predict(dataset_to_predict)

  dataset.loc[dataset[column].isnull(),column] = y_predicted

  columns_to_predict.remove(column)
  non_na_columns.append(column)

print(columns_to_predict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


[]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [None]:
print(columns_to_predict)

[]


In [None]:
RandomForest(dataset)

[[2472    7]
 [  40   79]]
accuracy score -> 98.19 %
recall score -> 66.39 %
precision score -> 91.86 %
f1_score -> 77.07 %
auc score -> 83.05 %


In [None]:
Logistic(dataset)

[[2479    0]
 [ 114    5]]
accuracy score -> 95.61 %
recall score -> 4.2 %
precision score -> 100.0 %
f1_score -> 8.06 %
auc score -> 52.1 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
