In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import nltk
from nltk.corpus import stopwords
import re
from tabulate import tabulate
import plotly.express as px
import plotly.graph_objects as go
df = pd.read_csv("/content/fake_job_postings.csv")
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [None]:
print("Dimension of rows, columns:", df.shape)

Dimension of rows, columns: (17880, 18)


In [None]:
print("***Features***")
df.info()

***Features***
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  o

In [None]:
print(df['fraudulent'].value_counts())

fraudulent
0    17014
1      866
Name: count, dtype: int64


In [None]:
text_columns = ['title','company_profile','description','requirements','benefits']
df[text_columns] = df[text_columns].fillna(' ')

In [None]:
df['location'].fillna('Unknown', inplace = True)
df['department'].fillna('Unknown', inplace = True)
df['salary_range'].fillna('Unknown', inplace = True)
df['employment_type'].fillna('Unknown', inplace = True)
df['required_experience'].fillna('Unknown', inplace = True)
df['required_education'].fillna('Unknown', inplace = True)
df['industry'].fillna('Unknown', inplace = True)
df['function'].fillna('Unknown', inplace = True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna('Unknown', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['department'].fillna('Unknown', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [None]:
# Text preprocessing function
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = [word for word in text.split() if word not in stop_words]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
for col in text_columns:
    df[col] = df[col].apply(preprocess_text)

In [None]:
df['text'] = df[text_columns].apply(lambda x: ' '.join(x), axis = 1)


In [None]:
# Distribution of fraudulent vs non-fraudulent job postings
fig = px.histogram(df, x='fraudulent', title='Distribution of Fraudulent vs Non-Fraudulent Job Postings',
                   labels={'fraudulent': 'Fraudulent'}, color='fraudulent',
                   color_discrete_sequence=['#1f77b4', '#ff7f0e'])
fig.update_layout(
    template='plotly_dark',
    xaxis_title='Fraudulent',
    yaxis_title='Count',
    title_x=0.5,
    font=dict(family="Arial, sans-serif", size=14, color="white"),
    paper_bgcolor='#1e1e1e',
    plot_bgcolor='#1e1e1e',
    xaxis=dict(gridcolor='gray'),
    yaxis=dict(gridcolor='gray')
)
fig.show()

In [None]:
# Top words in fraudulent job postings
fraudulent_jobs = df[df['fraudulent'] == 1]['text']
non_fraudulent_jobs = df[df['fraudulent'] == 0]['text']

def plot_top_words(text, title):
    word_freq = pd.Series(' '.join(text).split()).value_counts().head(20)
    fig = px.bar(word_freq, x=word_freq.index, y=word_freq.values, title=title,
                 labels={'index': 'Words', 'y': 'Frequency'},
                 color=word_freq.values, color_continuous_scale='Blues')
    fig.update_layout(template='plotly_dark')
    fig.show()

plot_top_words(fraudulent_jobs, 'Top Words in Fraudulent Job Postings')
plot_top_words(non_fraudulent_jobs, 'Top Words in Non-Fraudulent Job Postings')

In [None]:
# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['text']).toarray()
y = df['fraudulent']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Model training using Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3395
           1       1.00      0.32      0.49       181

    accuracy                           0.97      3576
   macro avg       0.98      0.66      0.73      3576
weighted avg       0.97      0.97      0.96      3576

Confusion Matrix:
[[3395    0]
 [ 123   58]]
ROC AUC Score: 0.9394576033979121


In [None]:
# Visualization of Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
fig = px.imshow(conf_matrix, text_auto=True, title='Confusion Matrix')
fig.show()

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_fig = go.Figure()
roc_fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve', line=dict(color='cyan')))
roc_fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash', color='red')))
roc_fig.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
roc_fig.show()
ROC

NameError: name 'ROC' is not defined

In [None]:
# Display 10 samples with actual and predicted values
samples = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
samples = samples.sample(10).reset_index(drop=True)
print("10 Sample Predictions:")
print(tabulate(samples, headers='keys', tablefmt='fancy_grid'))

10 Sample Predictions:
╒════╤══════════╤═════════════╕
│    │   Actual │   Predicted │
╞════╪══════════╪═════════════╡
│  0 │        0 │           0 │
├────┼──────────┼─────────────┤
│  1 │        0 │           0 │
├────┼──────────┼─────────────┤
│  2 │        0 │           0 │
├────┼──────────┼─────────────┤
│  3 │        0 │           0 │
├────┼──────────┼─────────────┤
│  4 │        1 │           0 │
├────┼──────────┼─────────────┤
│  5 │        0 │           0 │
├────┼──────────┼─────────────┤
│  6 │        0 │           0 │
├────┼──────────┼─────────────┤
│  7 │        0 │           0 │
├────┼──────────┼─────────────┤
│  8 │        0 │           0 │
├────┼──────────┼─────────────┤
│  9 │        1 │           1 │
╘════╧══════════╧═════════════╛
