In [2]:
!pip install pandas numpy scikit-learn tensorflow keras nltk flask flask-cors imbalanced-learn matplotlib seaborn joblib scipy




In [3]:
import pandas as pd
import numpy as np
import nltk
import re
import joblib
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
df = pd.read_csv("fake_job_postings.csv")
df.head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [5]:
df = df[['title', 'description', 'fraudulent']]
df.dropna(inplace=True)

ps = PorterStemmer()
corpus = []

for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['description'].iloc[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


In [6]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus)

sequences = tokenizer.texts_to_sequences(corpus)
X = pad_sequences(sequences, maxlen=200)

le = LabelEncoder()
y = le.fit_transform(df['fraudulent'])


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
model = Sequential()
model.add(Embedding(5000, 128, input_length=200))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()




In [9]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test)
)


Epoch 1/5
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9432 - loss: 0.2559 - val_accuracy: 0.9645 - val_loss: 0.1283
Epoch 2/5
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9730 - loss: 0.0901 - val_accuracy: 0.9690 - val_loss: 0.1214
Epoch 3/5
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9806 - loss: 0.0627 - val_accuracy: 0.9709 - val_loss: 0.1218
Epoch 4/5
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9877 - loss: 0.0383 - val_accuracy: 0.9734 - val_loss: 0.1289
Epoch 5/5
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9920 - loss: 0.0244 - val_accuracy: 0.9732 - val_loss: 0.1319


In [10]:
model.save("fraud_lstm_model.h5")
joblib.dump(tokenizer, "tokenizer.pkl")




['tokenizer.pkl']

In [11]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Accuracy: 0.9731543624161074
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3394
           1       0.88      0.55      0.68       182

    accuracy                           0.97      3576
   macro avg       0.93      0.77      0.83      3576
weighted avg       0.97      0.97      0.97      3576



In [12]:
def detect_fraud(job_text):
    job_text = re.sub('[^a-zA-Z]', ' ', job_text)
    job_text = job_text.lower().split()
    job_text = ' '.join(job_text)

    seq = tokenizer.texts_to_sequences([job_text])
    pad = pad_sequences(seq, maxlen=200)

    result = model.predict(pad)[0][0]

    if result > 0.5:
        return f"⚠️ FRAUDULENT JOB POSTING ({round(result*100,2)}%)"
    else:
        return f"✅ REAL JOB POSTING ({round((1-result)*100,2)}%)"


In [13]:
sample_job = """
Work from home! Earn Rs.50,000/week.
No experience required.
WhatsApp for instant hiring. Pay registration fee.
"""

print(detect_fraud(sample_job))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
⚠️ FRAUDULENT JOB POSTING (52.779998779296875%)


In [14]:
real_job = """
Software Developer required with 2 years experience in Python and ML.
Office location: Bangalore. Fixed salary.
"""

print(detect_fraud(real_job))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
✅ REAL JOB POSTING (94.41999816894531%)
