In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer

# ================================
# Load dataset
# ================================
# The CSV location is machine-specific. Set the FAKE_JOB_POSTINGS_CSV environment
# variable to point at your own copy of fake_job_postings.csv. When the variable
# is unset or empty, the original kagglehub cache path below is used, so existing
# runs behave exactly as before.
DEFAULT_DATASET_PATH = r"C:\Users\KULKA\.cache\kagglehub\datasets\shivamb\real-or-fake-fake-jobposting-prediction\versions\1\fake_job_postings.csv"
dataset_path = os.environ.get("FAKE_JOB_POSTINGS_CSV") or DEFAULT_DATASET_PATH
df = pd.read_csv(dataset_path)

# Features: the free-text job description. Missing descriptions become empty
# strings so that every row can still be handed to the text encoder.
X = df['description'].fillna("")
# Target: the 'fraudulent' label column (1 = fraud, 0 = real).
y = df['fraudulent']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ================================
# Turn the descriptions into sentence embeddings
# ================================
# Pretrained sentence-transformers checkpoint used as a fixed feature extractor.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Train and test texts are encoded with identical settings, so the options
# are defined once and shared by both calls.
encode_options = {"batch_size": 32, "show_progress_bar": True}

print("Encoding train set...")
X_train_emb = bert_model.encode(X_train.tolist(), **encode_options)

print("Encoding test set...")
X_test_emb = bert_model.encode(X_test.tolist(), **encode_options)

# ================================
# Linear SVM trained on the sentence embeddings
# ================================
# class_weight="balanced" is requested because the fraudulent class is a small
# minority of the data; max_iter is raised to 5000 to give the solver more
# iterations to converge.
svm_clf = LinearSVC(
    class_weight="balanced",
    dual="auto",
    max_iter=5000,
)
svm_clf.fit(X_train_emb, y_train)

# Predicted labels for the held-out embeddings
y_pred = svm_clf.predict(X_test_emb)

# ================================
# Evaluation
# ================================
# The marker in the header is written as an escape sequence (U+1F539, small
# blue diamond) so that it cannot be corrupted if the notebook text is ever
# re-encoded; previously it was stored as mis-decoded bytes.
print("\n\U0001F539 BERT + Linear SVM Results:")

# Overall accuracy, followed by per-class precision / recall / F1. With
# imbalanced classes the per-class rows are more informative than accuracy.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ================================
# Test on a new unseen fraudulent job posting
# ================================
# A hand-written posting that is meant to look fraudulent. It is wrapped in a
# one-element list so that a batch of one text is passed to the encoder.
# The dash in the last sentence is written as the escape \u2013 (en dash):
# it was previously stored as mis-decoded bytes, which fed garbage characters
# to the encoder.
test_post = [
    "Join our innovative software startup as a Junior Developer! Work remotely and earn $8000 per month with flexible hours. "
    "No prior experience needed. Just provide your personal details and a copy of your ID to start immediately. "
    "Limited positions available \u2013 apply now!"
]

# Embed with the same encoder used for training, then classify.
test_emb = bert_model.encode(test_post)
pred_test = svm_clf.predict(test_emb)
print("Prediction for test post (1 = Fraud, 0 = Real):", pred_test[0])


Encoding train set...


Batches:   0%|          | 0/447 [00:00<?, ?it/s]

Encoding test set...


Batches:   0%|          | 0/112 [00:00<?, ?it/s]


🔹 BERT + Linear SVM Results:
Accuracy: 0.8618568232662193
              precision    recall  f1-score   support

           0       0.99      0.87      0.92      3403
           1       0.23      0.79      0.36       173

    accuracy                           0.86      3576
   macro avg       0.61      0.83      0.64      3576
weighted avg       0.95      0.86      0.90      3576

Prediction for test post (1 = Fraud, 0 = Real): 1


In [3]:
# joblib is a Python library for saving fitted models to disk.
# Persisting the classifier avoids retraining it on every run; the saved file
# can then be loaded directly wherever it is needed, e.g. inside an API.
# Note: only the SVM is stored here. The sentence encoder is not part of the
# file and has to be loaded separately to embed new text before predicting.
import joblib

MODEL_FILE = "svm_model.pkl"

# Write the trained classifier to disk
joblib.dump(svm_clf, MODEL_FILE)

# Read it back later without retraining
svm_model = joblib.load(MODEL_FILE)
