In [4]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [11]:
df = pd.read_csv(r"C:\Users\haran\Desktop\Fake_Job_Detection\data\cleaned_dataset.csv")

df["text"] = (
    df["job_title"].astype(str) + " " +
    df["job_description"].astype(str) + " " +
    df["requirements"].astype(str)
)

X = df["text"]
y = df["label"]


In [20]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_vec = vectorizer.fit_transform(X)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

In [10]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [11]:
y_pred = model.predict(X_test)

In [18]:
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Logistic Regression Confusion Matrix")
plt.savefig(r"C:\Users\haran\Desktop\Fake_Job_Detection\reports\confusion_matrices\logistic_regression_cm.png")
plt.close()

In [25]:
from sklearn.metrics import roc_curve, auc

os.makedirs("reports/roc_curves", exist_ok=True)

# Logistic probabilities
y_prob_log = logistic_model.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr, tpr)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Logistic AUC = {roc_auc_log:.2f}")
plt.plot([0,1], [0,1], "k--")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Logistic Regression ROC Curve")
plt.legend()

plt.savefig(
    "reports/roc_curves/logistic_roc.png",
    dpi=300,
    bbox_inches="tight"
)

plt.show()
plt.close()


NameError: name 'logistic_model' is not defined

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
tokenizer = joblib.load("../backend/flask_api/models/tokenizer.pkl")

In [13]:
texts = df["job_description"].astype(str)

In [15]:
sequences = tokenizer.texts_to_sequences(texts)

In [16]:
MAX_LEN = 200  # use SAME value as training
X_seq = pad_sequences(sequences, maxlen=MAX_LEN)

In [22]:
from sklearn.model_selection import train_test_split

indices = np.arange(len(y))

train_idx, test_idx = train_test_split(
    indices, test_size=test_size, random_state=42, stratify=y
)


NameError: name 'test_size' is not defined

In [24]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc


In [26]:
SPLITS = {
    "80_20": 0.2,
    "70_30": 0.3,
    "60_40": 0.4
}

BASE_SAVE_PATH = "assets/images"

MODEL_NAMES = ["logistic", "random_forest", "cnn", "lstm"]


In [None]:
def make_dir(path):
    os.makedirs(path, exist_ok=True)

def save_plot(path):
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


In [27]:
def plot_data_distribution(y_train, save_path):
    sns.countplot(x=y_train)
    plt.title("Data Distribution (Train)")
    save_plot(f"{save_path}/data_distribution.png")


def plot_split_ratio(train_size, test_size, save_path):
    plt.pie(
        [train_size, test_size],
        labels=["Train", "Test"],
        autopct="%1.1f%%",
        startangle=90
    )
    plt.title("Train‚ÄìTest Split Ratio")
    save_plot(f"{save_path}/split_ratio.png")


def plot_confusion(y_true, y_pred, save_path):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    save_plot(f"{save_path}/confusion_matrix.png")


def plot_roc_auc(y_true, y_prob, save_path):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    save_plot(f"{save_path}/roc_curve.png")

    return roc_auc


def plot_auc_bar(auc_score, save_path):
    plt.bar(["Model"], [auc_score])
    plt.title("AUC Score")
    save_plot(f"{save_path}/auc_comparison.png")


In [28]:
def generate_all_images(X, y, X_seq, models):
    """
    X     -> vectorized features (TF-IDF) for ML models
    X_seq -> padded sequences for CNN/LSTM
    y     -> labels
    models -> dict of loaded models
    """

    for split_name, test_size in SPLITS.items():
        print(f"\nGenerating images for split: {split_name}")

        # üîπ create indices ONCE
        indices = np.arange(len(y))

        train_idx, test_idx = train_test_split(
            indices,
            test_size=test_size,
            random_state=42,
            stratify=y
        )

        # üîπ ML data
        X_train_ml = X[train_idx]
        X_test_ml = X[test_idx]

        # üîπ DL data
        X_train_dl = X_seq[train_idx]
        X_test_dl = X_seq[test_idx]

        y_train = y.iloc[train_idx]
        y_test = y.iloc[test_idx]

        for model_name in MODEL_NAMES:
            print(f"  ‚Üí Model: {model_name}")

            save_path = f"{BASE_SAVE_PATH}/{split_name}/{model_name}"
            make_dir(save_path)

            # ----- Common plots -----
            plot_data_distribution(y_train, save_path)
            plot_split_ratio(len(y_train), len(y_test), save_path)

            model = models[model_name]

            # ----- Predictions -----
            if model_name in ["cnn", "lstm"]:
                y_prob = model.predict(X_test_dl).ravel()
                y_pred = (y_prob > 0.5).astype(int)
            else:
                y_prob = model.predict_proba(X_test_ml)[:, 1]
                y_pred = model.predict(X_test_ml)

            # ----- Evaluation plots -----
            plot_confusion(y_test, y_pred, save_path)
            auc_score = plot_roc_auc(y_test, y_prob, save_path)
            plot_auc_bar(auc_score, save_path)

        print(f"Completed split: {split_name}")


In [34]:
import joblib

vectorizer = joblib.load("../backend/flask_api/models/tfidf_vectorizer.pkl")

texts = df["job_description"].astype(str)
y = df["label"]

X = vectorizer.transform(texts)   # ‚úÖ THIS is what ML models need


In [30]:
print(type(X))
print(X[:2])

<class 'pandas.core.series.Series'>
0    Marketing Intern Food52, a fast-growing, James...
1    Customer Service - Cloud Video Production Orga...
Name: text, dtype: object


In [35]:
generate_all_images(X, y, X_seq, MODELS)



Generating images for split: 80_20
  ‚Üí Model: logistic
  ‚Üí Model: random_forest
  ‚Üí Model: cnn
[1m112/112[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 10ms/step
  ‚Üí Model: lstm
[1m112/112[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m10s[0m 88ms/step
Completed split: 80_20

Generating images for split: 70_30
  ‚Üí Model: logistic
  ‚Üí Model: random_forest
  ‚Üí Model: cnn
[1m168/168[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 10ms/step
  ‚Üí Model: lstm
[1m168/168[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m14s[0m 85ms/step
Completed split: 70_30

Generating images for split: 60_40
  ‚Üí Model: logistic
  ‚Üí Model: random_forest
  ‚Üí Model: cnn
[1m224/224[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 9ms/step
  ‚Üí Model: lstm
[1m224/224[