In [83]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, roc_auc_score, recall_score, precision_score,
    confusion_matrix)

RANDOM_STATE = 42



In [84]:
# 2) Load cleaned dataset (must contain 'Ground_Truth' and 'full_text')
#    Change the path if your file lives elsewhere.
CSV_PATH = "fake_job_postings.csv"
assert os.path.exists(CSV_PATH), f"Could not find {CSV_PATH}. Put it next to the notebook or update CSV_PATH."

df = pd.read_csv(CSV_PATH)

# Basic sanity checks
required_cols = {"Ground_Truth", "full_text", "has_company_logo", "has_questions"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}. Ensure you used the cleaned file with these fields.")

#show head
df.head()

Unnamed: 0,Ground_Truth,job_id,title,company_profile,description,requirements,benefits,has_company_logo,has_questions,full_text
0,0,5231,SEM Coordinator,urlcaaedbcabfaaecaeceadcd is a modern online t...,the right candidate will be responsible for co...,previous experience of minimum years in sem wi...,attractive remuneration package work in an int...,1,1,SEM Coordinator urlcaaedbcabfaaecaeceadcd is a...
1,0,14113,Senior Data Scientist,as a growing and successful startup conversoci...,conversocial builds software that helps compan...,good working knowledge of python or similar la...,salary of k plus stock optionsk annual confere...,1,0,Senior Data Scientist as a growing and success...
2,0,3169,Junior Web Marketing Specialist,atnet communications ae x xx x interactive mar...,atnet communications junior web marketing spec...,xd xhandson x x adwords accounts google analyt...,h x x xxd x x,1,1,Junior Web Marketing Specialist atnet communic...
3,0,14805,New Product Development Project Leader - Full ...,we provide full time permanent positions for m...,coordination and project management of new pro...,location atlanta ga usajob type permanentjob r...,no information provided,0,0,New Product Development Project Leader - Full ...
4,0,5810,Data Intern - Retail & Apparel Analysis,we build software for fashion retailers to hel...,about editdeditds software is the market leade...,no information provided,no information provided,1,1,Data Intern - Retail & Apparel Analysis we bui...


In [85]:
# 3) Optional: simple numeric side-features
#    These are quick, informative signals to complement TF-IDF text:
df["word_count"] = df["full_text"].apply(lambda x: len(str(x).split()))
df["char_count"] = df["full_text"].apply(lambda x: len(str(x)))

In [86]:
# 4) Select features/target
TEXT_COL = "full_text"
NUM_COLS = ["has_company_logo", "has_questions", "word_count", "char_count"]
TARGET = "Ground_Truth"

X = df[[TEXT_COL] + NUM_COLS].copy()
y = df[TARGET].astype(int)


In [87]:
# 5) Train/Test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

In [88]:


# 6) ColumnTransformer for text + numeric branches
text_transformer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=False))  
])

preprocess = ColumnTransformer(
    transformers=[
        ("text", text_transformer, TEXT_COL),
        ("num", num_transformer, NUM_COLS),
    ],
    remainder="drop"
)


In [89]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Linear SVM": LinearSVC(class_weight="balanced", random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
}



In [90]:
# 8) Helper to compute metrics (including specificity)
def evaluate_metrics(y_true, y_pred, y_score=None):
    """
    y_score: probabilities or decision scores for positive class (1), used for AUC.
    """
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)          
    prec = precision_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    spec = tn / (tn + fp) if (tn + fp) > 0 else np.nan
    auc = roc_auc_score(y_true, y_score) if y_score is not None else np.nan
    return acc, auc, rec, prec, spec


In [91]:
# 9) Train, predict, evaluate
results = []
for name, clf in models.items():
    pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("clf", clf)
    ])
    pipe.fit(X_train, y_train)

    # Predictions
    y_pred = pipe.predict(X_test)

    # Decision scores / probabilities for AUC:
    y_score = None
    # Try predict_proba
    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        # column 1 is the positive class (Ground_Truth==1)
        y_score = pipe.predict_proba(X_test)[:, 1]
    # else try decision_function (e.g., LinearSVC returns distance to hyperplane)
    elif hasattr(pipe.named_steps["clf"], "decision_function"):
        raw = pipe.decision_function(X_test)
        # Scale to [0,1] with a simple min-max to approximate probability-like scores for ROC-AUC
        # ( comparing models; ROC-AUC only needs ranking.)
        raw_min, raw_max = raw.min(), raw.max()
        if raw_max > raw_min:
            y_score = (raw - raw_min) / (raw_max - raw_min)
        else:
            y_score = np.zeros_like(raw) 

    acc, auc, rec, prec, spec = evaluate_metrics(y_test, y_pred, y_score)
    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "AUC": round(auc, 4) if not np.isnan(auc) else "N/A",
        "Recall (Sensitivity)": round(rec, 4),
        "Precision": round(prec, 4),
        "Specificity": round(spec, 4)
    })




In [92]:
# 10) Display results
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Model,Accuracy,AUC,Recall (Sensitivity),Precision,Specificity
0,Logistic Regression,0.9437,0.9766,0.8205,0.4571,0.9501
1,KNN (k=5),0.9637,0.9026,0.4359,0.7083,0.9908
2,Linear SVM,0.9775,0.9711,0.6923,0.8182,0.9921
3,Random Forest,0.9675,0.9324,0.3333,1.0,1.0
4,Gradient Boosting,0.9712,0.9696,0.4103,1.0,1.0


In [93]:
from pyspark.sql import SparkSession
#Get the current Spark session and Stop it
spark = SparkSession.builder.getOrCreate()
spark.stop()

ModuleNotFoundError: No module named 'pyspark'