In [2]:
%pip install pandas==1.5.3 numpy==1.26.4 scikit-learn==1.2.2 shap==0.41.0 matplotlib==3.7.1 joblib==1.2.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

In [5]:
%pip install setuptools

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [7]:
import joblib

In [13]:
print("Configuring...")
DATA_PATH = "data/dataset.csv"
TARGET = "FAKE"
RANDOM_STATE = 42
N_SPLITS = 5
RESULTS_DIR = "model_results/classifiers"
MODELS_DIR = "models/classifiers"
EXCLUDED_COLS = ["time"]
os.makedirs(RESULTS_DIR, exist_ok=True)

Configuring...


In [14]:
df = pd.read_csv(DATA_PATH)
print("Data shape:", df.shape)

Data shape: (10000, 5)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Use only the title and cap TF-IDF features to 10
if "title" not in df.columns:
    raise KeyError("Column 'title' not found in dataframe")

vectorizer_title = TfidfVectorizer(max_features=10, ngram_range=(1,2), stop_words='english')
X_title_tfidf = vectorizer_title.fit_transform(df['title'].astype(str))

# Choose SVD components (<=10, at least 1)
n_comp_title = max(1, min(10, X_title_tfidf.shape[1] - 1))
svd_title = TruncatedSVD(n_components=n_comp_title, random_state=RANDOM_STATE)
X_title_reduced = svd_title.fit_transform(X_title_tfidf)

X = pd.DataFrame(X_title_reduced, index=df.index, columns=[f"svd_title_{i}" for i in range(X_title_reduced.shape[1])])

y = df[TARGET]

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
results = {}

print(f"Prepared features (title only, capped to 10 TF-IDF features): X={X.shape}, X_scaled={X_scaled.shape}")


Prepared features (title only, capped to 10 TF-IDF features): X=(10000, 9), X_scaled=(10000, 9)


In [19]:
def return_results(model, use_scaled=False):
    X_data = X_scaled if use_scaled else X
    accuracy = cross_val_score(model, X_data, y, cv=cv, scoring="accuracy")
    precision = cross_val_score(model, X_data, y, cv=cv, scoring="precision")
    recall = cross_val_score(model, X_data, y, cv=cv, scoring="recall")
    f1 = 2 / (precision ** -1 + recall ** -1)
    return accuracy, precision, recall, f1

In [None]:
svm_pipeline = Pipeline([
    ("svm", SVC(probability=True))  # Removed StandardScaler since we pre-scaled
])

param_grid = {
    "svm__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "svm__C": [0.1, 1, 10],
    "svm__gamma": ["scale", "auto"]
}

grid_svm = GridSearchCV(svm_pipeline, param_grid, cv=cv, scoring="accuracy", n_jobs=-1)
grid_svm.fit(X_scaled, y)  # Use scaled data

svm_best = grid_svm.best_estimator_
svm_acc, svm_prec, svm_rec, svm_f1 = return_results(svm_best, use_scaled=True)

print("SVM Best Params:", grid_svm.best_params_)
print("SVM Accuracy: %.4f ± %.4f" % (svm_acc.mean(), svm_acc.std()))
print("SVM Precision: %.4f ± %.4f" % (svm_prec.mean(), svm_prec.std()))
print("SVM Recall: %.4f ± %.4f" % (svm_rec.mean(), svm_rec.std()))
print("SVM F1 Score: %.4f ± %.4f" % (svm_f1.mean(), svm_f1.std()))

joblib.dump(svm_best, os.path.join(MODELS_DIR, "svm_model.pkl"))

In [None]:
knn_pipeline = Pipeline([
    ("knn", KNeighborsClassifier())  # Removed StandardScaler since we pre-scaled
])

param_grid_knn = {
    "knn__n_neighbors": [3, 5, 7, 9],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]
}

grid_knn = GridSearchCV(knn_pipeline, param_grid_knn, cv=cv, scoring="accuracy", n_jobs=-1)
grid_knn.fit(X_scaled, y)  # Use scaled data

knn_best = grid_knn.best_estimator_
knn_acc, knn_prec, knn_rec, knn_f1 = return_results(knn_best, use_scaled=True)

print("KNN Best Params:", grid_knn.best_params_)
print("KNN Accuracy: %.4f ± %.4f" % (knn_acc.mean(), knn_acc.std()))
print("KNN Precision: %.4f ± %.4f" % (knn_prec.mean(), knn_prec.std()))
print("KNN Recall: %.4f ± %.4f" % (knn_rec.mean(), knn_rec.std()))
print("KNN F1 Score: %.4f ± %.4f" % (knn_f1.mean(), knn_f1.std()))

joblib.dump(knn_best, os.path.join(MODELS_DIR, "knn_model.pkl"))

KNN Best Params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
KNN Accuracy: 0.9802 ± 0.0031
KNN Precision: 0.9766 ± 0.0038
KNN Recall: 0.9598 ± 0.0075
KNN F1 Score: 0.9681 ± 0.0050


['models/classifiers\\knn_model.pkl']

In [None]:
dt = DecisionTreeClassifier(random_state=42)
dt_acc, dt_prec, dt_rec, dt_f1 = return_results(dt)

print("Decision Tree Accuracy: %.4f ± %.4f" % (dt_acc.mean(), dt_acc.std()))
print("Decision Tree Precision: %.4f ± %.4f" % (dt_prec.mean(), dt_prec.std()))
print("Decision Tree Recall: %.4f ± %.4f" % (dt_rec.mean(), dt_rec.std()))
print("Decision Tree F1 Score: %.4f ± %.4f" % (dt_f1.mean(), dt_f1.std()))

dt.fit(X, y)
joblib.dump(dt, os.path.join(MODELS_DIR, "decision_tree_model.pkl"))

Decision Tree Accuracy: 0.9782 ± 0.0038
Decision Tree Precision: 0.9657 ± 0.0109
Decision Tree Recall: 0.9649 ± 0.0073
Decision Tree F1 Score: 0.9652 ± 0.0060


['models/classifiers\\decision_tree_model.pkl']

In [None]:
%pip install xgboost lightgbm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# --- XGBoost ---
xgb_pipeline = Pipeline([("xgb", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE))])

param_grid_xgb = {
    "xgb__n_estimators": [100, 200],
    "xgb__max_depth": [3, 6],
    "xgb__learning_rate": [0.01, 0.1],
    "xgb__subsample": [0.8, 1.0],
    "xgb__colsample_bytree": [0.8, 1.0],
}

grid_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=cv, scoring="accuracy", n_jobs=-1)
grid_xgb.fit(X, y)

xgb_best = grid_xgb.best_estimator_
xgb_acc, xgb_prec, xgb_rec, xgb_f1 = return_results(xgb_best)

print("XGBoost Best Params:", grid_xgb.best_params_)
print("XGBoost Results:")
print("  Accuracy: %.4f ± %.4f" % (xgb_acc.mean(), xgb_acc.std()))
print("  Precision: %.4f ± %.4f" % (xgb_prec.mean(), xgb_prec.std()))
print("  Recall: %.4f ± %.4f" % (xgb_rec.mean(), xgb_rec.std()))
print("  F1-Score: %.4f ± %.4f" % (xgb_f1.mean(), xgb_f1.std()))

xgb_best.fit(X, y)
joblib.dump(xgb_best, os.path.join(MODELS_DIR, "xgboost_model.pkl"))

# --- LightGBM ---
lgb_pipeline = Pipeline([("lgb", LGBMClassifier(random_state=RANDOM_STATE))])

param_grid_lgb = {
    "lgb__n_estimators": [100, 200],
    "lgb__num_leaves": [31, 63],
    "lgb__learning_rate": [0.01, 0.1],
    "lgb__subsample": [0.8, 1.0],
    "lgb__colsample_bytree": [0.8, 1.0],
}

grid_lgb = GridSearchCV(lgb_pipeline, param_grid_lgb, cv=cv, scoring="accuracy", n_jobs=-1)
grid_lgb.fit(X, y)

lgb_best = grid_lgb.best_estimator_
lgb_acc, lgb_prec, lgb_rec, lgb_f1 = return_results(lgb_best)

print("LightGBM Best Params:", grid_lgb.best_params_)
print("LightGBM Results:")
print("  Accuracy: %.4f ± %.4f" % (lgb_acc.mean(), lgb_acc.std()))
print("  Precision: %.4f ± %.4f" % (lgb_prec.mean(), lgb_prec.std()))
print("  Recall: %.4f ± %.4f" % (lgb_rec.mean(), lgb_rec.std()))
print("  F1-Score: %.4f ± %.4f" % (lgb_f1.mean(), lgb_f1.std()))

lgb_best.fit(X, y)
joblib.dump(lgb_best, os.path.join(MODELS_DIR, "lightgbm_model.pkl"))

XGBoost Best Params: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
XGBoost Results:
  Accuracy: 0.9904 ± 0.0019
  Precision: 0.9897 ± 0.0024
  Recall: 0.9796 ± 0.0048
  F1-Score: 0.9846 ± 0.0030
[LightGBM] [Info] Number of positive: 1568, number of negative: 3432
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 542
[LightGBM] [Info] Number of data points in the train set: 5000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.313600 -> initscore=-0.783342
[LightGBM] [Info] Start training from score -0.783342
[LightGBM] [Info] Number of positive: 1255, number of negative: 2745
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 secon

['models/classifiers\\lightgbm_model.pkl']

In [None]:
from sklearn.metrics import roc_auc_score

# Predict probabilities (positive class) and compute AUROC
xgb_probs = xgb_best.predict_proba(X)[:, 1]
lgb_probs = lgb_best.predict_proba(X)[:, 1]

xgb_auc = roc_auc_score(y, xgb_probs)
lgb_auc = roc_auc_score(y, lgb_probs)

print(f"XGBoost AUROC: {xgb_auc:.4f}")
print(f"LightGBM AUROC: {lgb_auc:.4f}")

# store results if needed
results.update({"xgb_auroc": xgb_auc, "lgb_auroc": lgb_auc})

XGBoost AUROC: 1.0000
LightGBM AUROC: 1.0000


In [21]:
rf = RandomForestClassifier(random_state=42, n_estimators=200)
rf_acc, rf_prec, rf_rec, rf_f1 = return_results(rf)

print("Random Forest Accuracy: %.4f ± %.4f" % (rf_acc.mean(), rf_acc.std()))
print("Random Forest Precision: %.4f ± %.4f" % (rf_prec.mean(), rf_prec.std()))
print("Random Forest Recall: %.4f ± %.4f" % (rf_rec.mean(), rf_rec.std()))
print("Random Forest F1 Score: %.4f ± %.4f" % (rf_f1.mean(), rf_f1.std()))

rf.fit(X, y)
joblib.dump(rf, os.path.join(MODELS_DIR, "random_forest_model.pkl"))

Random Forest Accuracy: 0.7457 ± 0.0054
Random Forest Precision: 0.8095 ± 0.0742
Random Forest Recall: 0.6596 ± 0.0699
Random Forest F1 Score: 0.7200 ± 0.0235


['models/classifiers\\random_forest_model.pkl']

In [None]:
logreg_pipeline = Pipeline([
    ("logreg", LogisticRegression(max_iter=1000, solver="liblinear"))  # Removed StandardScaler since we pre-scaled
])

logreg_acc, logreg_prec, logreg_rec, logreg_f1 = return_results(logreg_pipeline, use_scaled=True)

print("Logistic Regression Accuracy: %.4f ± %.4f" % (logreg_acc.mean(), logreg_acc.std()))
print("Logistic Regression Precision: %.4f ± %.4f" % (logreg_prec.mean(), logreg_prec.std()))
print("Logistic Regression Recall: %.4f ± %.4f" % (logreg_rec.mean(), logreg_rec.std()))
print("Logistic Regression F1 Score: %.4f ± %.4f" % (logreg_f1.mean(), logreg_f1.std()))

logreg_pipeline.fit(X_scaled, y)
joblib.dump(logreg_pipeline, os.path.join(MODELS_DIR, "logistic_regression_model.pkl"))

Logistic Regression Accuracy: 0.7658 ± 0.0071
Logistic Regression Precision: 0.6888 ± 0.0254
Logistic Regression Recall: 0.4643 ± 0.0203
Logistic Regression F1 Score: 0.5541 ± 0.0133


['models/classifiers\\logistic_regression_model.pkl']

In [None]:
# Save Accuracy Results
results = {
    "SVM": {"Accuracy": (svm_acc.mean(), svm_acc.std()),
            "Precision": (svm_prec.mean(), svm_prec.std()),
            "Recall": (svm_rec.mean(), svm_rec.std()),
            "F1 Score": (svm_f1.mean(), svm_f1.std())},
    "KNN": {"Accuracy": (knn_acc.mean(), knn_acc.std()),
            "Precision": (knn_prec.mean(), knn_prec.std()),
            "Recall": (knn_rec.mean(), knn_rec.std()),
            "F1 Score": (knn_f1.mean(), knn_f1.std())},
    "Decision Tree": {"Accuracy": (dt_acc.mean(), dt_acc.std()),
                     "Precision": (dt_prec.mean(), dt_prec.std()),
                     "Recall": (dt_rec.mean(), dt_rec.std()),
                     "F1 Score": (dt_f1.mean(), dt_f1.std())},
    "Random Forest": {"Accuracy": (rf_acc.mean(), rf_acc.std()),
                     "Precision": (rf_prec.mean(), rf_prec.std()),
                     "Recall": (rf_rec.mean(), rf_rec.std()),
                     "F1 Score": (rf_f1.mean(), rf_f1.std())},
    "Logistic Regression": {"Accuracy": (logreg_acc.mean(), logreg_acc.std()),
                           "Precision": (logreg_prec.mean(), logreg_prec.std()),
                           "Recall": (logreg_rec.mean(), logreg_rec.std()),
                           "F1 Score": (logreg_f1.mean(), logreg_f1.std())},
    "Cox Proportional Hazards": (c_index, 0)  # Concordance index doesn't have std deviation
}

for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    if type(metrics) != dict:
        print(f"  Concordance Index: {metrics[0]:.4f}")
    else:
        print(f"  Accuracy: {metrics['Accuracy'][0]:.4f} ± {metrics['Accuracy'][1]:.4f}")
        print(f"  Precision: {metrics['Precision'][0]:.4f} ± {metrics['Precision'][1]:.4f}")
        print(f"  Recall: {metrics['Recall'][0]:.4f} ± {metrics['Recall'][1]:.4f}")
        print(f"  F1-Score: {metrics['F1 Score'][0]:.4f} ± {metrics['F1 Score'][1]:.4f}")

joblib.dump(results, os.path.join(RESULTS_DIR, "model_accuracies.pkl"))


SVM:
  Accuracy: 0.9656 ± 0.0047
  Precision: 0.9458 ± 0.0080
  Recall: 0.9445 ± 0.0127
  F1-Score: 0.9451 ± 0.0078

KNN:
  Accuracy: 0.9802 ± 0.0031
  Precision: 0.9766 ± 0.0038
  Recall: 0.9598 ± 0.0075
  F1-Score: 0.9681 ± 0.0050

Decision Tree:
  Accuracy: 0.9782 ± 0.0038
  Precision: 0.9657 ± 0.0109
  Recall: 0.9649 ± 0.0073
  F1-Score: 0.9652 ± 0.0060

Random Forest:
  Accuracy: 0.9888 ± 0.0012
  Precision: 0.9909 ± 0.0047
  Recall: 0.9732 ± 0.0043
  F1-Score: 0.9820 ± 0.0019

Logistic Regression:
  Accuracy: 0.7658 ± 0.0071
  Precision: 0.6888 ± 0.0254
  Recall: 0.4643 ± 0.0203
  F1-Score: 0.5541 ± 0.0133

Cox Proportional Hazards:
  Concordance Index: 0.7545


['model_results/classifiers\\model_accuracies.pkl']

In [None]:
%pip install lifelines




In [None]:
%pip uninstall llvmlite numba -y
%pip install llvmlite numba --force-reinstall

Note: you may need to restart the kernel to use updated packages.Found existing installation: llvmlite 0.45.1
Uninstalling llvmlite-0.45.1:
  Successfully uninstalled llvmlite-0.45.1
Found existing installation: numba 0.62.1
Uninstalling numba-0.62.1:
  Successfully uninstalled numba-0.62.1



You can safely remove it manually.
You can safely remove it manually.
You can safely remove it manually.


Collecting llvmlite
  Using cached llvmlite-0.45.1-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting numba
  Using cached numba-0.62.1-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting numpy<2.4,>=1.22 (from numba)
  Using cached numpy-2.3.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached llvmlite-0.45.1-cp312-cp312-win_amd64.whl (38.1 MB)
Using cached numba-0.62.1-cp312-cp312-win_amd64.whl (2.7 MB)
Using cached numpy-2.3.3-cp312-cp312-win_amd64.whl (12.8 MB)
Installing collected packages: numpy, llvmlite, numba

  Attempting uninstall: numpy

    Found existing installation: numpy 1.26.4

   ---------------------------------------- 0/3 [numpy]
    Uninstalling numpy-1.26.4:
   ---------------------------------------- 0/3 [numpy]
   ---------------------------------------- 0/3 [numpy]
   ---------------------------------------- 0/3 [numpy]
   ---------------------------------------- 0/3 [numpy]
   ---------------------------------------- 0/3 [numpy]
   ---------------

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.3.3 which is incompatible.
langchain 0.2.13 requires numpy<2.0.0,>=1.26.0; python_version >= "3.12", but you have numpy 2.3.3 which is incompatible.
langchain-community 0.0.29 requires langchain-core<0.2.0,>=0.1.33, but you have langchain-core 0.2.30 which is incompatible.
langchain-community 0.0.29 requires numpy<2,>=1, but you have numpy 2.3.3 which is incompatible.
llama-index-readers-file 0.1.12 requires pypdf<5.0.0,>=4.0.1, but you have pypdf 5.2.0 which is incompatible.
matplotlib 3.8.3 requires numpy<2,>=1.21, but you have numpy 2.3.3 which is incompatible.
pandas 2.2.1 requires numpy<2,>=1.26.0; python_version >= "3.12", but you have numpy 2.3.3 which is incompatible.
sc