Lie Detector Classification (LIAR dataset)




In [1]:
import csv
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

# Silence every UserWarning raised from this point on (the filter is process-wide).
warnings.filterwarnings("ignore", category=UserWarning)
# Seed handed to the estimators and the CV splitter as `random_state`.
RANDOM_STATE = 42
# Directory (relative to the working directory) that holds train.tsv / valid.tsv / test.tsv.
DATA_DIR = Path("data")


In [2]:
# 1. Load the pre-split TSV files
train_path = DATA_DIR / "train.tsv"
valid_path = DATA_DIR / "valid.tsv"
test_path = DATA_DIR / "test.tsv"

# LIAR dataset official columns (files have no header row)
liar_cols = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "job_title",
    "state_info",
    "party_affiliation",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context",
]


def load_liar_split(path):
    """Read one header-less LIAR TSV split into a DataFrame named by ``liar_cols``.

    Quote handling is switched off (``csv.QUOTE_NONE``). The statements are free
    text and can contain a lone double quote; with pandas' default quoting such a
    field is treated as "open" and the parser keeps consuming tabs and newlines
    until the next quote, silently merging several records into one row.
    With quoting disabled every physical line is one record and quote characters
    stay in the text as ordinary characters.

    Parameters
    ----------
    path : str or Path
        Location of the tab-separated split file.

    Returns
    -------
    pd.DataFrame
        One row per line of the file, columns named after ``liar_cols``.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=liar_cols,
        quoting=csv.QUOTE_NONE,
    )


train_df = load_liar_split(train_path)
valid_df = load_liar_split(valid_path)
test_df = load_liar_split(test_path)

# NOTE(review): the LIAR paper reports 10,269 / 1,284 / 1,283 statements for
# train / valid / test -- confirm the shapes printed below against those numbers.
print("Loaded shapes and columns:")
for name, df in [("train", train_df), ("valid", valid_df), ("test", test_df)]:
    print(name, df.shape, list(df.columns))

train_df.head()


Loaded shapes and columns:
train (10240, 14) ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']
valid (1284, 14) ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']
test (1267, 14) ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']


Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [3]:
# 2. Clean data and map labels to binary (Lie=1, Truth=0)
label_map = {
    # The three "untruthful" ratings collapse to Lie (1) ...
    **dict.fromkeys(("pants-fire", "false", "barely-true"), 1),
    # ... and the three "truthful" ratings collapse to Truth (0).
    **dict.fromkeys(("half-true", "mostly-true", "true"), 0),
}


def clean_and_map(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with an integer ``target`` column added.

    Steps, in order:
      1. drop rows whose ``statement`` or ``label`` is missing;
      2. drop rows that repeat an earlier (``statement``, ``label``) pair;
      3. translate ``label`` through ``label_map`` into ``target``;
      4. drop rows whose label has no entry in ``label_map``;
      5. cast ``target`` to ``int``.

    The input frame is left untouched and the surviving rows keep their
    original index labels.
    """
    key_cols = ["statement", "label"]
    mapped = (
        df.dropna(subset=key_cols)
        .drop_duplicates(subset=key_cols)
        .assign(target=lambda frame: frame["label"].map(label_map))
        .dropna(subset=["target"])
    )
    # Unmapped labels produced NaN (float) above; once they are gone the cast is safe.
    return mapped.astype({"target": int})

# Clean the three splits with the same routine.
train_df, valid_df, test_df = [
    clean_and_map(split) for split in (train_df, valid_df, test_df)
]

# Report the remaining size and the truth/lie share of every split.
print("After cleaning:")
for split_name, split_df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    class_share = split_df["target"].value_counts(normalize=True)
    class_share = class_share.rename(index={0: "truth", 1: "lie"})
    print(split_name, split_df.shape, class_share.to_dict())


After cleaning:
train (10229, 15) {'truth': 0.5619317626356437, 'lie': 0.43806823736435624}
valid (1284, 15) {'truth': 0.5202492211838006, 'lie': 0.4797507788161994}
test (1267, 15) {'truth': 0.56353591160221, 'lie': 0.43646408839779005}


In [4]:
# Model input is the raw statement text; the label is the binary `target` column.
X_train = train_df["statement"]
y_train = train_df["target"]

X_valid = valid_df["statement"]
y_valid = valid_df["target"]

X_test = test_df["statement"]
y_test = test_df["target"]


In [None]:
# 3. Define pipelines and hyperparameter grids for classical ML models

# TF-IDF settings shared by every candidate; each is a single-value list, so the
# grid search only varies the classifier options.
base_tfidf_params = {
    "tfidf__max_features": [5000],
    "tfidf__ngram_range": [(1, 2)],
    "tfidf__stop_words": ["english"],
    "tfidf__min_df": [2],
}


def _tfidf_pipeline(classifier):
    """Build the two-step pipeline: ``"tfidf"`` vectorizer followed by ``"clf"``."""
    return Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", classifier),
    ])


def _param_grid(**clf_options):
    """Return the shared TF-IDF grid plus ``clf_options`` prefixed with ``clf__``."""
    grid = dict(base_tfidf_params)
    for option, values in clf_options.items():
        grid[f"clf__{option}"] = values
    return grid


# Display name -> (untrained pipeline, parameter grid to search).
models_and_grids = {
    "Logistic Regression": (
        _tfidf_pipeline(
            LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE)
        ),
        _param_grid(C=[0.5, 1.0, 2.0], penalty=["l2"]),
    ),
    "Linear SVM": (
        _tfidf_pipeline(LinearSVC(class_weight="balanced", random_state=RANDOM_STATE)),
        _param_grid(C=[0.5, 1.0, 2.0]),
    ),
    "Naive Bayes": (
        _tfidf_pipeline(ComplementNB()),
        _param_grid(alpha=[0.5, 1.0, 2.0]),
    ),
    "KNN": (
        _tfidf_pipeline(KNeighborsClassifier()),
        _param_grid(n_neighbors=[3, 5, 7], weights=["uniform", "distance"]),
    ),
    "Decision Tree": (
        _tfidf_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE)),
        _param_grid(max_depth=[10, 30, None], min_samples_split=[2, 5]),
    ),
    "Random Forest": (
        _tfidf_pipeline(
            RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight="balanced")
        ),
        _param_grid(n_estimators=[100, 200], max_depth=[None, 30]),
    ),
    "Gradient Boosting": (
        _tfidf_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE)),
        _param_grid(n_estimators=[100, 200], learning_rate=[0.05, 0.1]),
    ),
}

# Seeded, shuffled 3-fold stratified splitter reused by every grid search.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)


def compute_prob_scores(model, X):
    """Produce one ranking score per sample of ``X`` for ROC-AUC.

    Preference order:
      1. ``predict_proba`` -- second column of the returned array;
      2. ``decision_function`` -- returned as is;
      3. ``predict`` -- hard predictions, used only when neither of the above exists.
    """
    for method_name in ("predict_proba", "decision_function"):
        if not hasattr(model, method_name):
            continue
        raw_scores = getattr(model, method_name)(X)
        if method_name == "predict_proba":
            # Keep only the column at index 1.
            return raw_scores[:, 1]
        return raw_scores
    # Neither scoring method is available: fall back to the predicted labels.
    return model.predict(X)


def evaluate_model(name, pipeline, param_grid):
    """Tune ``pipeline`` by grid search on the training split and score it on validation.

    The search uses the notebook-level ``cv`` splitter with F1 scoring and is
    fitted on ``X_train`` / ``y_train``. The refit best estimator is then
    evaluated on ``X_valid`` / ``y_valid``. The best parameters and a
    classification report are printed as a side effect.

    Note: the data splits and ``cv`` are read from notebook globals, so the
    cells defining them must have been run first.

    Parameters
    ----------
    name : str
        Display name of the model; stored under the ``"model"`` key.
    pipeline : estimator
        Untrained pipeline handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid to search.

    Returns
    -------
    tuple
        ``(best_model, metrics)`` where ``best_model`` is
        ``grid.best_estimator_`` and ``metrics`` is a dict with the keys
        ``model``, ``best_params``, ``valid_accuracy``, ``valid_precision``,
        ``valid_recall``, ``valid_f1`` and ``valid_roc_auc`` (``NaN`` when the
        AUC could not be computed).
    """
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring="f1",
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_valid)
    y_scores = compute_prob_scores(best_model, X_valid)

    metrics = {
        "model": name,
        "best_params": grid.best_params_,
        "valid_accuracy": accuracy_score(y_valid, y_pred),
        "valid_precision": precision_score(y_valid, y_pred),
        "valid_recall": recall_score(y_valid, y_pred),
        "valid_f1": f1_score(y_valid, y_pred),
    }
    try:
        metrics["valid_roc_auc"] = roc_auc_score(y_valid, y_scores)
    except Exception as exc:
        # Stay best-effort (one failing metric must not abort the comparison),
        # but say why the value is missing instead of hiding the error.
        # print() is used because UserWarning is filtered out in this notebook.
        print(
            f"[{name}] ROC-AUC could not be computed "
            f"({type(exc).__name__}: {exc}); recording NaN."
        )
        metrics["valid_roc_auc"] = np.nan

    print(f"\n{name} best params: {grid.best_params_}")
    print(classification_report(y_valid, y_pred, target_names=["Truth", "Lie"]))
    return best_model, metrics



In [6]:
# 4. Train, tune, and evaluate on validation set
model_results, best_models = [], {}

for model_name, (candidate, search_space) in models_and_grids.items():
    fitted_model, valid_metrics = evaluate_model(model_name, candidate, search_space)
    model_results.append(valid_metrics)
    best_models[model_name] = fitted_model

# One row per model, best validation F1 first.
results_df = (
    pd.DataFrame(model_results)
    .sort_values(by="valid_f1", ascending=False)
    .reset_index(drop=True)
)
results_df





Logistic Regression best params: {'clf__C': 0.5, 'clf__penalty': 'l2', 'tfidf__max_features': 5000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
              precision    recall  f1-score   support

       Truth       0.62      0.59      0.61       668
         Lie       0.58      0.61      0.60       616

    accuracy                           0.60      1284
   macro avg       0.60      0.60      0.60      1284
weighted avg       0.60      0.60      0.60      1284


Linear SVM best params: {'clf__C': 0.5, 'tfidf__max_features': 5000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
              precision    recall  f1-score   support

       Truth       0.60      0.57      0.58       668
         Lie       0.56      0.59      0.57       616

    accuracy                           0.58      1284
   macro avg       0.58      0.58      0.58      1284
weighted avg       0.58      0.58      0.58      1284


Naive Baye

Unnamed: 0,model,best_params,valid_accuracy,valid_precision,valid_recall,valid_f1,valid_roc_auc
0,Random Forest,"{'clf__max_depth': 30, 'clf__n_estimators': 20...",0.625389,0.607656,0.618506,0.613033,0.669928
1,Logistic Regression,"{'clf__C': 0.5, 'clf__penalty': 'l2', 'tfidf__...",0.602804,0.581538,0.613636,0.597156,0.655907
2,Naive Bayes,"{'clf__alpha': 0.5, 'tfidf__max_features': 500...",0.601246,0.583601,0.589286,0.58643,0.645406
3,Linear SVM,"{'clf__C': 0.5, 'tfidf__max_features': 5000, '...",0.577103,0.556068,0.587662,0.571429,0.620694
4,Decision Tree,"{'clf__max_depth': None, 'clf__min_samples_spl...",0.558411,0.542609,0.506494,0.523929,0.562645
5,KNN,"{'clf__n_neighbors': 7, 'clf__weights': 'dista...",0.558411,0.5727,0.313312,0.405037,0.582294
6,Gradient Boosting,"{'clf__learning_rate': 0.1, 'clf__n_estimators...",0.595016,0.686047,0.287338,0.405034,0.652465


In [7]:
# 5. Pick best model by validation F1 and evaluate on test set
# Select on the metric itself rather than on row position, so this cell gives
# the same answer even if `results_df` was re-sorted or rebuilt since the
# comparison cell ran.
best_row = results_df.loc[results_df["valid_f1"].idxmax()]
best_name = best_row["model"]
best_model = best_models[best_name]

print(f"Selected best model: {best_name}")

y_test_pred = best_model.predict(X_test)
y_test_scores = compute_prob_scores(best_model, X_test)

test_metrics = {
    "test_accuracy": accuracy_score(y_test, y_test_pred),
    "test_precision": precision_score(y_test, y_test_pred),
    "test_recall": recall_score(y_test, y_test_pred),
    "test_f1": f1_score(y_test, y_test_pred),
}
try:
    test_metrics["test_roc_auc"] = roc_auc_score(y_test, y_test_scores)
except Exception as exc:
    # Keep the cell best-effort, but report why the AUC is missing instead of
    # silently recording NaN.
    print(
        f"[{best_name}] test ROC-AUC could not be computed "
        f"({type(exc).__name__}: {exc}); recording NaN."
    )
    test_metrics["test_roc_auc"] = np.nan

print("\nTest classification report:")
print(classification_report(y_test, y_test_pred, target_names=["Truth", "Lie"]))

test_metrics


Selected best model: Random Forest

Test classification report:
              precision    recall  f1-score   support

       Truth       0.66      0.63      0.64       714
         Lie       0.55      0.58      0.56       553

    accuracy                           0.61      1267
   macro avg       0.60      0.60      0.60      1267
weighted avg       0.61      0.61      0.61      1267



{'test_accuracy': 0.6077348066298343,
 'test_precision': 0.5477815699658704,
 'test_recall': 0.5804701627486437,
 'test_f1': 0.5636523266022827,
 'test_roc_auc': 0.6401851879992504}

In [8]:
# 6. Persist artifacts
comparison_path, best_model_path = map(Path, ("model_comparison.csv", "best_model.pkl"))

# Write both files first: the validation comparison table as CSV (without the
# index column) and the selected fitted pipeline via joblib.
results_df.to_csv(comparison_path, index=False)
joblib.dump(best_model, best_model_path)

# Then report the absolute location of each artifact.
for saved_message in (
    f"Saved comparison table -> {comparison_path.resolve()}",
    f"Saved best model -> {best_model_path.resolve()}",
):
    print(saved_message)


Saved comparison table -> C:\Users\PC\Documents\school\ML assignment\model_comparison.csv
Saved best model -> C:\Users\PC\Documents\school\ML assignment\best_model.pkl


### Notes
- TF-IDF is fit only on the training split to avoid leakage.
- All models use 3-fold Stratified CV with F1 scoring for tuning.
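- `model_comparison.csv` holds the validation metrics; `best_model.pkl` stores the tuned pipeline, ready for inference via `predict`. `predict_proba` is available only when the selected classifier provides it (for example, Linear SVM exposes `decision_function` instead).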
