#### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

#### Data Preprocessing

In [3]:
# Load dataset
df = pd.read_csv("politifact_factcheck_data.csv")

# Step 1: Filter relevant sources
allowed_sources = ['news', 'blog', 'social_media']
df = df[df['statement_source'].str.lower().isin(allowed_sources)].reset_index(drop=True)

# Step 2: Sentiment using VADER
analyzer = SentimentIntensityAnalyzer()
df['sentiment'] = df['statement'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Step 3: POS ratios
def get_pos_features(text):
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    total = len(tokens)
    return pd.Series({
        'noun_ratio': sum(1 for _, t in tags if t.startswith('NN')) / total if total else 0,
        'verb_ratio': sum(1 for _, t in tags if t.startswith('VB')) / total if total else 0,
        'pronoun_ratio': sum(1 for _, t in tags if t in ['PRP', 'PRP$', 'WP', 'WP$']) / total if total else 0
    })

pos_df = df['statement'].apply(get_pos_features)
df = pd.concat([df, pos_df], axis=1)

# Step 4: Label encoding
df['label'] = df['verdict']
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])

# Step 5: TF-IDF on statements
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['statement'])

# Step 6: Scale numeric features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(df[['sentiment', 'noun_ratio', 'verb_ratio', 'pronoun_ratio']])

# Step 7: Encode categorical metadata
df['statement_source'] = df['statement_source'].fillna("unknown")
df['statement_originator'] = df['statement_originator'].fillna("unknown")

# One-hot encode
df_encoded = pd.get_dummies(df[['statement_source', 'statement_originator']], drop_first=True)

# Ensure same scale
X_categorical = csr_matrix(df_encoded.values)

# Step 8: Combine all features
X_numeric_sparse = csr_matrix(X_numeric)
X = hstack([X_text, X_numeric_sparse, X_categorical])
y = df['label_enc']

# Step 9: Train-validation-test split (as per paper)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (8492, 7865), Val: (944, 7865), Test: (2360, 7865)


#### Model Training

In [4]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Validation performance
y_val_pred = lr.predict(X_val)
print("LR Validation Accuracy:", accuracy_score(y_val, y_val_pred))

# Final test performance
y_test_pred = lr.predict(X_test)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

LR Validation Accuracy: 0.3686440677966102
Logistic Regression Test Accuracy: 0.37203389830508476
              precision    recall  f1-score   support

           0       0.46      0.68      0.55       748
           1       0.23      0.19      0.21       321
           2       0.16      0.10      0.13       345
           3       0.29      0.33      0.31       320
           4       0.53      0.31      0.40       391
           5       0.26      0.18      0.22       235

    accuracy                           0.37      2360
   macro avg       0.32      0.30      0.30      2360
weighted avg       0.35      0.37      0.35      2360



In [5]:
# Re-do TF-IDF only for NB (exclude scaled metadata)
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['statement'])

# Re-split only the TF-IDF features
X_tfidf_temp, X_tfidf_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
X_tfidf_train, X_tfidf_val, y_train, y_val = train_test_split(X_tfidf_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp)

# Now train Naive Bayes safely
nb = MultinomialNB()
nb.fit(X_tfidf_train, y_train)

print("NB Validation Accuracy:", accuracy_score(y_val, nb.predict(X_tfidf_val)))
print("Naive Bayes Test Accuracy:", accuracy_score(y_test, nb.predict(X_tfidf_test)))
print(classification_report(y_test, nb.predict(X_tfidf_test)))

NB Validation Accuracy: 0.3326271186440678
Naive Bayes Test Accuracy: 0.34915254237288135
              precision    recall  f1-score   support

           0       0.35      0.92      0.50       748
           1       0.27      0.09      0.13       321
           2       0.12      0.01      0.02       345
           3       0.37      0.18      0.25       320
           4       0.62      0.10      0.18       391
           5       0.50      0.03      0.05       235

    accuracy                           0.35      2360
   macro avg       0.37      0.22      0.19      2360
weighted avg       0.37      0.35      0.25      2360



In [6]:
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train, y_train)

print("SVM Validation Accuracy:", accuracy_score(y_val, svm.predict(X_val)))
print("SVM Test Accuracy:", accuracy_score(y_test, svm.predict(X_test)))
print(classification_report(y_test, svm.predict(X_test)))


SVM Validation Accuracy: 0.3559322033898305
SVM Test Accuracy: 0.3635593220338983
              precision    recall  f1-score   support

           0       0.46      0.67      0.54       748
           1       0.22      0.21      0.21       321
           2       0.18      0.11      0.13       345
           3       0.27      0.28      0.27       320
           4       0.51      0.32      0.39       391
           5       0.23      0.18      0.20       235

    accuracy                           0.36      2360
   macro avg       0.31      0.29      0.29      2360
weighted avg       0.34      0.36      0.34      2360



In [7]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

print("KNN Validation Accuracy:", accuracy_score(y_val, knn.predict(X_val)))
print("KNN Test Accuracy:", accuracy_score(y_test, knn.predict(X_test)))
print(classification_report(y_test, knn.predict(X_test)))


KNN Validation Accuracy: 0.3093220338983051
KNN Test Accuracy: 0.3207627118644068
              precision    recall  f1-score   support

           0       0.39      0.64      0.49       748
           1       0.22      0.18      0.20       321
           2       0.18      0.12      0.15       345
           3       0.26      0.21      0.23       320
           4       0.33      0.22      0.26       391
           5       0.18      0.10      0.13       235

    accuracy                           0.32      2360
   macro avg       0.26      0.25      0.24      2360
weighted avg       0.29      0.32      0.29      2360



In [8]:
dt = DecisionTreeClassifier(max_depth=10)
dt.fit(X_train, y_train)

print("DT Validation Accuracy:", accuracy_score(y_val, dt.predict(X_val)))
print("Decision Tree Test Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print(classification_report(y_test, dt.predict(X_test)))


DT Validation Accuracy: 0.3294491525423729
Decision Tree Test Accuracy: 0.3305084745762712
              precision    recall  f1-score   support

           0       0.40      0.71      0.51       748
           1       0.17      0.22      0.19       321
           2       0.11      0.01      0.02       345
           3       0.23      0.26      0.24       320
           4       0.43      0.23      0.30       391
           5       0.40      0.01      0.02       235

    accuracy                           0.33      2360
   macro avg       0.29      0.24      0.21      2360
weighted avg       0.31      0.33      0.27      2360



In [9]:
ab = AdaBoostClassifier(n_estimators=100, random_state=42)
ab.fit(X_train, y_train)

print("AB Validation Accuracy:", accuracy_score(y_val, ab.predict(X_val)))
print("AdaBoost Test Accuracy:", accuracy_score(y_test, ab.predict(X_test)))
print(classification_report(y_test, ab.predict(X_test)))


AB Validation Accuracy: 0.3411016949152542
AdaBoost Test Accuracy: 0.3389830508474576
              precision    recall  f1-score   support

           0       0.35      0.80      0.49       748
           1       0.26      0.04      0.07       321
           2       0.00      0.00      0.00       345
           3       0.29      0.34      0.31       320
           4       0.33      0.19      0.24       391
           5       0.29      0.01      0.02       235

    accuracy                           0.34      2360
   macro avg       0.25      0.23      0.19      2360
weighted avg       0.27      0.34      0.25      2360



In [10]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

print("XGB Validation Accuracy:", accuracy_score(y_val, xgb.predict(X_val)))
print("XGBoost Test Accuracy:", accuracy_score(y_test, xgb.predict(X_test)))
print(classification_report(y_test, xgb.predict(X_test)))


XGB Validation Accuracy: 0.3463983050847458
XGBoost Test Accuracy: 0.3580508474576271
              precision    recall  f1-score   support

           0       0.43      0.68      0.53       748
           1       0.21      0.17      0.19       321
           2       0.20      0.10      0.14       345
           3       0.27      0.29      0.28       320
           4       0.51      0.29      0.37       391
           5       0.24      0.16      0.19       235

    accuracy                           0.36      2360
   macro avg       0.31      0.28      0.28      2360
weighted avg       0.34      0.36      0.33      2360



#### HyperParameter Tuning + K-fold

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
import numpy as np

# Define parameter search space
param_dist = {
    'C': np.logspace(-3, 2, 10),
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l2']
}

# Step 1: Randomized Search for best hyperparameters
search_lr = RandomizedSearchCV(
    LogisticRegression(max_iter=1000),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42
)

search_lr.fit(X, y)
print("Best Logistic Regression Params:", search_lr.best_params_)

# Step 2: Build final model with best params
best_lr = LogisticRegression(**search_lr.best_params_, max_iter=1000)

# Step 3: Perform 5-Fold CV using best params
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(best_lr, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Logistic Regression 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best Logistic Regression Params: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5994842503189409}
Logistic Regression 5-Fold Accuracy: 0.3762 (+/- 0.0104)


In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# TF-IDF features only (recomputed to ensure clean shape)
tfidf_nb = TfidfVectorizer(max_features=5000)
X_nb = tfidf_nb.fit_transform(df['statement'])

# Parameter grid
param_dist = {
    'alpha': np.linspace(0.01, 1.5, 20)
}

# Hyperparameter tuning
search_nb = RandomizedSearchCV(MultinomialNB(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42)
search_nb.fit(X_nb, y)
print("Best NB Params:", search_nb.best_params_)

# Final model
best_nb = MultinomialNB(**search_nb.best_params_)

# 5-Fold CV
scores = cross_val_score(best_nb, X_nb, y, cv=cv, scoring='accuracy')
print(f"Naive Bayes 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best NB Params: {'alpha': 0.6373684210526316}
Naive Bayes 5-Fold Accuracy: 0.3525 (+/- 0.0089)


In [13]:
from sklearn.svm import SVC

param_dist = {
    'C': np.logspace(-3, 2, 10),
    'kernel': ['linear', 'rbf']
}

search_svm = RandomizedSearchCV(SVC(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_svm.fit(X, y)
print("Best SVM Params:", search_svm.best_params_)

best_svm = SVC(**search_svm.best_params_)
scores = cross_val_score(best_svm, X, y, cv=cv, scoring='accuracy')
print(f"SVM 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best SVM Params: {'kernel': 'linear', 'C': 0.1668100537200059}
SVM 5-Fold Accuracy: 0.3679 (+/- 0.0080)


In [14]:
from sklearn.neighbors import KNeighborsClassifier

param_dist = {
    'n_neighbors': list(range(3, 15)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

search_knn = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist,
                                 n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_knn.fit(X, y)
print("Best KNN Params:", search_knn.best_params_)

best_knn = KNeighborsClassifier(**search_knn.best_params_)
scores = cross_val_score(best_knn, X, y, cv=cv, scoring='accuracy')
print(f"KNN 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best KNN Params: {'weights': 'uniform', 'n_neighbors': 11, 'metric': 'manhattan'}
KNN 5-Fold Accuracy: 0.3267 (+/- 0.0052)


In [15]:
from sklearn.tree import DecisionTreeClassifier

param_dist = {
    'max_depth': list(range(3, 20)),
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

search_dt = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_dt.fit(X, y)
print("Best DT Params:", search_dt.best_params_)

best_dt = DecisionTreeClassifier(**search_dt.best_params_)
scores = cross_val_score(best_dt, X, y, cv=cv, scoring='accuracy')
print(f"Decision Tree 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best DT Params: {'min_samples_split': 2, 'max_depth': 13, 'criterion': 'gini'}
Decision Tree 5-Fold Accuracy: 0.3397 (+/- 0.0049)


In [16]:
from sklearn.ensemble import RandomForestClassifier

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

search_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_rf.fit(X, y)
print("Best RF Params:", search_rf.best_params_)

best_rf = RandomForestClassifier(**search_rf.best_params_)
scores = cross_val_score(best_rf, X, y, cv=cv, scoring='accuracy')
print(f"Random Forest 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best RF Params: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}
Random Forest 5-Fold Accuracy: 0.3599 (+/- 0.0062)


In [17]:
from sklearn.ensemble import AdaBoostClassifier

param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': np.linspace(0.01, 1.0, 10)
}

search_ab = RandomizedSearchCV(AdaBoostClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_ab.fit(X, y)
print("Best AdaBoost Params:", search_ab.best_params_)

best_ab = AdaBoostClassifier(**search_ab.best_params_)
scores = cross_val_score(best_ab, X, y, cv=cv, scoring='accuracy')
print(f"AdaBoost 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best AdaBoost Params: {'n_estimators': 200, 'learning_rate': 0.78}
AdaBoost 5-Fold Accuracy: 0.3445 (+/- 0.0099)


In [18]:
from xgboost import XGBClassifier

param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 1.0]
}

search_xgb = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                                param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_xgb.fit(X, y)
print("Best XGB Params:", search_xgb.best_params_)

best_xgb = XGBClassifier(**search_xgb.best_params_, use_label_encoder=False, eval_metric='logloss')
scores = cross_val_score(best_xgb, X, y, cv=cv, scoring='accuracy')
print(f"XGBoost 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best XGB Params: {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}
XGBoost 5-Fold Accuracy: 0.3545 (+/- 0.0096)
