In [50]:
import pandas as pd
df=pd.read_csv("politifact_factcheck_data.csv")

In [51]:
df.head()


Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
0,true,Barack Obama,John McCain opposed bankruptcy protections for...,6/11/2008,speech,Adriel Bettelheim,6/16/2008,https://www.politifact.com/factchecks/2008/jun...
1,false,Matt Gaetz,"""Bennie Thompson actively cheer-led riots in t...",6/7/2022,television,Yacob Reyes,6/13/2022,https://www.politifact.com/factchecks/2022/jun...
2,mostly-true,Kelly Ayotte,"Says Maggie Hassan was ""out of state on 30 day...",5/18/2016,news,Clay Wirestone,5/27/2016,https://www.politifact.com/factchecks/2016/may...
3,false,Bloggers,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",2/1/2021,blog,Madison Czopek,2/5/2021,https://www.politifact.com/factchecks/2021/feb...
4,half-true,Bobby Jindal,"""I'm the only (Republican) candidate that has ...",8/30/2015,television,Linda Qiu,8/30/2015,https://www.politifact.com/factchecks/2015/aug...


In [52]:
df.shape

(21152, 8)

In [53]:
dfq=df['statement_source']
dfq

0              speech
1          television
2                news
3                blog
4          television
             ...     
21147          speech
21148    social_media
21149            news
21150            blog
21151    social_media
Name: statement_source, Length: 21152, dtype: object

In [54]:
dfq.unique()

array(['speech', 'television', 'news', 'blog', 'other', 'social_media',
       'advertisement', 'campaign', 'meeting', 'radio', 'email',
       'testimony', 'statement'], dtype=object)

In [55]:
count=0
for i in range(21151):
    if dfq[i]=="news" or dfq[i]=="blog" or dfq[i]=="social_media":
        count=count+1
print(count)


11795


In [56]:
df.shape

(21152, 8)

#### Imports

In [57]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

In [58]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

#### Data Preprocessing

In [59]:
# Load dataset
df = pd.read_csv("politifact_factcheck_data.csv")

# Step 1: Filter relevant sources
allowed_sources = ['news', 'blog', 'social_media']
df = df[df['statement_source'].str.lower().isin(allowed_sources)].reset_index(drop=True)

# Step 2: Sentiment using VADER
analyzer = SentimentIntensityAnalyzer()
df['sentiment'] = df['statement'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Step 3: POS ratios
def get_pos_features(text):
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    total = len(tokens)
    return pd.Series({
        'noun_ratio': sum(1 for _, t in tags if t.startswith('NN')) / total if total else 0,
        'verb_ratio': sum(1 for _, t in tags if t.startswith('VB')) / total if total else 0,
        'pronoun_ratio': sum(1 for _, t in tags if t in ['PRP', 'PRP$', 'WP', 'WP$']) / total if total else 0
    })

pos_df = df['statement'].apply(get_pos_features)
df = pd.concat([df, pos_df], axis=1)

# Step 4: Label encoding
df['label'] = df['verdict'].apply(lambda x: 'real' if 'true' in x.lower() else 'fake')
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])

# Step 5: TF-IDF on statements
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['statement'])

# Step 6: Scale numeric features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(df[['sentiment', 'noun_ratio', 'verb_ratio', 'pronoun_ratio']])

# Step 7: Encode categorical metadata
df['statement_source'] = df['statement_source'].fillna("unknown")
df['statement_originator'] = df['statement_originator'].fillna("unknown")

# One-hot encode
df_encoded = pd.get_dummies(df[['statement_source', 'statement_originator']], drop_first=True)

# Ensure same scale
X_categorical = csr_matrix(df_encoded.values)

# Step 8: Combine all features
X_numeric_sparse = csr_matrix(X_numeric)
X = hstack([X_text, X_numeric_sparse, X_categorical])
y = df['label_enc']

# Step 9: Train-validation-test split (as per paper)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (8492, 7865), Val: (944, 7865), Test: (2360, 7865)


#### Model Training

In [60]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Validation performance
y_val_pred = lr.predict(X_val)
print("LR Validation Accuracy:", accuracy_score(y_val, y_val_pred))

# Final test performance
y_test_pred = lr.predict(X_test)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

LR Validation Accuracy: 0.7129237288135594
Logistic Regression Test Accuracy: 0.7444915254237288
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1484
           1       0.66      0.64      0.65       876

    accuracy                           0.74      2360
   macro avg       0.73      0.72      0.72      2360
weighted avg       0.74      0.74      0.74      2360



In [61]:
# Re-do TF-IDF only for NB (exclude scaled metadata)
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['statement'])

# Re-split only the TF-IDF features
X_tfidf_temp, X_tfidf_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
X_tfidf_train, X_tfidf_val, y_train, y_val = train_test_split(X_tfidf_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp)

# Now train Naive Bayes safely
nb = MultinomialNB()
nb.fit(X_tfidf_train, y_train)

print("NB Validation Accuracy:", accuracy_score(y_val, nb.predict(X_tfidf_val)))
print("Naive Bayes Test Accuracy:", accuracy_score(y_test, nb.predict(X_tfidf_test)))
print(classification_report(y_test, nb.predict(X_tfidf_test)))

NB Validation Accuracy: 0.698093220338983
Naive Bayes Test Accuracy: 0.7203389830508474
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      1484
           1       0.66      0.50      0.57       876

    accuracy                           0.72      2360
   macro avg       0.70      0.68      0.68      2360
weighted avg       0.71      0.72      0.71      2360



In [62]:
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train, y_train)

print("SVM Validation Accuracy:", accuracy_score(y_val, svm.predict(X_val)))
print("SVM Test Accuracy:", accuracy_score(y_test, svm.predict(X_test)))
print(classification_report(y_test, svm.predict(X_test)))


SVM Validation Accuracy: 0.701271186440678
SVM Test Accuracy: 0.7288135593220338
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1484
           1       0.64      0.62      0.63       876

    accuracy                           0.73      2360
   macro avg       0.71      0.71      0.71      2360
weighted avg       0.73      0.73      0.73      2360



In [63]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

print("KNN Validation Accuracy:", accuracy_score(y_val, knn.predict(X_val)))
print("KNN Test Accuracy:", accuracy_score(y_test, knn.predict(X_test)))
print(classification_report(y_test, knn.predict(X_test)))


KNN Validation Accuracy: 0.6896186440677966
KNN Test Accuracy: 0.6953389830508474
              precision    recall  f1-score   support

           0       0.73      0.81      0.77      1484
           1       0.61      0.50      0.55       876

    accuracy                           0.70      2360
   macro avg       0.67      0.65      0.66      2360
weighted avg       0.69      0.70      0.69      2360



In [64]:
dt = DecisionTreeClassifier(max_depth=10)
dt.fit(X_train, y_train)

print("DT Validation Accuracy:", accuracy_score(y_val, dt.predict(X_val)))
print("Decision Tree Test Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print(classification_report(y_test, dt.predict(X_test)))


DT Validation Accuracy: 0.7002118644067796
Decision Tree Test Accuracy: 0.6826271186440678
              precision    recall  f1-score   support

           0       0.76      0.73      0.74      1484
           1       0.57      0.60      0.58       876

    accuracy                           0.68      2360
   macro avg       0.66      0.67      0.66      2360
weighted avg       0.69      0.68      0.68      2360



In [65]:
ab = AdaBoostClassifier(n_estimators=100, random_state=42)
ab.fit(X_train, y_train)

print("AB Validation Accuracy:", accuracy_score(y_val, ab.predict(X_val)))
print("AdaBoost Test Accuracy:", accuracy_score(y_test, ab.predict(X_test)))
print(classification_report(y_test, ab.predict(X_test)))


AB Validation Accuracy: 0.7108050847457628
AdaBoost Test Accuracy: 0.6923728813559322
              precision    recall  f1-score   support

           0       0.74      0.79      0.76      1484
           1       0.60      0.52      0.56       876

    accuracy                           0.69      2360
   macro avg       0.67      0.66      0.66      2360
weighted avg       0.69      0.69      0.69      2360



In [66]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

print("XGB Validation Accuracy:", accuracy_score(y_val, xgb.predict(X_val)))
print("XGBoost Test Accuracy:", accuracy_score(y_test, xgb.predict(X_test)))
print(classification_report(y_test, xgb.predict(X_test)))


XGB Validation Accuracy: 0.7182203389830508
XGBoost Test Accuracy: 0.7237288135593221
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      1484
           1       0.64      0.60      0.62       876

    accuracy                           0.72      2360
   macro avg       0.70      0.70      0.70      2360
weighted avg       0.72      0.72      0.72      2360



#### Hyper Parameter Tuning

In [67]:
param_dist = {
    'C': np.logspace(-3, 2, 10),
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l2']
}

search_lr = RandomizedSearchCV(LogisticRegression(max_iter=1000), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_lr.fit(X_train, y_train)
print("Best LR Params:", search_lr.best_params_)

y_pred = search_lr.predict(X_test)
print("Tuned Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Best LR Params: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5994842503189409}
Tuned Logistic Regression Accuracy: 0.7470338983050847


In [68]:
# Use only X_tfidf as done in previous block
param_dist = {
    'alpha': np.linspace(0.1, 1.5, 10)
}

search_nb = RandomizedSearchCV(MultinomialNB(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_nb.fit(X_tfidf_train, y_train)
print("Best NB Params:", search_nb.best_params_)

y_pred = search_nb.predict(X_tfidf_test)
print("Tuned Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))


Best NB Params: {'alpha': 0.25555555555555554}
Tuned Naive Bayes Accuracy: 0.7161016949152542


In [69]:
param_dist = {
    'C': np.logspace(-3, 2, 10),
    'kernel': ['linear', 'rbf']
}

search_svm = RandomizedSearchCV(SVC(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_svm.fit(X_train, y_train)
print("Best SVM Params:", search_svm.best_params_)

y_pred = search_svm.predict(X_test)
print("Tuned SVM Accuracy:", accuracy_score(y_test, y_pred))


Best SVM Params: {'kernel': 'linear', 'C': 0.1668100537200059}
Tuned SVM Accuracy: 0.7436440677966102


In [70]:
param_dist = {
    'n_neighbors': list(range(3, 21)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

search_knn = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist,
                                 n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_knn.fit(X_train, y_train)
print("Best KNN Params:", search_knn.best_params_)

y_pred = search_knn.predict(X_test)
print("Tuned KNN Accuracy:", accuracy_score(y_test, y_pred))


Best KNN Params: {'weights': 'uniform', 'n_neighbors': 5, 'metric': 'euclidean'}
Tuned KNN Accuracy: 0.6953389830508474


In [71]:
param_dist = {
    'max_depth': list(range(3, 20)),
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

search_dt = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_dt.fit(X_train, y_train)
print("Best DT Params:", search_dt.best_params_)

y_pred = search_dt.predict(X_test)
print("Tuned Decision Tree Accuracy:", accuracy_score(y_test, y_pred))


Best DT Params: {'min_samples_split': 2, 'max_depth': 13, 'criterion': 'gini'}
Tuned Decision Tree Accuracy: 0.6724576271186441


In [72]:
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

search_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_rf.fit(X_train, y_train)
print("Best RF Params:", search_rf.best_params_)

y_pred = search_rf.predict(X_test)
print("Tuned Random Forest Accuracy:", accuracy_score(y_test, y_pred))


Best RF Params: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}
Tuned Random Forest Accuracy: 0.7203389830508474


In [73]:
param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': np.linspace(0.01, 1.0, 10)
}

search_ab = RandomizedSearchCV(AdaBoostClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_ab.fit(X_train, y_train)
print("Best AdaBoost Params:", search_ab.best_params_)

y_pred = search_ab.predict(X_test)
print("Tuned AdaBoost Accuracy:", accuracy_score(y_test, y_pred))


Best AdaBoost Params: {'n_estimators': 200, 'learning_rate': 0.56}
Tuned AdaBoost Accuracy: 0.6953389830508474


In [74]:
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 1.0]
}

search_xgb = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                                param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_xgb.fit(X_train, y_train)
print("Best XGB Params:", search_xgb.best_params_)

y_pred = search_xgb.predict(X_test)
print("Tuned XGBoost Accuracy:", accuracy_score(y_test, y_pred))


Best XGB Params: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.3}
Tuned XGBoost Accuracy: 0.7156779661016949


#### K-Fold Cross Validation

In [75]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [76]:
model = LogisticRegression(max_iter=1000)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Logistic Regression Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

Logistic Regression Accuracy: 0.7422 (+/- 0.0062)


In [77]:
# Naive Bayes requires only non-negative input
X_nb = tfidf.fit_transform(df['statement'])

model = MultinomialNB()
scores = cross_val_score(model, X_nb, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Naive Bayes Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Naive Bayes Accuracy: 0.7158 (+/- 0.0084)


In [78]:
model = SVC(kernel='linear')
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"SVM Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


SVM Accuracy: 0.7313 (+/- 0.0038)


In [79]:
model = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"KNN Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


KNN Accuracy: 0.6895 (+/- 0.0060)


In [80]:
model = DecisionTreeClassifier(max_depth=10)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Decision Tree Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Decision Tree Accuracy: 0.6872 (+/- 0.0101)


In [81]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Random Forest Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Random Forest Accuracy: 0.7152 (+/- 0.0053)


In [82]:
model = AdaBoostClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"AdaBoost Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


AdaBoost Accuracy: 0.6896 (+/- 0.0049)


In [83]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"XGBoost Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


XGBoost Accuracy: 0.7167 (+/- 0.0104)


#### HyperParameter Tuning + K-fold

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
import numpy as np

# Define parameter search space
param_dist = {
    'C': np.logspace(-3, 2, 10),
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l2']
}

# Step 1: Randomized Search for best hyperparameters
search_lr = RandomizedSearchCV(
    LogisticRegression(max_iter=1000),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42
)

search_lr.fit(X, y)
print("Best Logistic Regression Params:", search_lr.best_params_)

# Step 2: Build final model with best params
best_lr = LogisticRegression(**search_lr.best_params_, max_iter=1000)

# Step 3: Perform 5-Fold CV using best params
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(best_lr, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"Logistic Regression 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best Logistic Regression Params: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5994842503189409}
Logistic Regression 5-Fold Accuracy: 0.7429 (+/- 0.0061)


In [85]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# TF-IDF features only (recomputed to ensure clean shape)
tfidf_nb = TfidfVectorizer(max_features=5000)
X_nb = tfidf_nb.fit_transform(df['statement'])

# Parameter grid
param_dist = {
    'alpha': np.linspace(0.01, 1.5, 20)
}

# Hyperparameter tuning
search_nb = RandomizedSearchCV(MultinomialNB(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42)
search_nb.fit(X_nb, y)
print("Best NB Params:", search_nb.best_params_)

# Final model
best_nb = MultinomialNB(**search_nb.best_params_)

# 5-Fold CV
scores = cross_val_score(best_nb, X_nb, y, cv=cv, scoring='accuracy')
print(f"Naive Bayes 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best NB Params: {'alpha': 0.40210526315789474}
Naive Bayes 5-Fold Accuracy: 0.7143 (+/- 0.0080)


In [86]:
from sklearn.svm import SVC

param_dist = {
    'C': np.logspace(-3, 2, 10),
    'kernel': ['linear', 'rbf']
}

search_svm = RandomizedSearchCV(SVC(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_svm.fit(X, y)
print("Best SVM Params:", search_svm.best_params_)

best_svm = SVC(**search_svm.best_params_)
scores = cross_val_score(best_svm, X, y, cv=cv, scoring='accuracy')
print(f"SVM 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best SVM Params: {'kernel': 'linear', 'C': 0.1668100537200059}
SVM 5-Fold Accuracy: 0.7459 (+/- 0.0067)


In [87]:
from sklearn.neighbors import KNeighborsClassifier

param_dist = {
    'n_neighbors': list(range(3, 15)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

search_knn = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist,
                                 n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_knn.fit(X, y)
print("Best KNN Params:", search_knn.best_params_)

best_knn = KNeighborsClassifier(**search_knn.best_params_)
scores = cross_val_score(best_knn, X, y, cv=cv, scoring='accuracy')
print(f"KNN 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best KNN Params: {'weights': 'distance', 'n_neighbors': 12, 'metric': 'euclidean'}
KNN 5-Fold Accuracy: 0.6896 (+/- 0.0079)


In [88]:
from sklearn.tree import DecisionTreeClassifier

param_dist = {
    'max_depth': list(range(3, 20)),
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

search_dt = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_dt.fit(X, y)
print("Best DT Params:", search_dt.best_params_)

best_dt = DecisionTreeClassifier(**search_dt.best_params_)
scores = cross_val_score(best_dt, X, y, cv=cv, scoring='accuracy')
print(f"Decision Tree 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best DT Params: {'min_samples_split': 5, 'max_depth': 6, 'criterion': 'gini'}
Decision Tree 5-Fold Accuracy: 0.6890 (+/- 0.0095)


In [89]:
from sklearn.ensemble import RandomForestClassifier

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

search_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_rf.fit(X, y)
print("Best RF Params:", search_rf.best_params_)

best_rf = RandomForestClassifier(**search_rf.best_params_)
scores = cross_val_score(best_rf, X, y, cv=cv, scoring='accuracy')
print(f"Random Forest 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best RF Params: {'n_estimators': 300, 'min_samples_split': 10, 'max_depth': None}
Random Forest 5-Fold Accuracy: 0.7184 (+/- 0.0035)


In [90]:
from sklearn.ensemble import AdaBoostClassifier

param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': np.linspace(0.01, 1.0, 10)
}

search_ab = RandomizedSearchCV(AdaBoostClassifier(), param_distributions=param_dist,
                               n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_ab.fit(X, y)
print("Best AdaBoost Params:", search_ab.best_params_)

best_ab = AdaBoostClassifier(**search_ab.best_params_)
scores = cross_val_score(best_ab, X, y, cv=cv, scoring='accuracy')
print(f"AdaBoost 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best AdaBoost Params: {'n_estimators': 200, 'learning_rate': 0.78}
AdaBoost 5-Fold Accuracy: 0.6909 (+/- 0.0052)


In [91]:
from xgboost import XGBClassifier

param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 1.0]
}

search_xgb = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                                param_distributions=param_dist,
                                n_iter=10, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
search_xgb.fit(X, y)
print("Best XGB Params:", search_xgb.best_params_)

best_xgb = XGBClassifier(**search_xgb.best_params_, use_label_encoder=False, eval_metric='logloss')
scores = cross_val_score(best_xgb, X, y, cv=cv, scoring='accuracy')
print(f"XGBoost 5-Fold Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")


Best XGB Params: {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}
XGBoost 5-Fold Accuracy: 0.7197 (+/- 0.0073)
