In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_fscore_support
from xgboost import XGBClassifier

# --- 1. Data loading ---
print("--- 1. Data Loading ---")

# Read the train / validation / test CSVs in that order and bind them in one step.
train_df_raw, val_df_raw, test_df_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/liar/train_filtered.csv',
        '../data/liar/valid_filtered.csv',
        '../data/liar/test_filtered.csv',
    )
)

# Alternative dataset; uncomment to load it instead of the files above:
# train_df_raw = pd.read_csv('../data/fakenewsnet_dataset/combined_train.csv')
# val_df_raw = pd.read_csv('../data/fakenewsnet_dataset/combined_val.csv')
# test_df_raw = pd.read_csv('../data/fakenewsnet_dataset/combined_test.csv')

# --- 2. Data cleaning and X / y synchronization ---
print("\n--- 2. Data Cleaning and X, y Synchronization ---")

def clean_and_prepare_data(df, text_column='statement', label_column='binary_label',
                           drop_nan_labels=False):
    """Keep only rows with usable text and return the aligned text and label columns.

    A row is discarded when its text value is NaN, is not a ``str``, or is empty /
    whitespace-only. The frame passed in is never modified: every filtering step
    produces a new frame, so callers do not need to pass a defensive copy.

    Args:
        df: DataFrame containing at least ``text_column`` and ``label_column``.
        text_column: Name of the column holding the raw text.
        label_column: Name of the column holding the target labels.
        drop_nan_labels: When True, rows whose label is NaN are removed as well.
            When False (default) such rows are only reported and kept.

    Returns:
        Tuple ``(X, y)`` of pandas Series taken from the same filtered frame, so
        they share one index (the surviving original index, not reset).
    """
    print(f"Initial shape: {df.shape}")

    # Drop rows with missing text. Non-inplace on purpose: the caller's frame stays intact.
    initial_nan_count = df[text_column].isna().sum()
    if initial_nan_count > 0:
        print(f"NaNs found in '{text_column}': {initial_nan_count}")
        df = df.dropna(subset=[text_column])
        print(f"Shape after dropping NaNs from '{text_column}': {df.shape}")

    # Drop rows whose text is not a string or is empty / whitespace-only.
    initial_len = len(df)
    has_text = df[text_column].apply(lambda x: isinstance(x, str) and x.strip() != '')
    # astype(bool) keeps the mask a real boolean indexer even when the frame is empty.
    df = df.loc[has_text.astype(bool)].copy()
    empty_removed_count = initial_len - len(df)
    if empty_removed_count > 0:
        print(f"Empty/whitespace-only strings removed from '{text_column}': {empty_removed_count}")
        print(f"Shape after removing empty strings: {df.shape}")

    # Report missing labels; remove them only when explicitly requested.
    nan_labels = df[label_column].isna().sum()
    if nan_labels > 0:
        print(f"Warning: NaNs found in label column '{label_column}': {nan_labels}. Consider how to handle these.")
        if drop_nan_labels:
            df = df.dropna(subset=[label_column])
            print(f"Shape after dropping NaNs from '{label_column}': {df.shape}")

    X = df[text_column]
    y = df[label_column]
    print(f"Final shapes: X: {X.shape}, y: {y.shape}")
    return X, y

# Clean each split. A copy of every raw frame is handed over, so the cleaning
# function never works on the frames loaded from disk.
X_train, y_train = clean_and_prepare_data(
    train_df_raw.copy(), text_column='statement', label_column='binary_label'
)
X_val, y_val = clean_and_prepare_data(
    val_df_raw.copy(), text_column='statement', label_column='binary_label'
)
X_test, y_test = clean_and_prepare_data(
    test_df_raw.copy(), text_column='statement', label_column='binary_label'
)

# Sanity check: report remaining NaNs in every text and label series.
print("\nNaN check after cleaning and splitting:")
for caption, series in (
    ("NaN in X_train:", X_train),
    ("NaN in X_val:", X_val),
    ("NaN in X_test:", X_test),
    ("NaN in y_train:", y_train),
    ("NaN in y_val:", y_val),
    ("NaN in y_test:", y_test),
):
    print(caption, series.isna().sum())


# Renumber every series from 0 so X and y of each split share a fresh positional index.
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
X_val, y_val = X_val.reset_index(drop=True), y_val.reset_index(drop=True)
X_test, y_test = X_test.reset_index(drop=True), y_test.reset_index(drop=True)


In [None]:
# TF-IDF over unigrams and bigrams with max_features=5000.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# Fit on the training text only; validation and test are transformed with the fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(texts) for texts in (X_val, X_test))

In [None]:
# Build both classifiers with a fixed random_state.
model_rf = RandomForestClassifier(n_estimators=50, random_state=42)
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

print("Shape of X_train_vec:", X_train_vec.shape)
print("Shape of y_train:", y_train.shape)

# X_train_vec and y_train come from the same cleaned frame, so their row counts
# must already match. Truncating y_train to fit would silently pair labels with
# the wrong rows, so fail loudly instead of reassigning y_train.
if X_train_vec.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"X_train_vec has {X_train_vec.shape[0]} rows but y_train has "
        f"{y_train.shape[0]} labels; features and labels are out of sync."
    )

In [None]:
# Train both classifiers on the TF-IDF training matrix and its labels.
model_rf.fit(X_train_vec, y_train)
# Kept as the final bare expression of the cell, so its return value is what the notebook displays.
model_xgb.fit(X_train_vec, y_train)

In [None]:
def evaluate_model(model, X, y, name='Model'):
    """Print a classification report and show diagnostic plots for a fitted classifier.

    Column 1 of ``model.predict_proba(X)`` is used as the score and thresholded
    at 0.5 to obtain hard predictions. The function then prints the
    classification report, shows a confusion-matrix heatmap and a ROC curve, and
    prints how many scores fall inside the closed interval [0.4, 0.6].

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature matrix passed to ``predict_proba``.
        y: True labels aligned with the rows of ``X`` (presumably binary 0/1 — confirm).
        name: Label used in the printed headers and plot titles.

    Returns:
        None.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    hard_labels = (positive_scores >= 0.5).astype(int)

    print(f"=== {name} REPORT ===")
    print(classification_report(y, hard_labels))

    # Confusion matrix
    sns.heatmap(confusion_matrix(y, hard_labels), annot=True, fmt='d', cmap='Blues')
    plt.title(f'{name} Confusion Matrix')
    plt.show()

    # ROC curve, with the diagonal drawn as a dashed reference line
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    plt.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC = {area_under_curve:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{name} ROC Curve')
    plt.legend()
    plt.show()

    # Count of "uncertain" predictions: scores close to the 0.5 threshold
    in_grey_zone = (positive_scores >= 0.4) & (positive_scores <= 0.6)
    print(f"{name} uncertain cases (prob 0.4–0.6): {np.sum(in_grey_zone)} / {len(y)}")


In [None]:
# y_test and X_test_vec derive from the same cleaned frame, so their lengths must
# already agree. Check that explicitly instead of truncating (and reassigning)
# y_test, which would hide a misalignment and pair labels with the wrong rows.
if y_test.shape[0] != X_test_vec.shape[0]:
    raise ValueError(
        f"X_test_vec has {X_test_vec.shape[0]} rows but y_test has "
        f"{y_test.shape[0]} labels; features and labels are out of sync."
    )
evaluate_model(model_rf, X_test_vec, y_test, name='Random Forest')
evaluate_model(model_xgb, X_test_vec, y_test, name='XGBoost')

In [None]:
import pandas as pd

# Build one summary row per (model, dataset split) pair and save the table as CSV.
RESULTS_CSV = '../results/model_results.csv'

all_results = []

for model, name in [(model_rf, 'Random Forest'), (model_xgb, 'XGBoost')]:
    for X, y, ds_name in [(X_train_vec, y_train, 'Train'),
                          (X_val_vec, y_val, 'Validation'),
                          (X_test_vec, y_test, 'Test')]:

        # Score = column 1 of predict_proba; hard prediction = score >= 0.5.
        y_probs = model.predict_proba(X)[:, 1]
        y_pred = (y_probs >= 0.5).astype(int)

        # Labels must line up one-to-one with predictions. Truncating y to fit
        # would silently compute metrics against the wrong rows.
        if len(y) != y_pred.shape[0]:
            raise ValueError(
                f"{name} / {ds_name}: got {y_pred.shape[0]} predictions for {len(y)} labels."
            )

        precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average='binary')
        acc = np.mean(y_pred == y)
        fpr, tpr, _ = roc_curve(y, y_probs)
        roc_auc = auc(fpr, tpr)
        # Fraction of scores inside [0.4, 0.6], i.e. close to the decision threshold.
        uncertain = np.sum((y_probs >= 0.4) & (y_probs <= 0.6)) / len(y)

        all_results.append({
            'Model': name,
            'Dataset': ds_name,
            'Accuracy': acc,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'AUC': roc_auc,
            'Uncertain (%)': uncertain * 100
        })

results_df = pd.DataFrame(all_results)
print(results_df)
# to_csv does not create missing folders, so make sure the target directory exists first.
Path(RESULTS_CSV).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(RESULTS_CSV, index=False)
print("Saved results to ../results/model_results.csv")


In [None]:

# Evaluation helper
def evaluate_model(model, X, y, dataset_name, model_name):
    """Evaluate a fitted classifier on one dataset split and return its report.

    Column 1 of ``model.predict_proba(X)`` is used as the score and thresholded
    at 0.5 to obtain hard predictions. The function prints the classification
    report, shows a confusion-matrix heatmap and a ROC curve, and prints how many
    scores fall inside the closed interval [0.4, 0.6].

    NOTE: this redefines the ``evaluate_model(model, X, y, name=...)`` helper from
    an earlier cell with a different signature. After this cell has run, re-running
    the earlier call sites that pass ``name=`` will raise a TypeError.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Feature matrix passed to ``predict_proba``.
        y: True labels, one per row of ``X`` (presumably binary 0/1 — confirm).
        dataset_name: Split label used in headers and plot titles (e.g. 'Train').
        model_name: Model label used in headers and plot titles.

    Returns:
        The ``classification_report(..., output_dict=True)`` result for this split.

    Raises:
        ValueError: If the number of labels differs from the number of predictions.
    """
    y_probs = model.predict_proba(X)[:, 1]
    y_pred = (y_probs >= 0.5).astype(int)

    # Labels must line up one-to-one with predictions. Silently truncating y would
    # score the model against the wrong rows, so reject the mismatch instead.
    if len(y) != y_pred.shape[0]:
        raise ValueError(
            f"{model_name} on {dataset_name}: got {y_pred.shape[0]} predictions "
            f"for {len(y)} labels; features and labels are out of sync."
        )

    print(f"=== {model_name} on {dataset_name} ===")
    report = classification_report(y, y_pred, output_dict=True)
    print(classification_report(y, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} {dataset_name} Confusion Matrix')
    plt.show()

    # ROC curve, with the diagonal drawn as a dashed reference line
    fpr, tpr, _ = roc_curve(y, y_probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} {dataset_name} ROC Curve')
    plt.legend()
    plt.show()

    # "Uncertain" predictions: scores close to the 0.5 threshold
    uncertain = (y_probs >= 0.4) & (y_probs <= 0.6)
    print(f"{model_name} {dataset_name} uncertain cases (prob 0.4–0.6): {np.sum(uncertain)} / {len(y)}")

    return report

# Compare train / validation / test for each model
for model, name in [(model_rf, 'Random Forest'), (model_xgb, 'XGBoost')]:
    print(f"\n\n==== {name} ANALYSIS ====")
    # Same evaluation on every split, in the order train -> validation -> test.
    for split_X, split_y, split_name in (
        (X_train_vec, y_train, 'Train'),
        (X_val_vec, y_val, 'Validation'),
        (X_test_vec, y_test, 'Test'),
    ):
        evaluate_model(model, split_X, split_y, split_name, name)

from sklearn.metrics import precision_recall_curve

def plot_pr_curve(y_true, y_probs, model_name, ds_name):
    """Draw and show the precision-recall curve for one model on one dataset.

    Args:
        y_true: True labels.
        y_probs: Scores for the same samples, passed to ``precision_recall_curve``.
        model_name: Model label used in the plot title.
        ds_name: Dataset label used in the plot title.

    Returns:
        None.
    """
    curve = precision_recall_curve(y_true, y_probs)
    precision_values, recall_values = curve[0], curve[1]

    # Recall on the x-axis, precision on the y-axis.
    plt.plot(recall_values, precision_values, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_name} {ds_name} Precision-Recall Curve')
    plt.show()

# Precision-recall curve on the test split, one figure per model.
pr_splits = [(X_test_vec, y_test, 'Test')]
for model, name in ((model_rf, 'Random Forest'), (model_xgb, 'XGBoost')):
    for X, y, ds_name in pr_splits:
        # Column 1 of predict_proba is the score fed to the curve.
        y_probs = model.predict_proba(X)[:, 1]
        plot_pr_curve(y, y_probs, name, ds_name)


from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

# Wrap the already-fitted random forest (cv='prefit') in an isotonic calibrator
# and fit the calibration on the validation split.
# NOTE(review): newer scikit-learn releases appear to deprecate cv='prefit' —
# confirm against the installed version.
calibrated_rf = CalibratedClassifierCV(model_rf, cv='prefit', method='isotonic').fit(X_val_vec, y_val)

# Score the calibrated probabilities (column 1) on the test split with the Brier score.
y_probs_calib = calibrated_rf.predict_proba(X_test_vec)[:, 1]
brier = brier_score_loss(y_test, y_probs_calib)
print(f'Brier score (calibrated Random Forest on Test): {brier:.4f}')


In [None]:

# Feature importance
def plot_feature_importance(model, vectorizer, model_name, top_n=20):
    """Show a horizontal bar chart of the ``top_n`` most important features.

    Importances come from ``model.feature_importances_`` when that attribute
    exists. Otherwise they are read from ``model.get_booster().get_score`` with
    ``importance_type='weight'`` and normalised to sum to 1.

    Args:
        model: Fitted model exposing ``feature_importances_`` or ``get_booster()``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        model_name: Label used in the plot title.
        top_n: Maximum number of features to display.

    Returns:
        None.
    """
    feature_names = np.array(vectorizer.get_feature_names_out())
    if hasattr(model, 'feature_importances_'):
        importances = np.asarray(model.feature_importances_)
    else:
        booster_scores = model.get_booster().get_score(importance_type='weight')
        # A booster trained without feature names reports positional keys
        # ('f0', 'f17', ...). Map those back to vocabulary terms so the chart
        # shows real n-grams; any other key is kept as-is.
        resolved_names = []
        for key in booster_scores:
            position = key[1:]
            if key.startswith('f') and position.isdigit() and int(position) < len(feature_names):
                resolved_names.append(feature_names[int(position)])
            else:
                resolved_names.append(key)
        feature_names = np.array(resolved_names)
        importances = np.array(list(booster_scores.values()), dtype=float)
        # Normalise only when there is something to normalise (avoids 0 / 0).
        total = importances.sum()
        if total > 0:
            importances = importances / total

    # Indices of the largest importances, in descending order.
    sorted_idx = np.argsort(importances)[::-1][:top_n]
    top_features = feature_names[sorted_idx]
    top_importances = importances[sorted_idx]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_importances, y=top_features)
    plt.title(f'{model_name} Top {top_n} Important Features')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()

print("\nFeature Importance:")
# One chart per fitted model, both using the shared TF-IDF vectorizer for feature names.
for fitted_model, display_name in ((model_rf, 'Random Forest'), (model_xgb, 'XGBoost')):
    plot_feature_importance(fitted_model, vectorizer, display_name, top_n=20)