# Fake News Detection: изучение эффективности методов

## 1. Загрузка и анализ начальных данных

### Загрузка модулей

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
import optuna

### Загрузка начальных данных

In [None]:
import os

import yaml

# Location of the project config. The original machine-specific path stays as
# the default so existing setups keep working; set the FAKE_NEWS_CONFIG
# environment variable to run the notebook from another machine or directory.
config_path = os.environ.get(
    'FAKE_NEWS_CONFIG',
    'C:/Users/Zver/Desktop/uni/AIF/hw3/config.yaml',
)

# Explicit UTF-8: without it open() uses the platform locale encoding, which
# can mis-decode non-ASCII values stored in the config.
with open(config_path, 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

In [None]:
# Read both splits from the CSV locations listed under cfg['fake_news'],
# keeping only the columns this notebook works with.
dataset_paths = cfg['fake_news']
train_columns = ['id', 'text', 'label']
test_columns = ['id', 'text']
train = pd.read_csv(dataset_paths['train'])[train_columns]
test = pd.read_csv(dataset_paths['test'])[test_columns]

### Начальный анализ

In [None]:
# Report (rows, columns) of the training frame.
print("Train data shape:", train.shape)

In [None]:
# Report (rows, columns) of the test frame.
print("Test data shape:", test.shape)

## Анализ и подготовка датасета


In [None]:
# Keep only rows whose label, once converted to a string and stripped of
# surrounding whitespace, consists solely of digits; anything else (e.g. NaN
# or non-numeric text) is dropped.
has_integer_label = train['label'].apply(lambda x: str(x).strip().isdigit())
# .copy() detaches the filtered frame from the original one, so the column
# assignments below (and later in the notebook) write to a real DataFrame
# rather than to a slice, which avoids pandas' SettingWithCopyWarning.
train = train[has_integer_label].copy()
train['label'] = train['label'].astype(int)
print(f"Train data shape after filtering: {train.shape}")

In [None]:
import nltk

# Resources used by the preprocessing cell below:
#   punkt / punkt_tab - sentence and word tokenizer data for word_tokenize.
#                       Recent NLTK releases load 'punkt_tab' instead of the
#                       older pickled 'punkt' data, so both are fetched to
#                       work across NLTK versions.
#   stopwords         - English stopword list.
#   wordnet           - dictionary used by WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# === Text Preprocessing ===
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Pipeline (see ``preprocess_text``): strip URLs and punctuation, lowercase,
    tokenize with NLTK, drop English stopwords, lemmatize with WordNet.

    The per-step diagnostic messages are printed once per call, i.e. once per
    document when the instance is used with ``Series.apply``. They are
    therefore disabled by default and only emitted when ``verbose`` is true,
    so a full dataset does not flood the notebook output.
    """

    # Compiled once at class creation instead of per document.
    _URL_RE = re.compile(r"http\S+")
    _NON_WORD_RE = re.compile(r"[^\w\s]")

    def __init__(self, verbose=False):
        """Load the English stopword set and create the WordNet lemmatizer.

        Args:
            verbose: When true, print the per-document diagnostic messages
                from the processing methods. The two initialization messages
                are printed regardless, since they occur only once.
        """
        self.verbose = verbose
        print("Initializing TextPreprocessor...")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        print("TextPreprocessor initialized.")

    def _log(self, message):
        """Print ``message`` only when verbose output is enabled."""
        if self.verbose:
            print(message)

    def clean_text(self, text):
        """Return ``text`` without URLs and punctuation, lowercased.

        A URL is anything from ``http`` up to the next whitespace. After that,
        every character that is neither a word character nor whitespace is
        removed, and the result is lowercased.
        """
        self._log("Cleaning text...")
        text = self._URL_RE.sub("", text)
        text = self._NON_WORD_RE.sub("", text)
        cleaned_text = text.lower()
        self._log(f"Cleaned text sample: {cleaned_text[:100]}...")
        return cleaned_text

    def tokenize(self, text):
        """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
        self._log("Tokenizing text...")
        tokens = word_tokenize(text)
        self._log(f"Number of tokens: {len(tokens)}")
        return tokens

    def remove_stopwords(self, tokens):
        """Return the tokens that are not in ``self.stop_words``, in order."""
        self._log("Removing stopwords...")
        filtered_tokens = [token for token in tokens if token not in self.stop_words]
        self._log(f"Number of tokens after removing stopwords: {len(filtered_tokens)}")
        return filtered_tokens

    def lemmatize(self, tokens):
        """Return the list of tokens after lemmatizing each one."""
        self._log("Lemmatizing tokens...")
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        self._log(f"Lemmatized tokens sample: {lemmatized_tokens[:10]}...")
        return lemmatized_tokens

    def preprocess_text(self, text):
        """Run the full pipeline on one document.

        Args:
            text: Raw document text (a ``str``).

        Returns:
            The cleaned, tokenized, stopword-free, lemmatized tokens joined
            with single spaces.
        """
        self._log("Preprocessing text...")
        text = self.clean_text(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize(tokens)
        preprocessed_text = " ".join(tokens)
        self._log(f"Preprocessed text sample: {preprocessed_text[:100]}...")
        return preprocessed_text

# One shared preprocessor handles both splits. Missing texts are replaced by
# empty strings before being passed through the pipeline.
text_preprocessor = TextPreprocessor()
preprocess = text_preprocessor.preprocess_text

print("Applying TextPreprocessor to train data...")
train_texts = train['text'].fillna("")
train['text'] = train_texts.apply(preprocess)

print("Applying TextPreprocessor to test data...")
test_texts = test['text'].fillna("")
test['text'] = test_texts.apply(preprocess)

print("Text preprocessing complete.")

In [None]:
# === TF-IDF ===
print("Starting TF-IDF Vectorization...")
y = train['label']

# Unigrams and bigrams, vocabulary capped at 10000 features. No stop_words
# argument: stop words were already removed by TextPreprocessor.
# NOTE(review): the vectorizer is fitted on the whole training frame before
# the train/validation split below, so the validation rows contribute to the
# vocabulary and IDF weights.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X = tfidf.fit_transform(train['text'])
print(f"Shape of TF-IDF matrix (X): {X.shape}")

# The test set is only transformed with the already-fitted vectorizer.
X_test_final = tfidf.transform(test['text'])
print(f"Shape of TF-IDF matrix (X_test_final): {X_test_final.shape}")

# Hold out 20% of the rows for validation, with a fixed seed for the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
for split_name, features, labels in (("train", X_train, y_train), ("val", X_val, y_val)):
    print(f"X_{split_name} shape: {features.shape}, y_{split_name} shape: {labels.shape}")

## Обучение

In [None]:
# === Logistic Regression ===
def objective_lr(trial):
    """Optuna objective: mean 3-fold CV accuracy of logistic regression.

    Args:
        trial: Optuna trial used to sample the regularization parameter ``C``
            on a log scale from the range [1e-3, 10.0].

    Returns:
        Mean accuracy over 3 cross-validation folds on ``X_train``/``y_train``.
    """
    print("Starting Logistic Regression Optuna trial...")
    # suggest_float(..., log=True) is the replacement for the deprecated
    # suggest_loguniform and matches how the LightGBM/XGBoost objectives in
    # this notebook sample their log-scale parameters.
    C = trial.suggest_float('C', 1e-3, 10.0, log=True)
    print(f"Trial C value: {C}")
    model = LogisticRegression(C=C, max_iter=1000, solver='liblinear') #Added solver, prevents errors.
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    print(f"Trial score: {score}")
    print("Logistic Regression Optuna trial complete.")
    return score

# Tune C with a 3-trial Optuna study that maximizes the objective.
print("Starting Logistic Regression Optuna optimization...")
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr, n_trials=3)
print("Logistic Regression Optuna optimization complete.")

lr_best_params = study_lr.best_params
print(f"Best Logistic Regression parameters: {lr_best_params}")

# Refit on the training split with the best C, then score on the hold-out split.
best_lr = LogisticRegression(
    C=lr_best_params['C'],
    max_iter=1000,
    solver='liblinear',
)
best_lr.fit(X_train, y_train)
val_predictions_lr = best_lr.predict(X_val)
val_lr = accuracy_score(y_val, val_predictions_lr)
print(f"Logistic Regression Accuracy: {val_lr:.4f}")

# === LightGBM ===
def objective_lgb(trial):
    """Optuna objective for LightGBM: mean 3-fold CV accuracy.

    Args:
        trial: Optuna trial used to sample num_leaves, learning_rate,
            feature/bagging fractions, bagging frequency and the L1/L2
            regularization strengths.

    Returns:
        Mean accuracy over 3 cross-validation folds on ``X_train``/``y_train``.
    """
    print("Starting LightGBM Optuna trial...")
    # Fixed settings first, then the sampled ones. Key order and sampling
    # order are kept because the dict is printed below.
    lgb_params = dict(
        objective="binary",
        metric="binary_logloss",
        verbosity=-1,
        boosting_type="gbdt",
        num_leaves=trial.suggest_int("num_leaves", 20, 100),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        feature_fraction=trial.suggest_float("feature_fraction", 0.7, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.7, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        # L1 / L2 regularization strengths, sampled on a log scale.
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
    )
    print(f"Trial parameters: {lgb_params}")

    # Fixed number of trees and seed for every trial.
    classifier = lgb.LGBMClassifier(n_estimators=100, random_state=42, **lgb_params)
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print(f"Trial score: {mean_accuracy}")
    print("LightGBM Optuna trial complete.")
    return mean_accuracy

# Tune LightGBM with a 3-trial Optuna study that maximizes the objective.
print("Starting LightGBM Optuna optimization...")
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=3)
print("LightGBM Optuna optimization complete.")
print(f"Best LightGBM parameters: {study_lgb.best_params}")
# best_params contains only the values Optuna sampled, so the fixed settings
# used inside objective_lgb are repeated here. Without them the final model
# would not be configured like the tuned trials (notably verbosity=-1 would
# be lost). This mirrors how the XGBoost cell re-passes its fixed arguments.
best_lgb = lgb.LGBMClassifier(
    **study_lgb.best_params,
    objective="binary",
    metric="binary_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    n_estimators=100,
    random_state=42,
)
# Refit on the training split and score on the hold-out split.
best_lgb.fit(X_train, y_train)
val_lgb = accuracy_score(y_val, best_lgb.predict(X_val))
print(f"LightGBM Accuracy: {val_lgb:.4f}")

# === XGBoost ===
def objective_xgb(trial):
    """Optuna objective for XGBoost: mean 3-fold CV accuracy.

    Args:
        trial: Optuna trial used to sample learning_rate, max_depth,
            subsample, colsample_bytree, gamma, reg_alpha and reg_lambda.

    Returns:
        Mean accuracy over 3 cross-validation folds on ``X_train``/``y_train``.
    """
    print("Starting XGBoost Optuna trial...")
    # Key order and sampling order are kept because the dict is printed below.
    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases;
    # confirm against the installed version before removing it.
    xgb_params = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        use_label_encoder=False,
        eval_metric="logloss",
        # gamma, L1 (reg_alpha) and L2 (reg_lambda) regularization, log scale.
        gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
    )
    print(f"Trial parameters: {xgb_params}")

    # Fixed number of trees and seed for every trial.
    classifier = XGBClassifier(n_estimators=100, random_state=42, **xgb_params)
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring="accuracy")
    mean_accuracy = fold_scores.mean()
    print(f"Trial score: {mean_accuracy}")
    print("XGBoost Optuna trial complete.")
    return mean_accuracy

# Tune XGBoost with a 3-trial Optuna study that maximizes the objective.
print("Starting XGBoost Optuna optimization...")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=3)
print("XGBoost Optuna optimization complete.")
print(f"Best XGBoost parameters: {study_xgb.best_params}")

# best_params holds only the sampled values, so the fixed arguments are
# passed again explicitly.
best_xgb = XGBClassifier(
    **study_xgb.best_params,
    n_estimators=100,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
# Refit on the training split and score on the hold-out split.
best_xgb.fit(X_train, y_train)
val_predictions_xgb = best_xgb.predict(X_val)
val_xgb = accuracy_score(y_val, val_predictions_xgb)
print(f"XGBoost Accuracy: {val_xgb:.4f}")

### Анализ:

Таким образом, на валидационной выборке получены следующие оценки точности (accuracy):

Логистическая регрессия: 0.9747

LightGBM: 0.9817

XGBoost: 0.9805

Поскольку подбор гиперпараметров в Optuna выполняется без фиксированного seed и всего за 3 испытания на модель, эти значения ориентировочны и могут меняться от запуска к запуску даже при неизменных настройках обучения. В целом можно сделать вывод, что логистическая регрессия справляется несколько хуже, чем LightGBM и XGBoost, которые показывают примерно одинаковый результат.

### Получение результата

In [None]:
# === Ensemble ===
print("Starting Ensemble...")
# Combine the three tuned models in a soft-voting classifier and fit it on
# the full training matrix (X, y) rather than only on the training split.
ensemble_members = [
    ('lr', best_lr),
    ('lgb', best_lgb),
    ('xgb', best_xgb),
]
ensemble = VotingClassifier(estimators=ensemble_members, voting='soft')
ensemble.fit(X, y)
print("Ensemble fitted.")
final_preds = ensemble.predict(X_test_final)
print("Ensemble predictions made.")

# Two-column submission: the test ids with their predicted labels.
submission = test[['id']].assign(label=final_preds)
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

In [None]:
# Show the first 50 rows of the submission for a quick visual check.
submission.iloc[:50]