In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily populated cache of the NLTK English stop-word list.
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loading the corpus is comparatively expensive; doing it on every
        # call (once per DataFrame row) is wasted work, so cache the result.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a raw document into a space-separated string of tokens.

    Per the original author's note, this is the same cleaning that is used
    for the Logistic Regression model.

    Steps:
      1. Drop every character that is not an ASCII letter or whitespace.
      2. Lowercase the text.
      3. Split on whitespace and keep only tokens longer than two characters
         that are not English stop words.

    Args:
        text: The raw text. Any non-``str`` value (e.g. ``None`` or NaN)
            is treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input or when no token survives the filtering.
    """
    if not isinstance(text, str):
        return ""
    stop_words = _get_stop_words()
    cleaned = _NON_LETTER_RE.sub('', text).lower()
    return " ".join(
        word
        for word in cleaned.split()
        if len(word) > 2 and word not in stop_words
    )

def run_xgboost(data_path="../data/WELFake_Dataset.csv"):
    """Train an XGBoost classifier on TF-IDF features and print its metrics.

    Pipeline:
      1. Read the CSV and drop rows with a missing ``title`` or ``text``.
      2. Concatenate title and text, then clean them with ``preprocess_text``.
      3. Make a stratified 80/20 train/test split (fixed seed).
      4. Fit a TF-IDF vectorizer (at most 5000 features) on the training
         split only and transform both splits with it.
      5. Fit an ``XGBClassifier`` and print accuracy, F1 score and the
         classification report for the test split.

    Args:
        data_path: Path of the CSV file to load. It must provide the
            ``title``, ``text`` and ``label`` columns. Defaults to the
            location used by the original notebook.

    Returns:
        None. All results are printed to standard output.
    """
    print("--- 1. Wczytywanie danych (XGBoost) ---")
    df = pd.read_csv(data_path)
    df.dropna(subset=['title', 'text'], inplace=True)
    df['full_text'] = df['title'] + " " + df['text']

    print("--- 2. Preprocessing ---")
    df['cleaned_text'] = df['full_text'].apply(preprocess_text)

    X = df['cleaned_text']
    y = df['label']
    # Stratify so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("--- 3. Wektoryzacja TF-IDF ---")
    vectorizer = TfidfVectorizer(max_features=5000)
    # Fit on the training split only so no test-set vocabulary/IDF leaks in.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    print("--- 4. Trening (XGBoost) ---")
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter and only emits a warning for it.
    # To train on a GPU with XGBoost >= 2.0, add device='cuda' (the older
    # tree_method='gpu_hist' spelling is deprecated there).
    model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    )
    model.fit(X_train_tfidf, y_train)

    print("--- 5. Ewaluacja ---")
    preds = model.predict(X_test_tfidf)
    print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
    print(f"F1 Score: {f1_score(y_test, preds):.4f}")
    print("\nRaport:")
    print(classification_report(y_test, preds))

# Entry point: run the whole training/evaluation pipeline when this file is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    run_xgboost()

--- 1. Wczytywanie danych (XGBoost) ---
--- 2. Preprocessing ---
--- 3. Wektoryzacja TF-IDF ---
--- 4. Trening (XGBoost) ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- 5. Ewaluacja ---
Accuracy: 0.9674
F1 Score: 0.9684

Raport:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      7006
           1       0.96      0.98      0.97      7302

    accuracy                           0.97     14308
   macro avg       0.97      0.97      0.97     14308
weighted avg       0.97      0.97      0.97     14308

