In [None]:
import pandas as pd
import re
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# --- CONFIGURATION ---
# Hyper-parameters consumed by run_bert() below.
MAX_LEN = 512  # passed as max_length to the tokenizer (with truncation enabled)
BATCH_SIZE = 8  # per-device batch size, used for both training and evaluation
EPOCHS = 2  # passed as num_train_epochs
LEARNING_RATE = 2e-5 # Reduced for stability

def light_clean_text(text):
    """Lightly normalise raw text for BERT while keeping punctuation.

    Non-string input (e.g. ``None`` or ``NaN``) yields an empty string.
    Otherwise the text is lower-cased, stripped of URLs and HTML-like tags,
    and every run of whitespace is collapsed to a single space with the
    ends trimmed.
    """
    if not isinstance(text, str):
        return ""

    # Applied in order: links first, then tags, whitespace tidied last.
    substitutions = (
        (r'http\S+|www\.\S+', ''),  # links
        (r'<.*?>', ''),             # HTML tags
        (r'\s+', ' '),              # whitespace runs -> single space
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping from feature name to a per-sample sequence
    (anything exposing ``.items()`` whose values can be indexed by sample
    position). ``labels`` is any sequence of integer class ids with one
    entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise labels as a plain list so __getitem__ always indexes by
        # POSITION. A pandas Series is indexed by label, so a Series whose
        # index was not reset (e.g. straight out of train_test_split) would
        # raise KeyError or return the wrong row.
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    ``p`` must expose ``predictions`` (per-class scores, reduced with argmax
    over axis 1) and ``label_ids`` (the reference labels). Returns a dict
    with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return {'accuracy': accuracy, 'f1': f1}

def run_bert():
    """Fine-tune ``bert-base-uncased`` on the WELFake dataset and save the result.

    Pipeline:
      1. Load ``../data/WELFake_Dataset.csv``, drop rows missing ``title`` or
         ``text``, and clean the concatenated title + text.
      2. Make a stratified 80/20 train/test split and tokenize both parts.
      3. Build the model and the Hugging Face ``Trainer``.
      4. Train, evaluating and checkpointing every 200 steps.
      5. Print the final evaluation metrics and save the model and tokenizer
         to ``./saved_model_bert_final``.

    Mixed-precision (fp16) training is enabled only when CUDA is available,
    so the function also runs on CPU-only machines.
    """
    print("--- 1. Wczytywanie i Light Clean (BERT) ---")
    df = pd.read_csv("../data/WELFake_Dataset.csv")
    df.dropna(subset=['title', 'text'], inplace=True)

    # Concatenate title and body, then apply the light cleaning
    df['light_text'] = (df['title'] + " " + df['text']).apply(light_clean_text)

    # Split the data
    X = df['light_text']
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Reset the label indices so positional lookups in the PyTorch dataset
    # line up with the tokenized rows
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    print("--- 2. Tokenizacja (To potrwa chwilę dla 512 tokenów) ---")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=MAX_LEN)
    test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=MAX_LEN)

    train_dataset = FakeNewsDataset(train_encodings, y_train)
    test_dataset = FakeNewsDataset(test_encodings, y_test)

    print("--- 3. Przygotowanie modelu ---")
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"Używane urządzenie: {device}")

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    # NOTE(review): the held-out test split is also used as eval_dataset, and
    # load_best_model_at_end selects the checkpoint on it, so the final
    # metrics are measured on data that influenced model selection. Consider
    # a separate validation split.
    training_args = TrainingArguments(
        output_dir='./results_bert_fixed',
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=4, # Simulates a larger batch (8 * 4 = 32)
        learning_rate=LEARNING_RATE,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        load_best_model_at_end=True,
        # Mixed precision needs a GPU: with fp16=True on a CPU-only machine
        # TrainingArguments raises, which would defeat the CPU fallback above.
        fp16=use_cuda,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    print("--- 4. Rozpoczęcie Treningu ---")
    trainer.train()

    print("--- 5. Finalna Ewaluacja ---")
    print(trainer.evaluate())

    # Save the model together with its tokenizer
    model.save_pretrained("./saved_model_bert_final")
    tokenizer.save_pretrained("./saved_model_bert_final")
    print("Model zapisany.")

# Entry point: run the pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    run_bert()

--- 1. Wczytywanie i Light Clean (BERT) ---
--- 2. Tokenizacja (To potrwa chwilę dla 512 tokenów) ---
--- 3. Przygotowanie modelu ---
Używane urządzenie: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- 4. Rozpoczęcie Treningu ---


Step,Training Loss,Validation Loss,Accuracy,F1
200,0.0626,0.085455,0.970995,0.970879
400,0.0435,0.038048,0.989446,0.98962
600,0.0405,0.043174,0.989726,0.989961
800,0.0348,0.023007,0.993011,0.993124
1000,0.026,0.019731,0.99378,0.99391
1200,0.0374,0.02715,0.991753,0.991979
1400,0.0335,0.014109,0.995737,0.995829
1600,0.0149,0.015564,0.994688,0.994787
1800,0.0142,0.013549,0.995457,0.99554
2000,0.018,0.013809,0.996156,0.996239


--- 5. Finalna Ewaluacja ---


{'eval_loss': 0.01041598804295063, 'eval_accuracy': 0.9971344702264467, 'eval_f1': 0.9971919731525238, 'eval_runtime': 57.3762, 'eval_samples_per_second': 249.372, 'eval_steps_per_second': 31.18, 'epoch': 2.0}
Model zapisany.
