# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import os

# --- Performance Optimization ---
# Report which device training will use. On CUDA hardware, also relax the
# float32 matmul precision to 'high' so PyTorch may pick faster kernels.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("CUDA not available. Training will run on CPU, which will be significantly slower.")
else:
    torch.set_float32_matmul_precision('high')
    print("CUDA is available. GPU will be used for training.")

# --- 1. Load the Pre-training Dataset ---
# This file contains all the comments with their stance and sentiment labels.
file_path = '../datasets/dataset_with_stances.csv'
try:
    # Only the read can raise FileNotFoundError, so keep the try body minimal.
    stance_df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at '{file_path}'.")
    # Raise SystemExit with a non-zero status instead of calling exit():
    # exit() is an interactive helper that reports success (status 0) in a
    # script and shuts the kernel down when run inside a notebook.
    raise SystemExit(1) from None

# Drop rows missing the text or either label, as they cannot be used for training.
stance_df = stance_df.dropna(subset=['text', 'stance', 'sentiment'])
print(f"Successfully loaded and cleaned {file_path}")

# --- 2. Prepare Data for T5 Multi-Task Learning ---
# T5 works well when you frame tasks as text-to-text problems.
# We'll add a prefix to each text input to tell the model which task to perform.

# Cast once to str so that a non-string cell (e.g. a comment consisting only of
# digits that was parsed as a number) cannot break the prefix concatenation.
comment_text = stance_df['text'].astype(str)

df_stance = pd.DataFrame({
    'input_text': 'stance: ' + comment_text,
    'target_text': stance_df['stance'],
})

df_sentiment = pd.DataFrame({
    'input_text': 'sentiment: ' + comment_text,
    'target_text': stance_df['sentiment'],
})

# Combine both tasks into a single DataFrame for training.
# Every source row therefore contributes two examples (one per task).
pretrain_df = pd.concat([df_stance, df_sentiment], ignore_index=True)
print(f"Prepared a multi-task dataset with {len(pretrain_df)} examples.")

# --- 3. Split the Dataset ---
# Hold out 10% of the examples as a validation set to monitor the model's
# performance during training. The split is seeded so that repeated runs
# evaluate on the same rows and their validation losses are comparable.
train_df, val_df = train_test_split(pretrain_df, test_size=0.1, random_state=42)

# --- 4. Initialize Tokenizer and Model ---
# Both the model weights and the tokenizer are loaded from the same
# Flan-T5 checkpoint (an instruction-tuned variant of T5).
model_name = 'google/flan-t5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# --- 5. Create a PyTorch Dataset ---
class RumorDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (input_text, target_text) pairs.

    Each item is a dict with 1-D ``input_ids``, ``attention_mask`` and
    ``labels`` tensors. Padding positions in ``labels`` are replaced with
    ``IGNORE_INDEX`` so they do not contribute to the training loss; without
    this, short targets such as a single class word would be dominated by
    padding tokens.

    Args:
        dataframe: DataFrame with ``input_text`` and ``target_text`` columns.
        tokenizer: Tokenizer called as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning ``input_ids`` and ``attention_mask`` of shape (1, length).
        max_length: Padded/truncated length of the input sequence.
        target_max_length: Padded/truncated length of the label sequence.
            Defaults to ``max_length`` when omitted.
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=128, target_max_length=None):
        self.tokenizer = tokenizer
        self.inputs = dataframe['input_text'].tolist()
        self.targets = dataframe['target_text'].tolist()
        self.max_length = max_length
        self.target_max_length = max_length if target_max_length is None else target_max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def _encode(self, text, max_length):
        """Tokenize one string to fixed-length tensors with a leading batch axis of 1."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``."""
        # str() guards against non-string cells (e.g. numeric labels).
        source = self._encode(str(self.inputs[idx]), self.max_length)
        target = self._encode(str(self.targets[idx]), self.target_max_length)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when the configured length is 1.
        target_ids = target['input_ids'].squeeze(0)
        target_mask = target['attention_mask'].squeeze(0)

        # Mask padding via the attention mask rather than by token id, so a
        # real token that happens to share the pad id is never hidden.
        labels = target_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Wrap each split in a RumorDataset that shares the same tokenizer.
train_dataset, val_dataset = (
    RumorDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

# --- 6. Define Training Arguments ---
# These arguments are optimized for faster training while staying valid on
# both GPU and CPU machines.
use_cuda = torch.cuda.is_available()
# fp16 is known to be numerically unstable (overflow / NaN loss) with T5-family
# models, so mixed precision is enabled only through bf16, and only on GPUs
# that support it. Otherwise training runs in full precision.
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir='../results/pretraining',
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Adjust down if you get memory errors.
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",           # Must match eval_strategy for load_best_model_at_end.
    load_best_model_at_end=True,
    fp16=False,
    bf16=use_bf16,
    # os.cpu_count() may return None; fall back to loading in the main process.
    dataloader_num_workers=os.cpu_count() or 0,
    # The fused optimizer is only selected on CUDA; use the standard
    # AdamW implementation on the CPU fallback path.
    optim="adamw_torch_fused" if use_cuda else "adamw_torch",
)

# --- 7. Initialize the Trainer ---
# Collect the Trainer's inputs first, then build it: the model to optimise,
# the run configuration, and the train / validation datasets.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
}
trainer = Trainer(**trainer_kwargs)

# --- 8. Start Training ---
print("\nStarting model pre-training... This will take some time.")
trainer.train()
print("Pre-training complete.")

# --- 9. Save the Final Model ---
# Write the trained model and its tokenizer to the same directory so the
# checkpoint can be reloaded from a single path in the next stage.
final_model_path = '../models/rumor_knowledge_model'
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(final_model_path)
print(f"Pre-trained model saved to '{final_model_path}'.")


# Cell output:
# CUDA is available. GPU will be used for training.
# Successfully loaded and cleaned ../datasets/dataset_with_stances.csv
# Prepared a multi-task dataset with 99332 examples.
#
# Starting model pre-training... This will take some time.
