In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
import torch
import os
from datasets import Dataset # Moved import to the top for clarity

# --- Custom Callback for Epoch Progress ---
class EpochProgressCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        print(f"\n--- Starting Epoch {int(state.epoch) + 1}/{int(state.num_train_epochs)} ---")
    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        if logs:
            print(f"--- Finished Epoch {int(state.epoch)} ---")
            print(f"Validation Loss: {logs.get('eval_loss', 'N/A'):.4f}")
            print("--------------------------------------------------")

# --- Performance Optimization ---
if torch.cuda.is_available():
    torch.set_float32_matmul_precision('high')
    print("CUDA is available. GPU will be used for training.")
else:
    print("CUDA not available. Training will run on CPU, which will be significantly slower.")

# --- 1. Load the Pre-training Dataset ---
file_path = '../datasets/dataset_with_stances.csv'
try:
    stance_df = pd.read_csv(file_path)
    stance_df.dropna(subset=['text', 'stance', 'sentiment'], inplace=True)
    print(f"Successfully loaded and cleaned {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at '{file_path}'.")
    exit()

# --- 2. Prepare Data for Multi-Task Learning ---
df_stance = pd.DataFrame({
    'input_text': 'stance: ' + stance_df['text'].astype(str),
    'target_text': stance_df['stance'].astype(str)
})
df_sentiment = pd.DataFrame({
    'input_text': 'sentiment: ' + stance_df['text'].astype(str),
    'target_text': stance_df['sentiment'].astype(str)
})
pretrain_df = pd.concat([df_stance, df_sentiment], ignore_index=True)
print(f"Prepared a multi-task dataset with {len(pretrain_df)} examples.")

# --- 3. Split the Dataset ---
train_df, val_df = train_test_split(pretrain_df, test_size=0.1, random_state=42)

# --- 4. Initialize Tokenizer and Model ---
model_name = 'google/flan-t5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# --- 5. OPTIMIZATION: Pre-tokenize the entire dataset ---
print("\nStarting to pre-tokenize the entire dataset... This may take a moment.")

# MODIFIED: Added 'tokenizer' as an argument to the function
def tokenize_function(examples, tokenizer):
    inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], padding='max_length', truncation=True, max_length=128)
    
    inputs['labels'] = labels['input_ids']
    return inputs

train_hf_dataset = Dataset.from_pandas(train_df)
val_hf_dataset = Dataset.from_pandas(val_df)

# MODIFIED: Passed the tokenizer to the map function using 'fn_kwargs'
tokenized_train_dataset = train_hf_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=os.cpu_count(),
    fn_kwargs={'tokenizer': tokenizer}
)
tokenized_val_dataset = val_hf_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=os.cpu_count(),
    fn_kwargs={'tokenizer': tokenizer}
)
print("Pre-tokenization complete.")


# --- 6. Define Training Arguments ---
training_args = TrainingArguments(
    output_dir='../results/pretraining',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=os.cpu_count(),
    optim="adamw_torch_fused",
)

# --- 7. Initialize the Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    callbacks=[EpochProgressCallback()]
)

# --- 8. Start Training ---
print("\nStarting model pre-training...")
trainer.train()
print("Pre-training complete.")

# --- 9. Save the Final Model ---
final_model_path = '../models/rumor_knowledge_model'
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Pre-trained model saved to '{final_model_path}'.")



CUDA is available. GPU will be used for training.
Successfully loaded and cleaned ../datasets/dataset_with_stances.csv
Prepared a multi-task dataset with 99332 examples.

Starting to pre-tokenize the entire dataset... This may take a moment.


Map (num_proc=12): 100%|██████████| 89398/89398 [00:29<00:00, 3014.35 examples/s] 
Map (num_proc=12): 100%|██████████| 9934/9934 [00:12<00:00, 780.76 examples/s] 


Pre-tokenization complete.

Starting model pre-training...

--- Starting Epoch 1/3 ---


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,



--- Starting Epoch 2/3 ---

--- Starting Epoch 3/3 ---
Pre-training complete.
Pre-trained model saved to '../models/rumor_knowledge_model'.
