In [1]:
import warnings
warnings.filterwarnings("ignore")


In [1]:
import os
import shutil
# Avoid TensorFlow imports by setting environment variable
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # Suppress TensorFlow warnings
os.environ["USE_TORCH"] = "1"  # Force transformers to use PyTorch

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import requests

# Increase timeout for Hugging Face downloads
os.environ["HF_HUB_ETAG_TIMEOUT"] = "60"  # Set timeout to 60 seconds
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "60"

# Clear cache directory to avoid conflicts
cache_dir = '/kaggle/working/cache'
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
os.makedirs(cache_dir, exist_ok=True)

# Verify no TensorFlow interference
try:
    import tensorflow as tf
    print("Warning: TensorFlow is imported. This may cause conflicts.")
except ImportError:
    print("No TensorFlow import detected. Proceeding with PyTorch.")

# Check internet connectivity
try:
    response = requests.get("https://huggingface.co", timeout=10)
    response.raise_for_status()
    print("Internet connection verified.")
except requests.RequestException as e:
    print(f"Warning: No internet connection ({e}). Ensure internet is enabled in Kaggle settings or use offline mode.")

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset from Kaggle input directory
data_path = "/kaggle/input/nnnnpp2/your_file_name.csv"
try:
    df = pd.read_csv(data_path)
    if 'article' not in df.columns or 'highlights' not in df.columns:
        raise ValueError("Dataset must contain 'article' and 'highlights' columns")
    print(f"Dataset loaded successfully. Size: {len(df)} rows")
except FileNotFoundError:
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please check the path.")

# Split into train and validation sets (10% for validation)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"Training set size: {len(train_df)} rows, Validation set size: {len(val_df)} rows")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# Initialize tokenizer
try:
    tokenizer = T5Tokenizer.from_pretrained('t5-small', cache_dir=cache_dir)
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer: {e}")

# Preprocess function to tokenize inputs and targets
def preprocess_function(examples):
    inputs = ["summarize: " + str(doc) for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer([str(highlight) for highlight in examples['highlights']], max_length=128, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize datasets
try:
    tokenized_train = train_dataset.map(preprocess_function, batched=True, num_proc=4)
    tokenized_val = val_dataset.map(preprocess_function, batched=True, num_proc=4)
except Exception as e:
    raise RuntimeError(f"Tokenization failed: {e}")

# Initialize model and move to GPU
try:
    model = T5ForConditionalGeneration.from_pretrained('t5-small', cache_dir=cache_dir).to(device)
except Exception as e:
    raise RuntimeError(f"Failed to load model: {e}")

# Define training arguments optimized for Kaggle GPU
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    num_train_epochs=10,
    per_device_train_batch_size=8,  # Suitable for 6500 rows on Kaggle GPU
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=1000,
    load_best_model_at_end=True,
    fp16=True,  # Enable mixed precision for GPU
    report_to='none',  # Disable wandb logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Start training
try:
    trainer.train()
except Exception as e:
    raise RuntimeError(f"Training failed: {e}")

# Save the fine-tuned model
model.save_pretrained('/kaggle/working/fine_tuned_t5')
tokenizer.save_pretrained('/kaggle/working/fine_tuned_t5')
print("Model and tokenizer saved to /kaggle/working/fine_tuned_t5")

# Example inference
def summarize_text(text):
    inputs = tokenizer("summarize: " + str(text), return_tensors='pt', max_length=512, truncation=True).to(device)
    summary_ids = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test inference with a sample article
try:
    sample_article = df['article'].iloc[0]
    print("Sample Article:", str(sample_article)[:200], "...")
    print("Summary:", summarize_text(sample_article))
except Exception as e:
    print(f"Inference failed: {e}")

E0000 00:00:1750262461.834579      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750262461.893071      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Internet connection verified.
Using device: cuda
Dataset loaded successfully. Size: 6490 rows
Training set size: 5841 rows, Validation set size: 649 rows


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map (num_proc=4):   0%|          | 0/5841 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/649 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,1.2491,1.133768
1000,1.2096,1.118031
1500,1.1591,1.11557
2000,1.1578,1.110931
2500,1.1684,1.110257
3000,1.1454,1.109255
3500,1.1503,1.109499


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Model and tokenizer saved to /kaggle/working/fine_tuned_t5
Sample Article: A drunk teenage boy had to be rescued by security after jumping into a lions' enclosure at a zoo in western India. Rahul Kumar, 17, clambered over the enclosure fence at the Kamla Nehru Zoological Par ...
Summary: Rahul Kumar, 17, climbed into a lions' enclosure at a zoo in Ahmedabad. He ran towards the lions shouting: 'Today I kill a lion or a lion kills me!' He was rescued by zoo guards before reaching the lions.


<function __main__.preprocess_function(examples)>