In [1]:
!pip install transformers
!pip install pandas
!pip install torch torchvision
!pip install sentencepiece
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m119.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.3 MB/s[0m eta [36m0:00:

In [2]:

# Import required libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.data import Dataset
import pandas as pd


In [3]:
# Custom Dataset
class CustomTextDataset(Dataset):
    def __init__(self, tokenizer, data_file, text_col, target_col, max_length=512):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(data_file)
        self.text_col = text_col
        self.target_col = target_col
        self.max_length = max_length
        print(f"DataFrame Length: {len(self.data)}")  # Debug statement 1

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
      text = self.data.loc[index, self.text_col]
      target = self.data.loc[index, self.target_col]

      # Tokenize both the text and target
      inputs = self.tokenizer(
          text,
          add_special_tokens=True,
          max_length=self.max_length,
          truncation=True,
          padding="max_length",
          return_attention_mask=True,
          return_tensors='pt'
      )

      targets = self.tokenizer(
          target,
          add_special_tokens=True,
          max_length=self.max_length,
          truncation=True,
          padding="max_length",
          return_attention_mask=True,
          return_tensors='pt'
      )

      # Squeeze unnecessary dimensions
      inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
      targets = {key: tensor.squeeze(0) for key, tensor in targets.items()}

      return {
          'input_ids': inputs['input_ids'],
          'attention_mask': inputs['attention_mask'],
          'labels': targets['input_ids'],
      }

In [4]:
# Initialize the T5 base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Initialize custom dataset for FAQ
train_dataset = CustomTextDataset(tokenizer, "/content/train_faq.csv", "summarized_text", "FAQ")
val_dataset = CustomTextDataset(tokenizer, "/content/val_faq.csv", "summarized_text", "FAQ")
test_dataset = CustomTextDataset(tokenizer, "/content/test_faq.csv", "summarized_text", "FAQ")

DataFrame Length: 1757
DataFrame Length: 220
DataFrame Length: 220


In [6]:
# Data Collator with Debugging
class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    def __call__(self, batch):
        # Call the original data collator
        batch = super().__call__(batch)
        return batch


In [7]:
# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer with CustomDataCollator
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=CustomDataCollatorForSeq2Seq(tokenizer, model),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [8]:
# Train the model
trainer.train()

# Save the trained model
model.save_pretrained("/content/summarization_model")

# Evaluate on test data for FAQ
test_results = trainer.evaluate(test_dataset=test_dataset)
print("Test Results:", test_results)

Step,Training Loss
500,3.3274


Test Results: {'eval_loss': 0.026101037859916687, 'eval_runtime': 4.672, 'eval_samples_per_second': 47.089, 'eval_steps_per_second': 1.498, 'epoch': 5.0}
