In [15]:
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from transformers import DataCollatorForSeq2Seq
from urllib.parse import urlparse
from urllib.parse import parse_qs
from transformers import T5Tokenizer
from evaluate import load
from youtube_transcript_api import YouTubeTranscriptApi
import scipy
import math
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

In [2]:
# Download the TL;DR summarisation dataset from the Hugging Face Hub.
# `data_set` is indexed by split name; the per-split DataFrames (`ds`) are
# built from it in the next cell.
data_set = load_dataset("trl-lib/tldr")

In [3]:
# Convert every split to a pandas DataFrame and pull the post body and the
# title out of the raw prompt text into their own columns.
#
# `(?s)` lets `.` match newlines, so one capture group is enough and
# `str.extract(..., expand=False)` returns a Series directly. Rows whose
# prompt does not match the pattern get NaN.
POST_PATTERN = r'(?s)POST: (.*)\nTL;DR:'
TITLE_PATTERN = r'(?s)TITLE: (.*)\n\nPOST:'

ds = {}
for set_type in data_set.keys():
    split_frame = data_set[set_type].to_pandas()
    split_frame["prompt_post"] = split_frame["prompt"].str.extract(
        POST_PATTERN, expand=False
    )
    split_frame["prompt_title"] = split_frame["prompt"].str.extract(
        TITLE_PATTERN, expand=False
    )
    ds[set_type] = split_frame

In [4]:
# Convenience handle on the training-split DataFrame built in the previous cell.
train_set = ds["train"]

In [5]:
# Tokenizer for the `t5-small` checkpoint; shared by the preprocessing function,
# the data collator and the metric decoding further down.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Work on a small random subset so a full fine-tuning run stays quick.
# The seed is fixed so Restart & Run All draws the same rows every time.
SEED = 42
N_TRAIN_SAMPLES = 1000
N_VALIDATION_SAMPLES = 100

training_samples = ds["train"].sample(N_TRAIN_SAMPLES, random_state=SEED)
validation_samples = ds["validation"].sample(N_VALIDATION_SAMPLES, random_state=SEED)
training_samples.columns

Index(['prompt', 'completion', 'prompt_post', 'prompt_title'], dtype='object')

In [7]:
def preprocess_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize prompts and reference summaries for seq2seq fine-tuning.

    Args:
        examples: pandas DataFrame with a "prompt" column (model input text)
            and a "completion" column (reference summary).
        max_input_length: inputs are truncated/padded to this many tokens.
        max_target_length: labels are truncated/padded to this many tokens.

    Returns:
        The tokenizer output for the inputs, with an extra "labels" entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so the loss ignores them.
    """
    # T5 is steered by a task prefix on the input text.
    inputs = ["summarize: " + doc for doc in examples.loc[:, "prompt"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    targets = examples.loc[:, "completion"].tolist()
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Labels are already padded to a fixed length here, so the data collator
    # never gets to insert -100 itself; mask the pad tokens explicitly,
    # otherwise the model is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [8]:
# Tokenize both sampled frames and wrap each result in a `Dataset` for the
# Trainer, then load the ROUGE metric used during evaluation.
p_training_samples, p_validation_samples = (
    Dataset.from_dict(preprocess_function(frame))
    for frame in (training_samples, validation_samples)
)
rouge = load("rouge")



In [9]:
# Pretrained `t5-small` checkpoint to fine-tune, plus the collator that turns
# tokenized examples into batches for it.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
def _to_token_ids(row, pad_id):
    """Turn one prediction/label row into a flat list of decodable token ids."""
    ids = np.asarray(row)
    # A 2-D row is treated as per-position scores over the vocabulary
    # (seq, vocab), which is what a plain `Trainer` returns; pick the
    # highest-scoring token at every position.
    if ids.ndim == 2:
        ids = ids.argmax(axis=-1)
    ids = ids.astype(np.int64)
    # -100 marks ignored positions and is not a valid vocabulary id.
    return np.where(ids == -100, pad_id, ids).tolist()


def compute_metrics(eval_pred):
    """Compute ROUGE scores (scaled to 0-100) for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` may be
            token ids, raw logits, or a tuple whose first element is one of
            those; ``labels`` are token ids with -100 at ignored positions.

    Returns:
        dict mapping each metric name returned by ``rouge.compute`` to its
        value multiplied by 100 and rounded to 4 decimals.
    """
    preds, labels = eval_pred

    # The model may return a tuple (logits, extra outputs...); keep the first.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    pred_ids = [_to_token_ids(row, pad_id) for row in preds]
    label_ids = [_to_token_ids(row, pad_id) for row in labels]

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return {name: round(value * 100, 4) for name, value in result.items()}



In [19]:
def _logits_to_token_ids(logits, labels):
    """Reduce one eval batch of model outputs to predicted token ids.

    Passed to the Trainer as `preprocess_logits_for_metrics`, so only a
    (batch, seq) id tensor is kept per batch instead of the full
    (batch, seq, vocab) scores, and `compute_metrics` receives ids it can
    decode. `labels` is required by the hook signature but unused.
    """
    # The outputs can arrive as a tuple; the first element holds the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Hyper-parameters for the short fine-tuning run; evaluation happens once per epoch.
training_args = TrainingArguments(
    output_dir="./mini-trained-model-for-project",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=p_training_samples,
    eval_dataset=p_validation_samples,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

In [20]:
# Run fine-tuning. With evaluation_strategy="epoch" (set in the previous cell)
# the validation set is evaluated, and compute_metrics called, after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss


IndexError: piece id is out of range.

In [16]:
# Sample YouTube watch URL (with extra playlist parameters) used to try out
# the transcript fetching at the end of this cell.
non_sense = "https://www.youtube.com/watch?v=YcSP1ZUf1eQ&list=RDYcSP1ZUf1eQ&start_radio=1"
def get_video_key(p_url):
    """Extract the video id from a YouTube URL.

    Supported forms:
      * ``https://www.youtube.com/watch?v=<id>`` (extra query parameters are ignored)
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``

    Args:
        p_url: the URL as a string.

    Returns:
        The video id as a string.

    Raises:
        KeyError: if no video id can be found in the URL.
    """
    parsed_url = urlparse(p_url)

    # Standard watch URLs carry the id in the `v` query parameter.
    query_ids = parse_qs(parsed_url.query).get("v")
    if query_ids:
        return query_ids[0]

    # Otherwise the id is a path segment.
    path_parts = [part for part in parsed_url.path.split("/") if part]
    host = (parsed_url.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be") and path_parts:
        return path_parts[0]
    if len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
        return path_parts[1]

    raise KeyError(f"no YouTube video id found in {p_url!r}")

# Client object from the youtube_transcript_api package.
ytt_api = YouTubeTranscriptApi()

# Fetch using the video id parsed out of the sample URL; presumably this returns
# that video's transcript -- confirm against the youtube_transcript_api docs.
fc = ytt_api.fetch(get_video_key(non_sense))