In [16]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install nltk
!pip install rouge_score
!pip install accelerate -U
!pip install transformers[torch]

zsh:1: no matches found: transformers[torch]


In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
model_id="google/flan-t5-small"
access_token = ""
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=access_token)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, use_auth_token=access_token)
import torch
import json
from datasets import load_dataset, Dataset
import pandas
from evaluate import load



In [20]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available. Using MPS device.")
else:
    device = torch.device("cpu")
    print("MPS device not found. Using CPU.")

MPS is available. Using MPS device.


In [22]:
training_data = []
count = 0
with open("../NLP Processing/after_scraping/Fine-Tuning-Datasets/tuning_summarized_data.json", "r") as file:
    context_data = json.load(file)
    for context in context_data:
      training_data.append({"document": context["context"], "summary":context["summary"] , "id": count})
      count+=1

df = pandas.DataFrame(training_data)

dataset = Dataset.from_pandas(df)

train_dataset = dataset
test_dataset = dataset
metric = load("rouge")
dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 250
})

In [23]:
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
metric.compute(predictions=fake_preds, references=fake_labels)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [24]:
from transformers import AutoTokenizer
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [25]:
max_input_length = 1024
max_target_length = 512
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

preprocess_function(train_dataset[:2])

{'input_ids': [[21603, 10, 23689, 12530, 4871, 155, 93, 107, 2837, 221, 21, 2142, 2168, 8172, 7155, 7, 14862, 57, 18529, 9, 9655, 9, 3488, 221, 63, 291, 16, 8, 215, 507, 4225, 23689, 12530, 19, 46, 3555, 408, 57, 2789, 3, 7, 24812, 10099, 11, 19, 80, 13, 8, 200, 8548, 1747, 16, 23689, 37, 3, 15, 6117, 1528, 20212, 13009, 21, 17, 3676, 1584, 2951, 7293, 7, 4648, 11, 1442, 8652, 7, 590, 28, 8732, 1679, 25068, 7, 16, 8, 2422, 94, 19, 213, 8, 11268, 384, 341, 3, 28799, 44, 8, 915, 100, 10900, 3409, 19, 1327, 705, 145, 46, 9241, 17, 7159, 37, 20212, 65, 4964, 3361, 7, 24, 43, 118, 3, 20923, 12, 8, 3488, 221, 63, 291, 7, 13, 499, 7, 127, 15, 10450, 12379, 152, 189, 1823, 1478, 20008, 450, 76, 382, 603, 53, 7, 1771, 12, 2089, 45, 335, 3, 1206, 5422, 12, 305, 3, 1206, 3246, 16924, 651, 8495, 27, 9142, 3, 13427, 21, 2557, 7, 27, 9142, 3, 25991, 21, 2959, 277, 12021, 3403, 368, 2929, 3450, 86, 23689, 1], [21603, 10, 2262, 4987, 30356, 3, 7, 5550, 12530, 3181, 15, 7, 107, 3304, 13363, 5514, 23, 2

In [26]:
training_dataset = train_dataset.map(preprocess_function, batched=True)
testing_dataset = test_dataset.map(preprocess_function, batched=True)



Map: 100%|██████████| 250/250 [00:00<00:00, 5462.44 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 6106.35 examples/s]


In [27]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [32]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-attraction",

    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
    hub_token=access_token,
)



In [33]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [34]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [35]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=training_dataset,
    eval_dataset=testing_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [37]:
trainer.train()



AttributeError: module 'wandb' has no attribute 'log'

In [39]:
model.push_to_hub("finetuned-attraction_summarization_t5", use_auth_token=access_token)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

CommitInfo(commit_url='https://huggingface.co/NoelTiju/finetuned-attraction_summarization_t5/commit/ebb726eef426a116222faa0fde0ebb767f41b797', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='ebb726eef426a116222faa0fde0ebb767f41b797', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
tokenizer.push_to_hub("finetuned-attraction_summarization_t5", use_auth_token=access_token)



CommitInfo(commit_url='https://huggingface.co/NoelTiju/finetuned-attraction_summarization_t5/commit/02d4837871df0861fdbc96078be44e6cb575331e', commit_message='Upload tokenizer', commit_description='', oid='02d4837871df0861fdbc96078be44e6cb575331e', pr_url=None, pr_revision=None, pr_num=None)

In [42]:
model.save_pretrained("finetuned-attraction_summarization_t5")
tokenizer.save_pretrained("finetuned-attraction_summarization_t5")

('finetuned-attraction_summarization_t5/tokenizer_config.json',
 'finetuned-attraction_summarization_t5/special_tokens_map.json',
 'finetuned-attraction_summarization_t5/tokenizer.json')

In [45]:
from transformers import pipeline

pipe = pipeline("text2text-generation", model="NoelTiju/finetuned-attraction_summarization_t5")

In [48]:
print(pipe("Bangalore Palace Winit deshpande for Wikimedia Commons Built by Chamaraja Wodeyar in the year 1887 Bangalore Palace is an inspired design by England s Windsor Castle and is one of the best tourist places in Bangalore The evocative palace comprises fortified arches towers architecture and green lawns along with sophisticated wood carvings in the interior It is where the royal family still resides at the present This architectural creation is nothing less than an epitome The palace has earned foundations that have been attributed to the Wodeyars of Mysore Location Vasanth Nagar BengaluruTimings Sunday to Monday from 10 00 AM to 5 00 PMEntry Fee INR 230 for Indians INR 460 for foreigners Must Read New Year Party In Bangalore"))

[{'generated_text': 'The palace has earned foundations that have been attributed to the Wodeyars'}]


