# Installing Necessary Libraries

In [1]:
!pip install transformers[torch]
!pip install transformers[sentencepiece]
!pip install sentencepiece
!pip install datasets

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.1 MB/s

# Importing Libraries

In [2]:
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding

In [17]:
dataset_name = "Sharathhebbar24/app_reviews_modded"
# model_name = "t5-base"
model_name = "t5-small"

# Model and tokenizer

In [18]:
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [5]:
train_dataset = load_dataset(dataset_name, split='train')
test_dataset = load_dataset(dataset_name, split='test')

Downloading readme:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/15.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/259258 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/28807 [00:00<?, ? examples/s]

In [6]:
train_dataset.remove_columns(['date'])
test_dataset.remove_columns(['date'])

Dataset({
    features: ['package_name', 'review', 'star', 'products'],
    num_rows: 28807
})

# Pre-process

In [7]:
def add_prompt(examples):
  # examples['prompt'] = [f"review: {example['products']}, {example['star']} Stars!" for example in examples]
  examples['prompt'] = f"review: {examples['products']}, {examples['star']} Stars!"
  return examples



In [8]:
train_dataset = train_dataset.map(add_prompt)
test_dataset = test_dataset.map(add_prompt)

Map:   0%|          | 0/259258 [00:00<?, ? examples/s]

Map:   0%|          | 0/28807 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['package_name', 'review', 'date', 'star', 'products', 'prompt'],
    num_rows: 259258
})

In [10]:
def preprocess_data(examples):
    inputs = tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['review'], padding="max_length", truncation=True, max_length=128)
    target_input_ids = []
    for ids in targets['input_ids']:
      target_input_ids.append([id if id != tokenizer.pad_token_id else -100 for id in ids])
    inputs.update({'labels': target_input_ids})
    return inputs

In [11]:
train_dataset = train_dataset.map(
      preprocess_data,
      batched=True
    )

test_dataset = test_dataset.map(
      preprocess_data,
      batched=True
    )

Map:   0%|          | 0/259258 [00:00<?, ? examples/s]

Map:   0%|          | 0/28807 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
TRAINING_OUTPUT = "./models/t5_reviews_tuned"
batch_size = 12
epochs = 1
training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [25]:
trainer.train()

Step,Training Loss
500,4.5073
1000,4.4414
1500,4.4134
2000,4.3653
2500,4.3432
3000,4.337
3500,4.3214
4000,4.2714
4500,4.247
5000,4.2639


TrainOutput(global_step=21605, training_loss=4.217611055168888, metrics={'train_runtime': 4633.4209, 'train_samples_per_second': 55.954, 'train_steps_per_second': 4.663, 'total_flos': 8773329267720192.0, 'train_loss': 4.217611055168888, 'epoch': 1.0})

In [26]:
trainer.save_model(TRAINING_OUTPUT)

In [27]:
MODEL_PATH = "Sharathhebbar24/t5_reviews_tuned"
model.push_to_hub(
    MODEL_PATH,
    token="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"
)


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/t5_reviews_tuned/commit/1bcfbe7071e9ab67323bc75ca02316ad03f8c1ab', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='1bcfbe7071e9ab67323bc75ca02316ad03f8c1ab', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
mod = "Sharathhebbar24/t5_reviews_tuned"
mod1 = T5ForConditionalGeneration.from_pretrained(mod)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [33]:
def generate_review(text):
  inputs = tokenizer(
        "review: " + text,
        return_tensors="pt",
        max_lenght=512,
        padding='max_length',
        truncation=True,
      )
  outputs = mod1.generate(
        inputs['input_ids'],
        max_length=128,
        no_repeat_ngram_size=3,
        num_beams=6,
        early_stopping=True
      )
  summary = tokenizer.decode(
      outputs[0],
      skip_special_tokens=True,
  )
  return summary

In [34]:
random_products = test_dataset.shuffle(42).select(range(10))['products']

print(generate_review(random_products[0] + ", 3 Stars!"))
print(generate_review(random_products[1] + ", 5 Stars!"))
print(generate_review(random_products[2] + ", 2 Stars!"))

Keyword arguments {'max_lenght': 512} not recognized.
Keyword arguments {'max_lenght': 512} not recognized.


It's good


Keyword arguments {'max_lenght': 512} not recognized.


I love it
I like it
