In [1]:
# Run this code inside a virtual environment (python 3.10 for myself) and import all things inside the virtual environment using cmd***
# open cmd as administrator where the virtual env is (D:\MACHINE LEARNING\LLM\Codes\python 3.10 virtual)
# 3.10env\Scripts\activate

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding




In [4]:
dataset_category = "Software" 
meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{dataset_category}", split='full').to_pandas()[['parent_asin', 'title']]
review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_review_{dataset_category}", split='full').to_pandas()[['parent_asin', 'title', 'rating', 'text', 'verified_purchase']]
review_ds = review_ds.rename(columns={"title":"review_headline"})

ds = meta_ds.merge(review_ds, on='parent_asin', how='inner').drop(columns="parent_asin")

ds.head()

Unnamed: 0,title,review_headline,rating,text,verified_purchase
0,Accupressure Guide,Good,4.0,Good details,True
1,Accupressure Guide,It's a starting point,4.0,"Basic, and it's a starting point.",True
2,Accupressure Guide,Five Stars,5.0,Very helpful.,True
3,Ankylosaurus Fights Back - Smithsonian's Prehi...,"Great for informational reading, but limited i...",4.0,ANKYLOSAURUS FIGHTS BACK is part of the SMITHS...,False
4,Ankylosaurus Fights Back - Smithsonian's Prehi...,well worth the download,5.0,I played this app during the holiday to entert...,True


In [5]:
ds = ds.rename(columns={"rating":"star_rating", "title":"product_title", "text":"review_body"})
ds = ds[ds['verified_purchase'] & (ds['review_body'].map(len) > 100)].sample(10000)
ds.head()

Unnamed: 0,product_title,review_headline,star_rating,review_body,verified_purchase
4117101,Temple Run 2,fun for kids and adults,4.0,Like-- it is an easy game anyone of any age ca...,True
1927391,My Alarm Clock,buyer beware! I deleted this app!!,1.0,I used this alarm clock for about a month. I l...,True
2502893,LYNE,I loved this game,5.0,I loved this game! It is not your average puzz...,True
602756,Farkle Addict Mania - Dice Game for Friends an...,Farkle,3.0,Not the best version. It freezes as well as st...,True
3502124,Slots - Lucky Casino - Play Real Vegas Slot Ma...,Fun at the Casino,5.0,I enjoy playing on my own! When I am winning ...,True


In [6]:
dataset = Dataset.from_pandas(ds)

dataset = dataset.class_encode_column("star_rating")

dataset = dataset.train_test_split(test_size = 0.1, seed = 42, stratify_by_column = "star_rating")

Stringifying the column:   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
train_dataset = dataset['train']
test_dataset = dataset['test']

print(train_dataset[73])

{'product_title': "Dr. Panda's Bus Driver: Christmas", 'review_headline': 'Creepily fun!', 'star_rating': 4, 'review_body': 'Items dumb and stupid and hilarious! You can skip people who want to get on and you can honk madly at people sooooo funny', 'verified_purchase': True, '__index_level_0__': 2569751}


In [8]:
# pip install sentencepiece #install inside the virtual env

MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Data Preprocessing, preparing it for the model.

In [9]:
def preprocess_data(examples):
    # examples['prompt'] = [f"review: {example['product_title']}, {example['star_rating']} Stars!" for example in examples]
    # examples['response'] = [f"{example['review_headline']} {example['review_body']}" for example in examples]

    examples['prompt']  = [f"review: {title}, {stars} Stars!" for title, stars in zip(examples['product_title'], examples['star_rating'])]    
    examples['response'] = [f"{headline} {body}" for headline, body in zip(examples['review_headline'], examples['review_body'])]

    inputs = tokenizer(examples['prompt'], padding='max_length', truncation=True, max_length=128)
    targets = tokenizer(examples['response'], padding='max_length', truncation=True, max_length=128)

    # Set -100 at the padding positions of target tokens
    target_input_ids = []
    for ids in targets['input_ids']:
        target_input_ids.append([id if id != tokenizer.pad_token_id else -100 for id in ids])

    inputs.update({'labels': target_input_ids})
    return inputs

In [10]:
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

TRAINING_OUTPUT = "../Models/t5_fine_tuned_reviews"

training_args = TrainingArguments(
    output_dir = TRAINING_OUTPUT,
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    save_strategy = 'epoch'
)

In [12]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    data_collator = data_collator
)

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
