In [4]:
!pip install datasets



In [None]:
!pip install POT==0.4.0


In [20]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


WORD2VEC mode for evaluation

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Download the pre-trained Word2Vec model (Google News)
word2vec_model = api.load('word2vec-google-news-300')

In [5]:
from datasets import load_dataset

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty  ", trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/327M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [12]:
print("DATASET INFORMATION")

print(dataset["full"])
print(dataset["full"]["text"][1])
print(dataset["full"]["title"][1])

print(dataset["full"]["text"][:2000])

DATASET INFORMATION
Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 701528
})
This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)
Works great but smells a little weird.


In [7]:
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


In [8]:
class AmazonReview(Dataset):
  def __init__(self, data_text, data_title, tokenizer):
    self.data_text = data_text
    self.data_title = data_title
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data_text)

  def __getitem__(self, idx):
    text = self.data_text[idx]
    title = self.data_title[idx]


    return text, title




In [13]:
text_train, text_test, titles_train, titles_test = train_test_split(dataset["full"]["text"][:2000], dataset["full"]["title"][:2000], test_size=0.2, random_state=42)
print(len(text_train))


1600


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [14]:
model_path = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)


train_dataset = AmazonReview(text_train, titles_train, tokenizer)
test_dataset = AmazonReview(text_test, titles_test, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)



In [44]:
from tqdm import tqdm

epochs = 5
for epoch in range(epochs):
  model.train()
  loss_per_epoch = 0
  for batch in tqdm(train_dataloader):
    tokenized_inputs = tokenizer(batch[0], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    tokenized_targets = tokenizer(batch[1], max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    input_ids = tokenized_inputs['input_ids'].to(device)
    attention_mask = tokenized_inputs['attention_mask'].to(device)
    labels = tokenized_targets['input_ids'].to(device)

    optimizer.zero_grad()

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    loss_per_epoch += loss.item()

  print(loss_per_epoch)
model.save_pretrained("your_model_directory")



100%|██████████| 200/200 [06:17<00:00,  1.89s/it]


7.164702257141471


100%|██████████| 200/200 [06:17<00:00,  1.89s/it]


6.662256568670273


100%|██████████| 200/200 [06:17<00:00,  1.89s/it]


6.329521469771862


100%|██████████| 200/200 [06:17<00:00,  1.89s/it]


5.9144219839945436


100%|██████████| 200/200 [06:17<00:00,  1.89s/it]
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


5.59257498383522


In [67]:
#word2vec metrics
from gensim.models import Word2Vec, KeyedVectors
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.corpora import Dictionary
from scipy.spatial.distance import cdist
import gensim.downloader as api
from gensim.models import KeyedVectors


In [78]:
from rouge import Rouge
rouge = Rouge()

model.eval()
wmd_score = []
rouge_scores = []

with torch.no_grad():
  test_loss = 0
  for idx , test_batch in enumerate(test_dataloader):
    tokenized_inputs = tokenizer(test_batch[0], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    tokenized_targets = tokenizer(test_batch[1], max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    input_ids = tokenized_inputs['input_ids'].to(device)
    attention_mask = tokenized_inputs['attention_mask'].to(device)
    labels = tokenized_targets['input_ids'].to(device)

    output = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

    wmd = word2vec_model.wmdistance(test_batch[0][0].split(), generated_summary.split())
    wmd_score.append(wmd)

    scores = rouge.get_scores(test_batch[0][0], generated_summary)
    rouge_scores.append(scores[0]["rouge-1"]['r'])

    print("Example Input")
    print(test_batch[0][0])
    print("Example Generated")
    print(generated_summary)



Example Input
So, naturally, they all wanted these. None have told me how they work but I am presuming okay. If I get something like this, they want it immediately. I am sure that all three have used everyone of these patches and probably want more.
Example Generated
They all want these
Example Input
This spray smells amazing, it does claim to be completely natural so I am wondering if it really is since the smell is very well... processed in my opinion, that being said, it does look completely natural it's completely transparent and once you apply it it doesn't feel as sticky as other products I've used before it really gives the hair a very natural look and for having just applied it a few minutes ago my hair feels extremely soft to the touch which gives me a pretty good idea that this is a good hair spray. The presentation is really pretty and looks cute and clean as well. I love it! I just hope it works well in the long run! The only problem I had (completely unrelated to the spray



Example Input
I bought a pack of these and loved themso much I immediately bought another pack. They are perfect all around and the price is great!
Example Generated
Perfect!
Example Input
great seeds smell very well and arrived quickly.
Example Generated
Five Stars
Example Input
The curved ends on almost all of these is not curved & angled enough to twist & clasp, at all. I cant get them to clasp, even when they are not in my hair. This was a huge waste of money.
Example Generated
Waste of money
Example Input
A lil runny but works well
Example Generated
A lil runny
Example Input
I bought this for my sister who is having a baby.  Her nursery has a black and white theme which kind of makes me feel like I am inside a domino,  but this fits in great.  A full sized refill pack of diaper wipes fits in it easily so there's no guesswork about how many to put inside.  The silicone lid ensures that it is airtight and the wipes don't dry out.  I mainly bought it for the big container,  but I rea



Example Input
I'm so amazed by boar bristle hairbrushes and how incredible they make hair look and feel.  This GAINWELL boar bristle brush is a wonderful, compact brush. It may be small but it works wonders on our family's hair.  It works through tangles without pulling or causing pain while leaving hair soft, shiny and manageable.  The compact size makes it perfect for travel, to keep in a purse, a school/gym locker, briefcase or even a pocket.  If you've never experienced a boar bristle brush, I HIGHLY recommend giving it a try. You can't be the price and the results will be worth it!  If, for some reason you're not satisfied, GAINWELL asks you to contact them for assistance. I can't imagine they receive a ton of calls with issues but its great to know customer service is a  priority!
Example Generated
Impressed!


Evaluating use Word2Vec

In [85]:
import numpy as np
a = (np.asarray(wmd_score))
inf_remove = a[a < 1E308]
print("AVERAGE WMD SCORE")
print(np.average(inf_remove))
print("AVERAGE ROUGE1 SCORE")
print(sum(rouge_scores)/len(rouge_scores))



AVERAGE WMD SCORE
1.1304564371601467
AVERAGE ROUGE1 SCORE
0.4917691197691198
