In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
!pip install transformers



In [36]:
import random
import re

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer

In [37]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [99]:
reviews = pd.read_csv('/content/drive/MyDrive/Reviews.csv')
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [92]:
reviews.Text.values[:5]

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

In [93]:
reviews.Summary.values[:5]

array(['Good Quality Dog Food', 'Not as Advertised',
       '"Delight" says it all', 'Cough Medicine', 'Great taffy'],
      dtype=object)

In [100]:
# Drop rows that lack either the review text or its summary: both are needed to build
# the "<text> TL;DR <summary>" training string. Rebinding (instead of inplace=True)
# keeps the cell a plain expression of its input and avoids in-place mutation.
reviews = reviews.dropna(subset=['Text', 'Summary'])

In [42]:
#

In [81]:
len(reviews)

568427

In [101]:
max_length = 300

In [102]:
reviews['model_input'] = reviews['Text'] + " TL;DR " + reviews['Summary']
reviews['model_input'].values[:3]

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. TL;DR Good Quality Dog Food',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". TL;DR Not as Advertised',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into sell

In [103]:
max_length = 250

In [104]:
# Train on a fixed-size random subset. random_state makes the subset reproducible
# across kernel restarts, and min() keeps the cell working on smaller frames.
# NOTE: this cell rebinds `reviews` from a DataFrame to a plain list of strings, so
# re-running it (or any DataFrame cell above) requires reloading the CSV first.
reviews = reviews.sample(n=min(20000, len(reviews)), random_state=42)
reviews = reviews.model_input.values.tolist()
len(reviews)

20000

In [48]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# replacement for causal language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")



In [49]:
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

In [50]:
tokenizer.encode(" TL;DR ")

[24811, 26, 7707, 220]

In [51]:
extra_length = len(tokenizer.encode(" TL;DR "))

In [52]:
class ReviewDataset(Dataset):
    """Fixed-length token tensors built from "<review> TL;DR <summary>" strings.

    Each string gets the tokenizer's EOS token appended, is encoded, and is then
    padded with EOS ids or truncated so that every item has exactly
    ``max_len + len(tokenizer.encode(" TL;DR "))`` token ids.

    When a sample is too long, the review text is shortened and the trailing
    separator + summary + EOS are kept, so the sample still shows the model a
    summary. If the string has no separator, or the summary alone does not fit,
    the sequence is cut at the end and closed with a single EOS id.

    Args:
        tokenizer: object exposing ``encode(str) -> list[int]``, ``eos_token`` and
            ``eos_token_id`` (the notebook passes the GPT-2 tokenizer).
        reviews: iterable of strings shaped like ``text + " TL;DR " + summary``.
        max_len: token budget of a sample, not counting the separator tokens.
    """

    # Marker that separates the review text from its summary in every string.
    SEPARATOR = " TL;DR "

    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        # Measured here rather than read from a notebook-level global, so the class
        # works regardless of which cells were executed before it.
        self.extra_length = len(self.tokenizer.encode(self.SEPARATOR))
        self.result = []

        for review in self.reviews:
            # Encode the text together with a closing EOS token.
            tokenized = self.tokenizer.encode(review + self.eos)

            # Bring the sequence to the common length, protecting the summary tail.
            padded = self.pad_truncate(tokenized, keep_tail=self._tail_length(review))

            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)

    def __getitem__(self, item):
        return self.result[item]

    def _tail_length(self, review):
        """Return the token count of separator + summary + EOS, or 0 without a separator."""
        _, separator, summary = review.rpartition(self.SEPARATOR)
        if not separator:
            return 0
        # Counted by encoding the tail on its own; this assumes the tokenizer splits
        # the full string at the same place in front of the separator.
        return len(self.tokenizer.encode(separator + summary + self.eos))

    def pad_truncate(self, name, keep_tail=0):
        """Pad or truncate a token-id list to ``max_len + extra_length`` entries.

        Args:
            name: list of token ids.
            keep_tail: number of trailing tokens that must survive truncation.
                With the default 0 (or when the tail alone would not fit) the list
                is cut at the end and finished with one EOS id.

        Returns:
            A list of exactly ``max_len + extra_length`` token ids. The input list
            itself is returned when it already has that length.
        """
        target = self.max_len + self.extra_length
        if len(name) < target:
            return name + [self.eos_id] * (target - len(name))
        if len(name) > target:
            if 0 < keep_tail < target:
                # Shorten the review text, keep "TL;DR <summary><eos>" untouched.
                return name[:target - keep_tail] + name[-keep_tail:]
            # target - 1 replaces the former hard-coded "+ 3", which only produced
            # equal-length samples when the separator was exactly 4 tokens long.
            return name[:target - 1] + [self.eos_id]
        return name

In [53]:
dataset = ReviewDataset(tokenizer, reviews, max_length)

Token indices sequence length is longer than the specified maximum sequence length for this model (1162 > 1024). Running this sequence through the model will result in indexing errors


In [54]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

In [55]:
def train(model, optimizer, dl, epochs, pad_token_id=None):
    """Fine-tune a language model on batches of token ids.

    Each batch is used both as input and as labels (``model(batch, labels=...)``);
    the first element of the model output is taken as the loss.

    Args:
        model: torch module called as ``model(input_ids, labels=labels)`` whose
            output exposes the loss at index 0.
        optimizer: torch optimizer built over ``model``'s parameters.
        dl: iterable yielding 2-D integer tensors of token ids (batch, sequence).
        epochs: number of passes over ``dl``.
        pad_token_id: optional id used for right-padding. When given, every
            occurrence after the first one in a row is replaced by -100 in the
            labels so padding does not contribute to the loss (-100 is the label
            value Hugging Face LM heads ignore). The first occurrence is kept so
            the model still learns to emit the end-of-sequence token. With the
            default ``None`` labels equal the inputs, as before.

    Returns:
        None. The loss is printed every 100 batches.
    """
    # Follow the model's own device instead of relying on a notebook-level global.
    run_device = next(model.parameters()).device

    # Enable dropout etc. for the duration of training, then restore the previous
    # mode so later inference cells are not silently left in train mode.
    was_training = model.training
    model.train()
    try:
        with torch.enable_grad():
            for epoch in range(epochs):
                for idx, batch in enumerate(dl):
                    optimizer.zero_grad()
                    batch = batch.to(run_device)

                    labels = batch
                    if pad_token_id is not None:
                        is_pad = batch == pad_token_id
                        # The running count exceeds 1 from the second pad id onwards.
                        labels = batch.masked_fill(is_pad & (is_pad.cumsum(dim=1) > 1), -100)

                    output = model(batch, labels=labels)
                    loss = output[0]
                    loss.backward()
                    optimizer.step()
                    if idx % 100 == 0:
                        print("loss: %f, %d" % (loss.item(), idx))
    finally:
        model.train(was_training)

In [105]:
train(model=model, optimizer=optimizer, dl=dataloader, epochs=6)

loss: 2.063668, 0
loss: 1.902992, 100
loss: 1.940823, 200
loss: 1.709489, 300
loss: 1.954444, 0
loss: 1.982386, 100
loss: 1.632235, 200
loss: 1.469066, 300
loss: 1.454039, 0
loss: 1.784222, 100
loss: 1.466187, 200
loss: 1.719608, 300
loss: 1.509658, 0
loss: 1.586798, 100
loss: 1.600494, 200
loss: 1.546392, 300
loss: 1.270890, 0
loss: 1.245149, 100
loss: 1.322057, 200
loss: 1.380549, 300
loss: 1.105842, 0
loss: 1.180933, 100
loss: 1.172334, 200
loss: 0.930210, 300


In [57]:
# import torch

# model_dir = '/content/drive/MyDrive/model_train'
# model.save_pretrained(model_dir)
# tokenizer.save_pretrained(model_dir)


In [106]:
def topk(probs, n=9):
    """Sample one token id from the ``n`` highest-scoring entries of a score vector.

    Args:
        probs: tensor of raw scores (logits); the notebook passes the 1-D logits
            of the last position.
        n: size of the candidate pool.

    Returns:
        The chosen token id as a Python ``int``.
    """
    # Scores -> probabilities over the whole vocabulary.
    distribution = torch.softmax(probs, dim=-1)

    # Keep only the n most likely candidates together with their vocabulary ids.
    top_probs, top_ids = torch.topk(distribution, k=n)

    # Renormalise the pool so it sums to one, then hand it to numpy on the CPU.
    pool = (top_probs / torch.sum(top_probs)).cpu().detach().numpy()

    # Draw a position inside the pool according to the renormalised weights.
    picked = np.random.choice(n, 1, p=pool)

    return int(top_ids[picked][0])

In [107]:
def model_infer(model, tokenizer, review, max_length=20):
    """Continue ``review`` token by token with top-k sampling and decode the result.

    Args:
        model: causal LM whose call result exposes ``.logits``.
        tokenizer: tokenizer providing ``encode``, ``decode`` and ``eos_token_id``.
        review: prompt text; the notebook passes ``"<review> TL;DR "``.
        max_length: maximum number of tokens to generate.

    Returns:
        The decoded prompt followed by the generated tokens. Generation stops at
        the first sampled EOS token, which is never included in the returned text.

    Raises:
        ValueError: if ``review`` encodes to no tokens.
    """
    result = tokenizer.encode(review)
    if not result:
        raise ValueError("review must encode to at least one token")

    # Follow the model's own device instead of relying on a notebook-level global.
    run_device = next(model.parameters()).device

    # GPT-2 configs expose the context size as n_positions; longer inputs cause
    # indexing errors, so only the most recent tokens are fed to the model.
    # NOTE(review): other architectures may name this differently; None disables the guard.
    context_limit = getattr(getattr(model, "config", None), "n_positions", None)

    # Sample in eval mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # One loop for every step, so the first sampled token gets the same
            # EOS check as the rest and at most max_length tokens are produced.
            for _ in range(max_length):
                context = result if context_limit is None else result[-context_limit:]
                input_ids = torch.tensor(context).unsqueeze(0).to(run_device)

                # Logits of the final position decide the next token.
                logits = model(input_ids).logits[0, -1]
                res_id = topk(logits)

                if res_id == tokenizer.eos_token_id:
                    break
                result.append(res_id)
    finally:
        model.train(was_training)

    return tokenizer.decode(result)

In [110]:
sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 10)]
sample_reviews

["My daughter is a singer and loves this tea.  I don't drink it.<br /><br />The supplier (Vitaminlife) had this item on backorder, which happens sometimes, I understand.  But their communication about the status was poor.  I had to email and then telephone, to find out when it might be shipped.",
 "I absolutely love this stew mix.  All I do is add stew beef (uncooked) and water in the crockpot and it tastes like I made it from scratch.  I've even added a can of diced tomatoes and a can of tomato sauce for a little different flavor.  I could eat it every day!",
 'I will have my Agave for a while. I use it daily for green smoothies. So I needed a large amount. I\'m sure 5 gallons will hold me for 3 months. :) Wow! what a savings!<br />I prefer buying through Amazon.com. Good prices, good deals,<br />good products. No long waiting for delivery. Amazon carry name brands I find at the store. So I know what I am buying.<br />Thanks Amazon.com. You are my kind of store. "I\'ll be back". :)',


In [111]:
# Print each sampled review followed by the summary the fine-tuned model generates.
for review in sample_reviews:
    print(review)
    # The model continues the text after the task marker; everything behind the
    # marker in the decoded string is the generated summary.
    generated_text = model_infer(model, tokenizer, review + " TL;DR ")
    summary = generated_text.split(" TL;DR ")[1].strip()
    print(f"Summaries: {summary}\n")

My daughter is a singer and loves this tea.  I don't drink it.<br /><br />The supplier (Vitaminlife) had this item on backorder, which happens sometimes, I understand.  But their communication about the status was poor.  I had to email and then telephone, to find out when it might be shipped.
Summaries: Yummy tea

I absolutely love this stew mix.  All I do is add stew beef (uncooked) and water in the crockpot and it tastes like I made it from scratch.  I've even added a can of diced tomatoes and a can of tomato sauce for a little different flavor.  I could eat it every day!
Summaries: rave rave!

I will have my Agave for a while. I use it daily for green smoothies. So I needed a large amount. I'm sure 5 gallons will hold me for 3 months. :) Wow! what a savings!<br />I prefer buying through Amazon.com. Good prices, good deals,<br />good products. No long waiting for delivery. Amazon carry name brands I find at the store. So I know what I am buying.<br />Thanks Amazon.com. You are my kin

In [None]:
import torch
import os

# Directory that receives the fine-tuned weights.
output_dir = '/kaggle/working/output/'

# Create the output directory if it doesn't exist (exist_ok makes re-runs safe).
os.makedirs(output_dir, exist_ok=True)

# Persist only the weights (state dict). The architecture and the tokenizer are not
# stored here; the reload cell rebuilds both from the "gpt2" checkpoint before
# loading this file.
torch.save(model.state_dict(), os.path.join(output_dir, 'trained_model.pth'))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# Load the model architecture and tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# replacement for causal language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model state dictionary.
model_path = '/kaggle/working/output/trained_model.pth'
# NOTE(review): torch.load unpickles the file, so only load checkpoints written by
# this notebook or another trusted source.
model.load_state_dict(torch.load(model_path, map_location=device))
# Assigning the result keeps the cell from echoing the full module repr as output.
model = model.to(device)

# # Now, you can use your model for inference or further training in your Kaggle notebook

# # Example usage:
# inputs = tokenizer.encode("heloooo", return_tensors="pt").to(device)
# outputs = model.generate(inputs, max_length=50, do_sample=True)
# decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(decoded_output)

In [None]:
!pip install rouge

In [None]:
# Interactive demo: summarise reviews typed by the user and, when a reference summary
# is supplied, score the generated summary against it.
# NOTE: calculate_rouge_scores is defined in a later cell; run that cell first.
while True:
    print("\nEnter your review (type 'exit' to quit):")
    review = input()
    if review.lower() == 'exit':
        break

    # The second input is a reference summary for ROUGE only. It must not be appended
    # to the model input (that would leak the answer into the prompt), and it is not
    # the task marker: " TL;DR " is the only marker used in the training strings.
    print("Enter a reference summary to score against (press Enter to skip):")
    reference_summary = input().strip()

    # The marker is appended exactly once. Appending it twice made split(...)[1]
    # return the empty text between the two markers instead of the summary.
    decoded = model_infer(model, tokenizer, review + " TL;DR ")
    generated_summary = decoded.split(" TL;DR ")[1].strip()
    print("\nGenerated Summary:", generated_summary)

    if reference_summary and generated_summary:
        calculate_rouge_scores(reference_summary, generated_summary)
    else:
        print("ROUGE skipped: it needs both a reference summary and a non-empty generated summary.")

In [None]:
from rouge import Rouge

def calculate_rouge_scores(original_summary, predicted_summary):
    """Print ROUGE-1, ROUGE-2 and ROUGE-L precision/recall/F1 for one summary pair.

    Args:
        original_summary: reference summary text.
        predicted_summary: generated summary text (the hypothesis).

    Returns:
        None. Scores are printed with two decimals; when either text is empty or
        whitespace-only a short notice is printed instead.
    """
    # Empty texts cannot be scored. NOTE(review): the rouge package is understood to
    # raise ValueError for an empty hypothesis or reference, which would abort the
    # calling loop; confirm against the installed version.
    if not original_summary.strip() or not predicted_summary.strip():
        print("ROUGE scores unavailable: the reference or the predicted summary is empty.")
        return

    rouge = Rouge()

    # get_scores takes the hypothesis first and the reference second.
    scores = rouge.get_scores(predicted_summary, original_summary)

    for label, key in (("ROUGE-1", "rouge-1"), ("ROUGE-2", "rouge-2"), ("ROUGE-L", "rouge-l")):
        metric = scores[0][key]
        print("{}: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(
            label, metric['p'], metric['r'], metric['f']))