In [18]:
import torch
import random
import re
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from sklearn.model_selection import train_test_split
from rouge import Rouge
import torch.optim as optim

In [19]:
# Device handle (CPU) that the model and the token batches are moved to in the
# training and generation cells of this notebook.
device_use_cpu = torch.device('cpu')
device_use_cpu

device(type='cpu')

In [20]:
# Read the raw reviews table; the path is relative to the working directory.
df = pd.read_csv('Reviews.csv')

In [21]:
# Discard every row that has a missing value in any column.
df.dropna(inplace=True)

# Build one training string per row: "<review> TL;DR <summary>", lower-casing
# the review and the summary but not the marker.
# The separator is written with its surrounding spaces so that it is the very
# same string (" TL;DR ") that is appended to the prompt at generation time and
# that the separator token count is measured on; without the spaces the model
# is trained on a differently tokenized marker than the one it is prompted with.
df['training'] = df['Text'].str.lower() + ' TL;DR ' + df['Summary'].str.lower()

In [22]:
# Keep only the three columns used below and cap the data at the first 5000 rows.
df = df[['Summary', 'Text', 'training']].iloc[:5000]

In [23]:
# Token budget handed to EncodedDataset, which pads or truncates every encoded
# example based on it.
max_len = 100

In [24]:
# From this cell on, `df` is a plain Python list of training strings rather
# than a DataFrame.
df = df['training'].tolist()

In [25]:
# Load the pretrained "gpt2" tokenizer and the matching GPT2LMHeadModel weights.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [26]:
# Place the model on the chosen device, then build an Adam optimizer over its
# parameters.
model = model.to(device_use_cpu)
optimizer = optim.Adam(
    model.parameters(),
    lr=3e-4,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)

In [27]:
# Inspect how the " TL;DR " separator is tokenized (the cell output lists four
# token ids).
tokenizer.encode(" TL;DR ")

[24811, 26, 7707, 220]

In [28]:
# Hold out 25% of the training strings as a test set; random_state is fixed to
# 42 for the split.
train_reviews, test_reviews = train_test_split(df, test_size=0.25, random_state=42)

In [29]:
# Number of token ids the " TL;DR " separator encodes to; serves as a length
# allowance when examples are padded or truncated.
extra_len = len(tokenizer.encode(" TL;DR "))

In [30]:
class EncodedDataset(Dataset):
    """Fixed-length token-id tensors built from a list of training strings.

    Every string in ``rev`` gets the tokenizer's EOS token appended, is encoded
    with ``tokenizer.encode`` and is then padded (with the EOS id) or truncated
    so that every item has exactly ``max_len + extra_len`` token ids. Equal
    lengths are what allow a ``DataLoader`` to stack items into a batch.

    Args:
        tokenizer: object exposing ``encode(str) -> list[int]``, ``eos_token``
            and ``eos_token_id``.
        rev: iterable of training strings.
        max_len: token budget for the review part of an example.
        extra_len: additional token allowance on top of ``max_len``. When left
            as ``None`` it is measured as the number of tokens ``SEPARATOR``
            encodes to, so the class no longer depends on a notebook-level
            global being defined before it is used.
    """

    # Marker placed between review and summary; its token count is the default
    # allowance added to ``max_len``.
    SEPARATOR = " TL;DR "

    def __init__(self, tokenizer, rev, max_len, extra_len=None):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.rev = rev

        if extra_len is None:
            extra_len = len(self.tokenizer.encode(self.SEPARATOR))
        self.extra_len = extra_len

        # Encode "<string><EOS>", normalise the length, store as a tensor.
        self.result = [
            torch.tensor(self.pad_truncate(self.tokenizer.encode(review + self.eos)))
            for review in self.rev
        ]

    def __len__(self):
        """Number of encoded examples."""
        return len(self.result)

    def __getitem__(self, item):
        """Return the token-id tensor of example ``item``."""
        return self.result[item]

    def pad_truncate(self, name):
        """Return ``name`` (a list of token ids) at exactly the target length.

        The target is ``max_len + extra_len``. Shorter sequences are padded on
        the right with the EOS id; longer ones are cut so that the last kept
        position is an EOS id. Deriving the cut point from ``extra_len``
        (instead of a literal 3) keeps truncated and padded sequences the same
        length for any separator size.

        NOTE(review): truncation keeps the head of the sequence, so for inputs
        whose summary sits at the end of a long string the summary is cut off.
        Confirm that this is intended.
        """
        target_len = self.max_len + self.extra_len
        current_len = len(name)
        if current_len < target_len:
            return name + [self.eos_id] * (target_len - current_len)
        if current_len > target_len:
            return name[:target_len - 1] + [self.eos_id]
        return name

In [31]:
# Encode the training split into equal-length token-id tensors.
train_dataset = EncodedDataset(tokenizer, train_reviews, max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1203 > 1024). Running this sequence through the model will result in indexing errors


In [32]:
# Shuffled mini-batches of 64 examples; drop_last=True discards a final
# incomplete batch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

In [33]:
def train_save_modal(model, optimizer, dl, epochs, save_path='trained_model.pth'):
    """Fine-tune ``model`` on the batches of ``dl`` and save its state dict.

    Each batch is used as both input and labels (``model(batch, labels=batch)``)
    and the first element of the model output is taken as the loss. The loss is
    printed every 100th batch of every epoch.

    Args:
        model: module whose call returns the loss as ``output[0]``.
        optimizer: optimizer already bound to ``model``'s parameters.
        dl: iterable of token-id tensors (e.g. a ``DataLoader``).
        epochs: number of passes over ``dl``.
        save_path: file the final ``state_dict`` is written to.
    """
    # Hugging Face `from_pretrained` hands the model back in eval mode, which
    # disables dropout; switch to train mode for the duration of the fit and
    # put the previous mode back afterwards so later inference is unaffected.
    was_training = model.training
    model.train()

    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device

    try:
        for epoch in range(epochs):
            for idx, batch in enumerate(dl):
                # Gradients are forced on even if the caller is inside no_grad().
                with torch.enable_grad():
                    optimizer.zero_grad()
                    batch = batch.to(device)
                    output = model(batch, labels=batch)
                    loss = output[0]
                    loss.backward()
                    optimizer.step()
                if idx % 100 == 0:
                    print("loss: %f, %d" % (loss.item(), idx))
    finally:
        model.train(was_training)

    torch.save(model.state_dict(), save_path)

In [None]:
# Set the PYTORCH_CUDA_ALLOC_CONF environment variable for this kernel.
# NOTE(review): the model above is placed on the CPU device, so this setting
# presumably only matters for the CUDA path used later; confirm it is needed.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [34]:
# Fine-tune for 10 epochs on the training batches; train_save_modal also writes
# the resulting weights to disk.
train_save_modal(model=model, optimizer=optimizer, dl=train_dataloader, epochs=10)

loss: 6.585080, 0


In [None]:
def select_top_tokens(probabilities, num_tokens=9):
    """Sample one token id from the ``num_tokens`` highest-scoring candidates.

    ``probabilities`` holds raw scores over the vocabulary (last dimension).
    They are softmaxed, the top ``num_tokens`` entries are kept and
    renormalised, and one of them is drawn with NumPy's global random state.

    Returns:
        The chosen vocabulary index as a Python ``int``.
    """
    top = torch.topk(torch.softmax(probabilities, dim=-1), k=num_tokens)

    # Renormalise the shortlisted probabilities so they sum to one, then hand
    # them to NumPy (which needs a CPU array).
    weights = (top.values / torch.sum(top.values)).cpu().detach().numpy()

    # Draw a position inside the shortlist, then map it back to a vocab id.
    drawn = np.random.choice(num_tokens, 1, p=weights)
    return int(top.indices[int(drawn[0])])

In [None]:
def generate_text(model, tokenizer, initial_text, max_length=15):
    """Extend ``initial_text`` by up to ``max_length`` sampled tokens.

    At each step the model is run on the current token sequence, the scores of
    the last position are passed to ``select_top_tokens`` and the sampled token
    is appended. Generation stops early when the EOS token is sampled (the EOS
    token itself is not appended).

    Args:
        model: causal LM whose output exposes ``.logits``.
        tokenizer: tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        initial_text: prompt to continue.
        max_length: maximum number of new tokens.

    Returns:
        The decoded prompt followed by the generated continuation.
    """
    # Copy so the list returned by the tokenizer is never mutated.
    generated_tokens = list(tokenizer.encode(initial_text))

    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device

    # GPT-2 configs expose the number of positions the model can attend to as
    # `n_positions`; feeding more tokens than that causes indexing errors, so
    # only the most recent window (which contains the end of the prompt) is
    # passed to the model. Models without that attribute are fed everything.
    context_limit = getattr(getattr(model, "config", None), "n_positions", None)

    # Sample with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_length):
                window = generated_tokens[-context_limit:] if context_limit else generated_tokens
                input_ids = torch.tensor([window]).to(device)

                # Scores for the token following the last position.
                logits = model(input_ids).logits[0, -1]
                next_token_id = select_top_tokens(logits)

                if next_token_id == tokenizer.eos_token_id:
                    break
                generated_tokens.append(next_token_id)
    finally:
        model.train(was_training)

    return tokenizer.decode(generated_tokens)

In [None]:
def calculate_rouge_scores(generated_text, original_text):
    """Score ``generated_text`` against ``original_text`` with ROUGE.

    Returns:
        A 9-tuple ordered as precision, recall, F1 for ROUGE-1, then the same
        three values for ROUGE-2, then for ROUGE-L.
    """
    # get_scores returns one result dict per scored pair; a single pair is
    # passed here, so only entry 0 is read.
    pair_scores = Rouge().get_scores(generated_text, original_text)[0]

    return tuple(
        pair_scores[metric][stat]
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        for stat in ('p', 'r', 'f')
    )


In [None]:
for review in test_reviews:
    # Every held-out string is "<review text>TL;DR<reference summary>". Review
    # and summary were lower-cased when the string was built, so the upper-case
    # marker can only be the separator. Split there so the prompt does not
    # already contain the answer.
    review_text, _, reference_summary = review.rpartition("TL;DR")
    review_text = review_text.strip()
    reference_summary = reference_summary.strip()
    print("Original Review: ", review_text)

    # generate_text returns the prompt followed by the continuation; the
    # summary is whatever follows the prompt's separator.
    generated = generate_text(model, tokenizer, review_text + " TL;DR ")
    summary = generated.partition(" TL;DR ")[2].strip()

    # Print the summary
    print("Generated Summary: ", summary)

    # ROUGE cannot score an empty hypothesis or reference; try the next review.
    if not summary or not reference_summary:
        print("Empty summary, skipping ROUGE for this review.")
        continue

    # Score the generated summary against the reference summary.
    (rouge1_p, rouge1_r, rouge1_f,
     rouge2_p, rouge2_r, rouge2_f,
     rougeL_p, rougeL_r, rougeL_f) = calculate_rouge_scores(summary, reference_summary)

    # Print ROUGE scores
    print("ROUGE-1:")
    print("Precision:", rouge1_p)
    print("Recall:", rouge1_r)
    print("F1 Score:", rouge1_f)

    print("\nROUGE-2:")
    print("Precision:", rouge2_p)
    print("Recall:", rouge2_r)
    print("F1 Score:", rouge2_f)

    print("\nROUGE-L:")
    print("Precision:", rougeL_p)
    print("Recall:", rougeL_r)
    print("F1 Score:", rougeL_f)

    # Only one scored review is shown.
    break

In [None]:
from torch.utils.data import DataLoader, random_split

def train_with_evaluation(net, tknzr, train_set, lr, b_size, num_epochs, seed=42):
    """Train ``net`` on 80% of ``train_set`` and return the loss on the other 20%.

    The strings in ``train_set`` are encoded with ``EncodedDataset`` (using the
    notebook-level ``max_len``), split 80/20, and ``net`` is optimised with
    AdamW on the larger part. ``net`` is moved to CUDA when available and is
    left in eval mode on return.

    Args:
        net: causal LM whose output exposes ``.loss``.
        tknzr: tokenizer passed to ``EncodedDataset``.
        train_set: list of training strings.
        lr: AdamW learning rate.
        b_size: batch size for both loaders.
        num_epochs: number of passes over the training part.
        seed: seed of the train/validation split, so that repeated calls (e.g.
            a hyper-parameter search) are all validated on the same examples.

    Returns:
        Mean validation loss per example as a ``float``.

    Raises:
        ValueError: if ``train_set`` yields no examples.
    """
    # Split data into training and validation
    dataset = EncodedDataset(tknzr, train_set, max_len)
    if len(dataset) == 0:
        raise ValueError("train_set is empty; nothing to train or validate on")
    train_sz = int(0.8 * len(dataset))
    val_sz = len(dataset) - train_sz
    split_rng = torch.Generator().manual_seed(seed)
    train_part, val_part = random_split(dataset, [train_sz, val_sz], generator=split_rng)

    train_loader = DataLoader(train_part, batch_size=b_size, shuffle=True)
    val_loader = DataLoader(val_part, batch_size=b_size, shuffle=False)

    # Setup model, optimizer, and move model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)
    optimizer = torch.optim.AdamW(net.parameters(), lr=lr)

    # Training loop
    net.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            loss = net(input_ids=batch, labels=batch).loss
            loss.backward()
            optimizer.step()

    # Validation loop. Each batch loss is a mean over that batch, so it is
    # weighted by the batch size; otherwise a short final batch would count as
    # much as a full one and results for different b_size would not compare.
    net.eval()
    total_val_loss = 0.0
    total_val_examples = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            loss = net(input_ids=batch, labels=batch).loss
            total_val_loss += loss.item() * batch.size(0)
            total_val_examples += batch.size(0)

    return total_val_loss / total_val_examples



In [None]:
# Hyper-parameter grid: every combination of the three lists is tried once.
learning_rates = [3e-4, 1e-4]
batch_sizes = [8, 16]
num_epochs = [5, 7]

best_loss = float('inf')
best_hyperparams = {}

for lr in learning_rates:
    for bs in batch_sizes:
        for epochs in num_epochs:
            # Start every trial from the pretrained weights. Reusing one model
            # object would let each trial continue from the previous trial's
            # fine-tuned weights, so the validation losses would not be
            # comparable between settings.
            trial_model = GPT2LMHeadModel.from_pretrained("gpt2")
            average_val_loss = train_with_evaluation(trial_model, tokenizer, train_reviews, lr, bs, epochs)
            print(f"Validation Loss for LR={lr}, BS={bs}, Epochs={epochs}: {average_val_loss}")
            if average_val_loss < best_loss:
                best_loss = average_val_loss
                best_hyperparams = {'learning_rate': lr, 'batch_size': bs, 'num_epochs': epochs}
            # Release the trial's weights before the next model is loaded.
            del trial_model

print("Best Hyperparameters:", best_hyperparams)
