In [18]:

import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/reviews/preprocessed_reviews.csv
/kaggle/input/modelpath/fine_tuned_model.pth
/kaggle/input/modelxx/modelxx.pth
/kaggle/input/models/gpt2_024.pth
/kaggle/input/models/fine_tuned_newmodel.pth


In [24]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class Custom_class(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        review_text = self.data['Text'].iloc[index]
        summary_text = self.data['Summary'].iloc[index]
        
        review_tokens = self.tokenize_and_pad(review_text)
        summary_tokens = self.tokenize_and_pad(summary_text)
        
        return {'input_ids': torch.tensor(review_tokens), 'labels': torch.tensor(summary_tokens)}
    
    def tokenize_and_pad(self, text):
        tokens = self.tokenizer.encode(text, max_length=self.max_length, truncation=True)
        padded_tokens = tokens[:self.max_length] + [0] * (self.max_length - len(tokens))
        return padded_tokens


df = pd.read_csv("/kaggle/input/reviews/preprocessed_reviews.csv")  
train_df = df.sample(n=1000, random_state=42)  

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

train_dataset = Custom_class(train_df, tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

model = GPT2LMHeadModel.from_pretrained("gpt2")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.9)

num_epochs = 5
epoch = 0
while epoch < num_epochs:
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Avg Loss: {avg_loss:.4f}")
    
    epoch += 1

torch.save(model.state_dict(), "./fine_tuned_newmodel.pth")

Epoch 1/5: 100%|██████████| 500/500 [02:41<00:00,  3.09it/s]


Avg Loss: 0.0845


Epoch 2/5: 100%|██████████| 500/500 [02:41<00:00,  3.10it/s]


Avg Loss: 0.0621


Epoch 3/5: 100%|██████████| 500/500 [02:41<00:00,  3.09it/s]


Avg Loss: 0.0592


Epoch 4/5: 100%|██████████| 500/500 [02:41<00:00,  3.10it/s]


Avg Loss: 0.0575


Epoch 5/5: 100%|██████████| 500/500 [02:41<00:00,  3.09it/s]


Avg Loss: 0.0546


In [17]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        review_text = self.data['Text'].iloc[index]
        summary_text = self.data['Summary'].iloc[index]
        
        review_tokens = self.tokenize_and_pad(review_text)
        summary_tokens = self.tokenize_and_pad(summary_text)
        
        return {'input_ids': torch.tensor(review_tokens), 'labels': torch.tensor(summary_tokens)}
    
    def tokenize_and_pad(self, text):
        tokens = self.tokenizer.encode(text, max_length=self.max_length, truncation=True)
        padded_tokens = tokens[:self.max_length] + [0] * (self.max_length - len(tokens))
        return padded_tokens

df = pd.read_csv("/kaggle/input/reviews/preprocessed_reviews.csv")  
train_df = df.sample(n=2000, random_state=42)  

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

train_dataset = CustomDataset(train_df, tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

#Hyperparameters

num_epochs = 5
optimizer = AdamW(model.parameters(), lr = 1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(train_loader) * num_epochs)

for epoch in range(num_epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Avg Loss: {avg_loss:.4f}")

torch.save(model.state_dict(), "./modelxx.pth")


Epoch 1/5: 100%|██████████| 1000/1000 [05:32<00:00,  3.00it/s]


Avg Loss: 0.1079


Epoch 2/5: 100%|██████████| 1000/1000 [05:32<00:00,  3.01it/s]


Avg Loss: 0.0631


Epoch 3/5: 100%|██████████| 1000/1000 [05:32<00:00,  3.01it/s]


Avg Loss: 0.0612


Epoch 4/5: 100%|██████████| 1000/1000 [05:32<00:00,  3.01it/s]


Avg Loss: 0.0597


Epoch 5/5: 100%|██████████| 1000/1000 [05:32<00:00,  3.01it/s]


Avg Loss: 0.0581


In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [32]:
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import T5Tokenizer, GPT2Tokenizer
from transformers import T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
model_name = "t5-base"
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("gpt2")
modul = T5ForConditionalGeneration.from_pretrained(model_name)
tokanizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_pretrained("gpt2")

file_path = '/kaggle/input/reviews/preprocessed_reviews.csv'
df = pd.read_csv(file_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Inference/Evaluation

In [34]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from rouge_score import rouge_scorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
custom_config = GPT2Config.from_pretrained("gpt2", vocab_size=50257)

model = GPT2LMHeadModel(custom_config)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

saved_dict_path = "/kaggle/input/modelxx/modelxx.pth"
state_dict = torch.load(saved_dict_path)
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}

model.load_state_dict(state_dict)

def generate_summary(review_text, max_length=512):
    inputs = tokenizer(review_text, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    outputs = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokanizer.decode(outputs[0], skip_special_tokens=True)
    summary = summary.replace('!', '')
    return summary

review_text = input("Enter the review text: ")

summary = generate_summary(review_text)
print("Generated Summary:", summary)


Enter the review text:  i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith


Generated Summary: i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate do not give you a chance to say no so i end up try the apple and cinn find it very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to


In [5]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=16b42c47e234380b33fcc879bc12e4f1301e8a4848dcda24e25ed09e6094c46c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from rouge_score import rouge_scorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
custom_config = GPT2Config.from_pretrained("gpt2", vocab_size=50257)

model = GPT2LMHeadModel(custom_config)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

saved_dict_path = "/kaggle/input/models/fine_tuned_newmodel.pth"
state_dict = torch.load(saved_dict_path)
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}

model.load_state_dict(state_dict)

def generate_summary(review_text, max_length=512):
    inputs = tokenizer(review_text, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    outputs = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summary = summary.replace('!', '')
    return summary

def compute_rouge_scores(actual_summary, predicted_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(actual_summary, predicted_summary)
    return scores

review_text = input("Enter the review text: ")
actual_summary = input("Enter the actual summary text: ")

# review_text = "i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith"
# actual_summary = "good way to start the day"

predicted_summary = generate_summary(review_text)

scores = compute_rouge_scores(actual_summary, predicted_summary)

print("Generated Summary:", predicted_summary)

print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge1'].precision, scores['rouge1'].recall, scores['rouge1'].fmeasure))
print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge2'].precision, scores['rouge2'].recall, scores['rouge2'].fmeasure))
print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rougeL'].precision, scores['rougeL'].recall, scores['rougeL'].fmeasure))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith
ROUGE-1: Precision: 0.04, Recall: 0.67, F1-Score: 0.08
ROUGE-2: Precision: 0.01, Recall: 0.20, F1-Score: 0.02
ROUGE-L: Precision: 0.04, Recall: 0.67, F1-Score: 0.08


In [30]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from rouge_score import rouge_scorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
custom_config = GPT2Config.from_pretrained("gpt2", vocab_size=50257)

model = GPT2LMHeadModel(custom_config)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

saved_dict_path = "/kaggle/input/modelxx/modelxx.pth"
state_dict = torch.load(saved_dict_path)
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}

model.load_state_dict(state_dict)

def generate_summary(review_text, max_length=512):
    inputs = tokenizer(review_text, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    outputs = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summary = summary.replace('!', '')
    return summary

def compute_rouge_scores(actual_summary, predicted_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    scores = scorer.score(actual_summary, predicted_summary)
    return scores

review_text = input("Enter the review text: ")
actual_summary = input("Enter the actual summary text: ")

# review_text = "i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith"
# actual_summary = "good way to start the day"

predicted_summary = generate_summary(review_text)

scores = compute_rouge_scores(actual_summary, predicted_summary)

print("Generated Summary:", predicted_summary)
print()
print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge1'].precision, scores['rouge1'].recall, scores['rouge1'].fmeasure))
print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge2'].precision, scores['rouge2'].recall, scores['rouge2'].fmeasure))
print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rougeL'].precision, scores['rougeL'].recall, scores['rougeL'].fmeasure))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith

ROUGE-1: Precision: 0.04, Recall: 0.67, F1-Score: 0.08
ROUGE-2: Precision: 0.01, Recall: 0.20, F1-Score: 0.02
ROUGE-L: Precision: 0.04, Recall: 0.67, F1-Score: 0.08


In [27]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from rouge_score import rouge_scorer

t5_model_name = "t5-base"
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

def generate_summary_t5(review_text, max_length=512):
    inputs = t5_tokenizer("summarize: " + review_text, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    outputs = t5_model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_length, num_beams=4, early_stopping=True)
    summary = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

def compute_rouge_scores_t5(actual_summary, predicted_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(actual_summary, predicted_summary)
    return scores

review_text = input("Enter the review text: ")
actual_summary = input("Enter the actual summary text: ")

# review_text = "i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith"
# actual_summary = "good way to start the day"

# rev2= "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound,and feels comfortable to play. However, some users have reported issues with thetuning stability."
# act2 = "Good for beginners but has tuning stability issues."

predicted_summary = generate_summary_t5(review_text)
# predicted_summary = generate_summary_t5(rev2)


scores_t5 = compute_rouge_scores_t5(actual_summary, predicted_summary)
# scores_t5 = compute_rouge_scores_t5(act2, predicted_summary)


print("Generated Summary:", predicted_summary)

print("ROUGE-1 : Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores_t5['rouge1'].precision, scores_t5['rouge1'].recall, scores_t5['rouge1'].fmeasure))
print("ROUGE-2 : Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores_t5['rouge2'].precision, scores_t5['rouge2'].recall, scores_t5['rouge2'].fmeasure))
print("ROUGE-L : Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores_t5['rougeL'].precision, scores_t5['rougeL'].recall, scores_t5['rougeL'].fmeasure))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: summarize: i be visit my friend nate the other morning for coffee he come out of his storage room with a packet of mccanns instant irish oatmeal he suggest that i try it for my own use in my stash sometimes nate dose not give you a chance to say no so i end up try the apple and cinn find it to be very tastefull when make with water or powder milk it go good with oj and coffee and a slice of toast and your ready to take on the worldor the day at least jerry reith
ROUGE-1 : Precision: 0.04, Recall: 0.67, F1-Score: 0.08
ROUGE-2 : Precision: 0.01, Recall: 0.20, F1-Score: 0.02
ROUGE-L : Precision: 0.04, Recall: 0.67, F1-Score: 0.08
