In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import pickle
from rouge import Rouge

In [6]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [7]:
df = pd.read_csv("/kaggle/input/amazon-fine-food-reviews-gpt2/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
df.shape

(568454, 10)

In [10]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)         # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'<.*?>', '', text)
    # Remove non-ASCII characters and special characters
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [11]:
df = df.iloc[:5555,:]

In [12]:
df['Text'] = df['Text'].apply(preprocess_text)
df['Summary'] = df['Summary'].apply(preprocess_text)

In [13]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)
print(f'Training data shape: {train_df.shape}')
print(f'Testing data shape: {test_df.shape}')

Training data shape: (4166, 10)
Testing data shape: (1389, 10)


### Custom Dataset

In [21]:
class ReviewSummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['Text']
        summary = self.data.iloc[index]['Summary']
        inputs = self.tokenizer.encode_plus(
            text, 
            None, 
            add_special_tokens=True, 
            max_length=self.max_length, 
            padding='max_length', 
            return_token_type_ids=True, 
            truncation=True,
            return_attention_mask=True, 
            return_tensors='pt'
        )
        outputs = self.tokenizer.encode_plus(
            summary, 
            None,
            add_special_tokens=True, 
            max_length=self.max_length, 
            padding='max_length', 
            return_token_type_ids=True, 
            truncation=True,
            return_attention_mask=True,  # Include attention mask
            return_tensors='pt'
        )

        return 
        {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': outputs['input_ids'].flatten(),
        }
def collate_fn(batch):
# Filter out None values
    batch = [item for item in batch if item is not None]
    if len(batch) == 0:
        return None
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    padded_inputs = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    return {'input_ids': padded_inputs, 'attention_mask': padded_attention_mask, 'labels': padded_labels}

In [15]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Fine Tune - 1

In [100]:
# # Define batch_size and max_length
# batch_size = 4
# max_length = 128
# optimizer = AdamW(model.parameters(), lr=5e-5)
# epochs = 3



### Fine Tune - 2

In [92]:
# # Fine-tuning parameters
# batch_size = 5
# epochs = 5
# learning_rate = 1e-5
# optimizer = AdamW(model.parameters(), lr=1e-5)
# max_length = 128



### Fine Tune - 3

In [82]:
# # Fine-tuning parameters
# batch_size = 8
# epochs = 3
# learning_rate = 2e-5
# max_length = 128
# optimizer = AdamW(model.parameters(), lr=2e-5)



### Fine Tune - 4

In [107]:
# # Define optimizer and scheduler
# learning_rate = 1e-5
# epochs = 5
# warmup_steps = int(0.1 * len(train_loader) * epochs)
# optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_loader) * epochs)



### Fine Tune - 5

In [22]:
# Fine-tuning parameters
batch_size = 10
epochs = 20
learning_rate = 5e-5
max_length = 128
optimizer = AdamW(model.parameters(), lr=5e-5)

In [23]:
train_dataset = ReviewSummaryDataset(train_df, tokenizer, max_length=max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [24]:
tokenizer.pad_token = tokenizer.eos_token
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [25]:
num_epochs = 10
# Now start the training loop
for epoch in range(num_epochs):
    model.train()
    #total_loss = 0
#     for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
    for batch in train_loader:
        if batch is not None:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            #print(loss)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(loss.item())
            

#     Print average loss for the epoch
    print(f'Epoch {epoch + 1}/{num_epochs}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
test_df.shape

(1389, 10)

In [28]:
# Function to generate summaries
def generate_summary(review_text):
    review_text_with_token = review_text + " [SEP]"
    input_ids = tokenizer.encode(review_text, return_tensors='pt', max_length=1024, truncation=True)
    input_ids = input_ids.to(device)
    max_length = min(len(input_ids[0]) + 40, 1024)  # Set max_length dynamically based on input length
    with torch.no_grad():  # Ensure no gradient computation
        output = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary


# Apply the generate_summary function to the test dataset
test_df['Generated_Summary'] = test_df['Text'].apply(generate_summary)

In [29]:
# Store generated summaries in a list separately
generated_summary = test_df['Generated_Summary'].tolist()
# Print the first few generated summaries to verify
print("Generated Summaries List:", generated_summary[:5])

Generated Summaries List: ['dog eating purina long long time switched newman noticed something dog owner would conscious ofthe waste longer expel foul odorous waste purina seemed regularly since theyve newman waste almost odorless wellit easy pick tell nutritious come pick dog daily like youll appreciate cleaner output of purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina pur', 'need gluten free nondairy egg picking pretty slimbr br use really sharp serrated knife work carefully split successfullybr br great minipizzas br good minipas br good minipas br good minipas br good minipas br good minipas br good minipas br br good minipas br good minip', 'one perfect box rip tear gouge inside box badly dented can chowder looked like fell airplane every single disappointed return chowder good joke rate starfor unblemished box rusted box rusted rusted box rusted box rusted box rusted box rusted box rusted box rust

In [30]:
# Split the generated summaries by the token [SEP]
split_generated_summaries = [summary.split('[SEP]') for summary in generated_summary]

# Extract the summaries from the split list
extracted_summaries = [split_summary[0].strip() for split_summary in split_generated_summaries]

# Add the extracted summaries to a new column in test_df
test_df['Extracted_Summary'] = extracted_summaries

# Print the first few rows of test_df to verify the addition of the new column
print(test_df.head())

        Id   ProductId          UserId  ProfileName  HelpfulnessNumerator  \
4817  4818  B00139TT72  A2QV98MKFH116V          CM9                     1   
4625  4626  B000FFRY3G  A1VYZE8SUYU9B9  Jesse "JMT"                     6   
2802  2803  B000J2DQ46  A27COOQH0XCAB2    Drakonian                     6   
230    231  B003SO503C  A2ODZ3CH8PMYTL      Nanette                     0   
3872  3873  B001M074MY  A3OWR50RRC1GLD        linda                     0   

      HelpfulnessDenominator  Score        Time                  Summary  \
4817                       1      5  1342051200               tell great   
4625                       6      4  1184025600  pretty good considering   
2802                      12      1  1291161600            dropped plane   
230                        0      5  1347235200                 love tea   
3872                       0      5  1306108800  best decaf coffee tried   

                                                   Text  \
4817  dog eating puri

In [32]:
from rouge import Rouge

# Initialize the Rouge metric
rouge = Rouge()

# Function to compute ROUGE scores for each predicted summary against the actual summary
def compute_rouge_scores(row):
    actual_summary = row['Summary']
    generated_summary = row['Generated_Summary']
    if actual_summary and generated_summary:  
        rouge_scores = rouge.get_scores(generated_summary, actual_summary)[0]  
        return rouge_scores
    else:
        return None  

# Apply the compute_rouge_scores function to the test dataset
test_df['ROUGE_scores'] = test_df.apply(compute_rouge_scores, axis=1)

# Drop rows with empty ROUGE scores
test_df.dropna(subset=['ROUGE_scores'], inplace=True)

# Function to print ROUGE scores in the desired format with Precision, Recall, and F1-Score
def print_rouge_scores_with_metrics(rouge_scores):
    print("ROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"{key.upper()}: Precision: {value.get('f')}, Recall: {value.get('p')}, F1-Score: {value.get('r')}")
        
# Display review text, summaries, ROUGE scores, Precision, Recall, and F1-Score for each test sample
for idx, row in test_df.iterrows():
    print(f"Sample {idx + 1}:")
    print("Review Text:", row['Text'])
    print("Given Summary:", row['Summary'])
    print("Generated Summary:", row['Generated_Summary'])
    print(f"ROUGE Scores for Sample {idx + 1}:")
    rouge_scores = row['ROUGE_scores']
    print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(
        rouge_scores['rouge-1']['p'],
        rouge_scores['rouge-1']['r'],
        rouge_scores['rouge-1']['f']
    ))
    print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(
        rouge_scores['rouge-2']['p'],
        rouge_scores['rouge-2']['r'],
        rouge_scores['rouge-2']['f']
    ))
    print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(
        rouge_scores['rouge-l']['p'],
        rouge_scores['rouge-l']['r'],
        rouge_scores['rouge-l']['f']
    ))
    print("\n")

# # Compute the average ROUGE scores
# average_rouge_scores = test_df['ROUGE_scores'].apply(pd.Series).mean()

# # Display the average ROUGE scores
# print(f"Average ROUGE Scores: {average_rouge_scores}")

Sample 4818:
Review Text: dog eating purina long long time switched newman noticed something dog owner would conscious ofthe waste longer expel foul odorous waste purina seemed regularly since theyve newman waste almost odorless wellit easy pick tell nutritious come pick dog daily like youll appreciate cleaner output
Given Summary: tell great
Generated Summary: dog eating purina long long time switched newman noticed something dog owner would conscious ofthe waste longer expel foul odorous waste purina seemed regularly since theyve newman waste almost odorless wellit easy pick tell nutritious come pick dog daily like youll appreciate cleaner output of purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina purina pur
ROUGE Scores for Sample 4818:
ROUGE-1: Precision: 0.03, Recall: 0.50, F1-Score: 0.05
ROUGE-2: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-L: Precision: 0.03, Recall: 0.50, F1-Score: 0.05


Sample

### Fine Tuning | Hyperparameter Search

In [73]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import numpy as np

# Generate a sample dataset (replace with your actual dataset)
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1)

# Define the model and hyperparameter space for grid search
model = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X, y)

# Print the best hyperparameters found
print("Best hyperparameters from grid search:", grid_search.best_params_)

# Define the model and hyperparameter space for random search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, 20, 25, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform random search
random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=5, n_jobs=-1)
random_search.fit(X, y)

# Print the best hyperparameters found
print("Best hyperparameters from random search:", random_search.best_params_)

Best hyperparameters from grid search: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best hyperparameters from random search: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
