In [1]:
# Install the Transformers library using pip
!pip install transformers



In [2]:
# Importing required libraries
import re  # Regular expression library for text processing
import random  # Random number generation for shuffling data
import pandas as pd  # Pandas library for handling data frames
import numpy as np  # NumPy library for numerical computations
import torch  # PyTorch library for deep learning
from torch.utils.data import Dataset, DataLoader  # Data loading utilities from PyTorch
from transformers import AutoTokenizer, AutoModelWithLMHead  # Hugging Face Transformers library for pretrained models
import torch.optim as optim  # PyTorch library for optimization algorithms


In [3]:
# Check if CUDA GPU is available, otherwise use CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Print the device being used (CUDA GPU or CPU)
device


device(type='cuda', index=0)

In [4]:
import csv

# Path to the CSV file containing reviews
reviews_path = "/kaggle/input/ir-a4-dataset/Reviews.csv"

# List to store reviews as strings
reviews = []

# Open the CSV file
with open(reviews_path, "r", encoding="utf-8") as reviews_raw:
    # Create a CSV reader object
    csv_reader = csv.DictReader(reviews_raw)
    # Iterate over each row in the CSV file
    for row in csv_reader:
        # Extract summary and text from the current row
        summary = row["Summary"]
        text = row["Text"]
        # Replace " = " with " TL;DR " in summary and text
        summary = summary.replace(" = ", " TL;DR ")
        text = text.replace(" = ", " TL;DR ")
        # Combine summary and text into one string, separated by " = "
        review = f"{text.strip()} = {summary.strip()}\n"
        # Append the combined review to the list of reviews
        reviews.append(review)


In [7]:
# Display the first 5 reviews in the list
reviews[:5]


['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. = Good Quality Dog Food\n',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". = Not as Advertised\n',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother an

In [8]:
# Calculate the number of reviews in the dataset
len(reviews)


568454

In [9]:
# Display the 11th review in the dataset
reviews[10]


"I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service! = The Best Hot Sauce in the World\n"

In [10]:
# Calculate the average length of reviews in terms of number of words
avg_length = sum([len(review.split()) for review in reviews]) / len(reviews)
avg_length


85.37717211946789

In [13]:
# Define the maximum length of the reviews
max_length = 100


In [14]:
# Initialize the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Initialize the GPT-2 model
model = AutoModelWithLMHead.from_pretrained("gpt2")




In [7]:
# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)


In [8]:
# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=3e-4)


In [9]:
# Encode the special token "TL;DR" using the tokenizer
encoded_token = tokenizer.encode(" TL;DR ")
encoded_token


[24811, 26, 7707, 220]

In [10]:
# Calculate the length of the special token "TL;DR"
extra_length = len(tokenizer.encode(" TL;DR ")) 
extra_length


4

In [11]:
class ReviewDataset(Dataset):  
    def __init__(self, tokenizer, reviews, max_len):
        """
        Initialize the ReviewDataset class.

        Args:
            tokenizer (transformers.tokenization_utils_base.PreTrainedTokenizer): Tokenizer object.
            reviews (list): List of reviews as strings.
            max_len (int): Maximum length of the input sequence.
        """
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        # Process each review in the dataset
        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos)
            
            # Padding/truncating the encoded sequence to max_len 
            padded = self.pad_truncate(tokenized)            

            # Creating a tensor and adding to the result
            self.result.append(torch.tensor(padded))

    def __len__(self):
        """
        Get the total number of reviews in the dataset.

        Returns:
            int: Total number of reviews.
        """
        return len(self.result)

    def __getitem__(self, item):
        """
        Get a specific review item from the dataset.

        Args:
            item (int): Index of the review item.

        Returns:
            torch.Tensor: Encoded review tensor.
        """
        return self.result[item]

    def pad_truncate(self, name):
        """
        Pad or truncate the encoded review sequence to match the max_len.

        Args:
            name (list): Encoded review sequence.

        Returns:
            list: Padded or truncated review sequence.
        """
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3] + [self.eos_id] 
        else:
            result = name
        return result


In [18]:
# Create an instance of the ReviewDataset class
# Parameters:
#   tokenizer: The tokenizer object used for tokenization
#   reviews: List of review strings
#   max_length: Maximum length of tokenized sequences
dataset = ReviewDataset(tokenizer, reviews, max_length)


# DATALOADER

In [19]:
from torch.utils.data import DataLoader

# Create a DataLoader instance for batching and shuffling the dataset
# Parameters:
#   dataset: The dataset object to load batches from
#   batch_size: The number of samples per batch
#   shuffle: Whether to shuffle the dataset before each epoch
#   drop_last: Whether to drop the last incomplete batch if its size is less than the specified batch size
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)


In [20]:
def train(model, optimizer, dl, epochs):    
    # Iterate over the specified number of epochs
    for epoch in range(epochs):
        # Iterate over each batch in the data loader
        for idx, batch in enumerate(dl):
            # Enable gradient calculation for the model parameters
            with torch.set_grad_enabled(True):
                # Clear the gradients from the previous iteration
                optimizer.zero_grad()
                # Move the batch to the appropriate device (GPU or CPU)
                batch = batch.to(device)
                # Forward pass: compute the model's predictions and loss
                output = model(batch, labels=batch)
                loss = output[0]
                # Backward pass: compute gradients of the loss with respect to model parameters
                loss.backward()
                # Update model parameters using the gradients and optimizer
                optimizer.step()
                # Print the loss at specified intervals
                if idx % 50 == 0:
                    print("loss: %f, %d"%(loss, idx))


In [76]:
train(model=model, optimizer=optimizer, dl=dataloader, epochs=1)


loss: 7.179998, 0
loss: 2.546208, 50
loss: 2.173442, 100
loss: 2.687057, 150
loss: 2.299516, 200
loss: 2.323481, 250
loss: 2.308963, 300
loss: 2.470415, 350
loss: 2.384151, 400
loss: 2.411604, 450
loss: 2.191579, 500
loss: 2.710379, 550
loss: 2.356698, 600
loss: 2.104545, 650
loss: 2.092360, 700
loss: 2.215420, 750
loss: 2.406816, 800
loss: 2.368682, 850
loss: 2.637617, 900
loss: 2.189571, 950
loss: 2.308575, 1000
loss: 2.238402, 1050
loss: 2.063868, 1100
loss: 2.344211, 1150
loss: 2.464975, 1200
loss: 2.153612, 1250
loss: 2.138024, 1300
loss: 2.524029, 1350
loss: 2.249254, 1400
loss: 2.285464, 1450
loss: 2.017530, 1500
loss: 1.820162, 1550
loss: 2.174565, 1600
loss: 2.552882, 1650
loss: 2.148380, 1700
loss: 2.165045, 1750
loss: 2.298380, 1800
loss: 2.125666, 1850
loss: 2.262254, 1900
loss: 2.293321, 1950
loss: 2.087833, 2000
loss: 2.001413, 2050
loss: 2.193791, 2100
loss: 2.232748, 2150
loss: 2.245459, 2200
loss: 2.175688, 2250
loss: 2.203779, 2300
loss: 2.333107, 2350
loss: 2.174418,

In [21]:
def topk(probs, n=9):
    # Softmax operation to convert scores to probabilities
    probs = torch.softmax(probs, dim=-1)
    
    # Extract top k tokens and their probabilities
    tokensProb, topIx = torch.topk(probs, k=n)
    
    # Normalize the probabilities to ensure they sum up to 1
    tokensProb = tokensProb / torch.sum(tokensProb)
    
    # Convert probabilities to numpy array for random selection
    tokensProb = tokensProb.cpu().detach().numpy()
    
    # Randomly select a token based on the probabilities
    choice = np.random.choice(n, 1, p=tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)


In [51]:
def model_infer(model, tokenizer, review, max_length=15):
    # Tokenize the initial review
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the initial token to the model
        output = model(initial_input)

        # Get the logits for the next token
        logits = output.logits[0, -1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0, -1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else:  # Append to the sequence
                result.append(res_id)
    # If no EOS token is generated, return after reaching max_length
    return tokenizer.decode(result)


In [26]:
sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 5)]
sample_reviews

['I loved this brand.  It was the best vanilla flavor of others I tried.  I would buy more if it was a better price. = Wolfgang puck coffee vanilla francaise\n',
 'Seriously,<br /><br />this has to be the best tasting Spearmint gum out there. And on top of that it is sugar free and good for your teeth. Without those nasty, artificial sweeteners. That has to be chewing gum heaven. I would definitely recommend this product!!! = the best...\n',
 'I order this food to try as it was grain free and I liked the ingredients listed on the site with the reviews. I opened the bag and the smell was incredible. It almost made me hungry lol. The dog loves it and is doing well on it.<br />  I start doing more research on the food and was very impressed. The owners of this food make sure ethoxyquin, BHA and BHT (which are knowing to cause cancer) are NOT used at all in the food. They make sure that the slaughter and processing plants do not use them either. That is great and it show that they care as 

In [52]:
for review in sample_reviews:
    summaries = set()
    print(review)
    while len(summaries) < 3:
        # Print the result of model_infer before splitting
        result = model_infer(model, tokenizer, review + " TL;DR ")
        print("Result:", result)
        
        # Split the result and print the parts
        parts = result.split(" TL;DR ")
        print("Parts:", parts)
        
        # Try to access the second part after splitting
        if len(parts) > 1:
            summary = parts[1].strip()
            if summary not in summaries:
                summaries.add(summary)
        print("Summaries:", summaries)
        
        # Break out of the loop once three summaries are collected
        if len(summaries) >= 1:
            break
    print("\n")


I loved this brand.  It was the best vanilla flavor of others I tried.  I would buy more if it was a better price. = Wolfgang puck coffee vanilla francaise

Result: I loved this brand.  It was the best vanilla flavor of others I tried.  I would buy more if it was a better price. = Wolfgang puck coffee vanilla francaise
 TL;DR  If you love vanilla, this is the right price for you. = The
Parts: ['I loved this brand.  It was the best vanilla flavor of others I tried.  I would buy more if it was a better price. = Wolfgang puck coffee vanilla francaise\n', '\xa0If you love vanilla, this is the right price for you. = The']
Summaries: {'If you love vanilla, this is the right price for you. = The'}


Seriously,<br /><br />this has to be the best tasting Spearmint gum out there. And on top of that it is sugar free and good for your teeth. Without those nasty, artificial sweeteners. That has to be chewing gum heaven. I would definitely recommend this product!!! = the best...

Result: Seriously,<

# EARLIER ROUGE 0

In [54]:
from torch.utils.data import random_split

# Split the dataset into training and testing sets
train_size = int(0.75 * len(dataset))  # Determine the size of the training set (75% of the dataset)
test_size = len(dataset) - train_size   # Determine the size of the testing set (remaining portion)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])  # Split the dataset

# Create data loaders for training and testing sets
# Training DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
# Explanation:
# - DataLoader: Creates an iterator that provides batches of data during training.
# - train_dataset: The dataset used for training.
# - batch_size: Number of samples in each batch.
# - shuffle: Whether to shuffle the data at the beginning of each epoch (recommended for training).
# - drop_last: Whether to drop the last incomplete batch if the dataset size is not divisible by the batch size.

# Testing DataLoader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Explanation:
# - Similar to the training DataLoader but shuffling is set to False since shuffling is not needed during testing.


In [61]:
# Define your train function
def train(model, optimizer, train_dl, epochs):
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        for idx, batch in enumerate(train_dl):
            optimizer.zero_grad()
            batch = batch.to(device)
            output = model(batch, labels=batch)
            loss = output[0]
            loss.backward()
            optimizer.step()
            if idx % 50 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Step [{idx+1}/{len(train_dl)}], Loss: {loss.item()}")


In [79]:
# Train the model on the training set
train(model, optimizer, train_dataloader, epochs=1)


Epoch [1/1], Step [1/13323], Loss: 1.8593367338180542
Epoch [1/1], Step [51/13323], Loss: 1.9994953870773315
Epoch [1/1], Step [101/13323], Loss: 1.8977000713348389
Epoch [1/1], Step [151/13323], Loss: 2.0506272315979004
Epoch [1/1], Step [201/13323], Loss: 1.5420185327529907
Epoch [1/1], Step [251/13323], Loss: 1.4611223936080933
Epoch [1/1], Step [301/13323], Loss: 2.0511529445648193
Epoch [1/1], Step [351/13323], Loss: 1.8443139791488647
Epoch [1/1], Step [401/13323], Loss: 2.04577898979187
Epoch [1/1], Step [451/13323], Loss: 2.006317615509033
Epoch [1/1], Step [501/13323], Loss: 1.8634902238845825
Epoch [1/1], Step [551/13323], Loss: 2.206110715866089
Epoch [1/1], Step [601/13323], Loss: 1.6728296279907227
Epoch [1/1], Step [651/13323], Loss: 1.805537223815918
Epoch [1/1], Step [701/13323], Loss: 1.817780613899231
Epoch [1/1], Step [751/13323], Loss: 2.183117389678955
Epoch [1/1], Step [801/13323], Loss: 2.004399538040161
Epoch [1/1], Step [851/13323], Loss: 1.8522589206695557
Epo

In [63]:
model_path = "/kaggle/input/ir-model/model"  # Specify the correct file path including the file extension
model.load_state_dict(torch.load(model_path))
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [64]:
# pip install rouge


In [65]:
%%capture
!pip install evaluate
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [66]:
import evaluate
rouge = evaluate.load('rouge')


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [67]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
rouge1 = []
rouge2 = []
rougeL = []
rougeLsum = []

model.eval()
for batch in test_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
    batch = batch.to(device)
    with torch.no_grad():
        outputs = model(batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    generated_output = tokenizer.decode(predictions[0], skip_special_tokens=True)
    references = tokenizer.decode(batch[0], skip_special_tokens=True)
    results = rouge.compute(predictions=[generated_output], references=[references])
    rouge1.append(results['rouge1'])
    rouge2.append(results['rouge2'])
    rougeL.append(results['rougeL'])
    rougeLsum.append(results['rougeLsum'])

In [138]:
print(f'rouge1: {sum(rouge1)/len(rouge1)}\nrouge2: {sum(rouge2)/len(rouge2)}\nrougeL: {sum(rougeL)/len(rougeL)}\nrougeLsum: {sum(rougeLsum)/len(rougeLsum)}')

rouge1: 0.4086952614760332
rouge2: 0.0852688926207635
rougeL: 0.2926001694066451
rougeLsum: 0.297603059631863


## Inference

In [159]:
inp = input("Enter Review text: ")
model.eval()
inp = tokenizer(inp, truncation=True, return_tensors="pt").to(device)
output = model.generate(inp['input_ids'], max_length= 60)
generated_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(f'Generate Summary: \n{generated_output}')

Enter Review text:  The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate Summary: 
The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.

The Fender CD-60S Dreadn


In [97]:
from rouge import Rouge
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel


# Define functions to calculate ROUGE scores
def calculate_rouge(hypothesis, reference):
    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, reference)
    return scores[0]

# Define function to generate summary
def generate_summary(review_text):
    inputs = tokenizer.encode("summarize: " + review_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
reviews_df = pd.read_csv(reviews_path)

# Iterate over the rows of the DataFrame
for index, row in reviews_df.iterrows():
    review_text = row["Review Text"]
    reference_summary = row["Summary"]
    # Your code for processing each row goes here


In [None]:
def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)
    
    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)
    
    # The new selection pool (9 choices) is normalized
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

In [96]:
def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence 
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)


In [None]:
sample_reviews = [review.split(" TL;DR ")[0] for review in random.sample(reviews, 5)]
sample_reviews

In [None]:
for review in sample_reviews:
    summaries = set()
    print(review)
    while len(summaries) < 3:
        summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
        if summary not in summaries:
            summaries.add(summary)
    print("Summaries: "+ str(summaries) +"\n")