In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
import numpy as np, re, torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# facebook/bart-large-cnn
# mabrouk/amazon-review-summarizer-bart

In [3]:
# Load the abstractive summarization model.
# NOTE: facebook/bart-large-cnn is fine-tuned for news summarization (CNN/DailyMail),
# not for product reviews. To try a review-specific checkpoint, change `model_name`
# (e.g. to the other candidate noted in the preceding cell).
model_name = "facebook/bart-large-cnn"

# Resolve the device once so the dtype, the model placement and the pipeline all agree.
use_cuda = torch.cuda.is_available()

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    # Half precision on GPU, full precision on CPU.
    # NOTE(review): recent transformers releases warn that `torch_dtype` is deprecated
    # in favour of `dtype`; kept as-is so older installed versions keep working.
    torch_dtype=torch.float16 if use_cuda else torch.float32,
)
model.to("cuda" if use_cuda else "cpu")

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=0 if use_cuda else -1,
)

# Sentence embedding model for extractive selection
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Models loaded successfully.")

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


✅ Models loaded successfully.


In [4]:
def split_sentences(text):
    """Break ``text`` into trimmed sentences.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Fragments of three characters or fewer (after
    trimming) are discarded.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings, in their original order.
    """
    kept = []
    for fragment in re.split(r'(?<=[.!?])\s+', text.strip()):
        cleaned = fragment.strip()
        # Very short fragments (e.g. "Ok?") are dropped.
        if len(cleaned) > 3:
            kept.append(cleaned)
    return kept

def extract_key_by_embedding(text, top_k=6, diversity=0.7):
    """Greedily pick up to ``top_k`` sentences from ``text`` using sentence embeddings.

    Each sentence is scored by the dot product of its embedding with the mean
    embedding of all sentences. The first pick is the highest-scoring sentence;
    every later pick maximises
    ``diversity * score - (1 - diversity) * max_similarity_to_already_picked``
    (an MMR-style trade-off). Note that a *larger* ``diversity`` puts more weight
    on the centroid score and less on the redundancy penalty.

    Args:
        text: Text to extract sentences from; split with ``split_sentences``.
        top_k: Maximum number of sentences to return.
        diversity: Weight in [0, 1] applied to the centroid score.

    Returns:
        The picked sentences joined by single spaces, in the order they were
        picked (not their order in ``text``); ``''`` if no sentence is found.
    """
    sentences = split_sentences(text)
    if len(sentences) == 0:
        return ''

    vectors = embed_model.encode(sentences, convert_to_numpy=True)
    relevance = vectors.dot(vectors.mean(axis=0))

    n_pick = min(top_k, len(sentences))
    chosen = []
    while len(chosen) < n_pick:
        if chosen:
            # Highest similarity of every sentence to anything already chosen.
            closest = np.max(np.dot(vectors, vectors[chosen].T), axis=1)
            objective = diversity * relevance - (1 - diversity) * closest
            # Never pick the same sentence twice.
            objective[chosen] = -np.inf
            pick = int(np.argmax(objective))
        else:
            pick = int(np.argmax(relevance))
        chosen.append(pick)

    return ' '.join(sentences[i] for i in chosen)
    
def chunk_text(text, tokenizer, max_tokens=800, stride=50):
    """Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    The text is tokenized once without truncation, cut into windows of
    ``max_tokens`` token ids that each start ``max_tokens - stride`` ids after
    the previous one, and every window is decoded back to a string.

    Args:
        text: The text to split.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            (a flat list of ids for a single string) and offering
            ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Maximum number of token ids per chunk; must be positive.
        stride: Number of token ids shared by consecutive chunks; must satisfy
            ``0 <= stride < max_tokens``.

    Returns:
        A list of decoded chunk strings, in order. Empty if the tokenizer
        produces no ids.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``stride`` is outside
            ``[0, max_tokens)``; such values would otherwise make the window
            never advance (an endless loop) or skip tokens.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= stride < max_tokens:
        raise ValueError("stride must satisfy 0 <= stride < max_tokens")

    # No tensor round-trip is needed: a plain list of ids is enough for slicing.
    input_ids = list(tokenizer(text, truncation=False)['input_ids'])

    step = max_tokens - stride
    chunks = []
    start = 0
    while start < len(input_ids):
        window = input_ids[start:start + max_tokens]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        # Stop once a window reaches the end of the input; advancing further
        # would only emit a tail made of tokens already in this chunk.
        if start + max_tokens >= len(input_ids):
            break
        start += step
    return chunks


In [5]:
def _abstractive_summary(passage, summarizer):
    """Run one sampled generation pass of ``summarizer`` over ``passage`` and return the text."""
    # NOTE(review): the checkpoint configured in this notebook (facebook/bart-large-cnn)
    # is not instruction-tuned, so this prefix is presumably consumed as ordinary
    # input text rather than obeyed — confirm it actually improves the output.
    prompt = (
        "Write a concise and natural summary in your own words for the following product review:\n\n" + passage
    )
    return summarizer(
        prompt,
        max_length=180,         # slightly longer to avoid truncation
        min_length=60,          # ensures some depth
        do_sample=True,         # adds diversity (output is not deterministic)
        top_k=50,               # diverse token sampling
        top_p=0.95,             # nucleus sampling
        temperature=0.8,        # softer creativity
        repetition_penalty=1.2, # avoids repeated phrases
        truncation=True,        # cut over-long inputs instead of overflowing the model's input limit
    )[0]['summary_text']


def _prepare_passage(passage, extract_top_k):
    """Optionally reduce ``passage`` to its ``extract_top_k`` key sentences before summarizing."""
    if extract_top_k is None:
        return passage
    # Fall back to the full passage if the extractive step finds no sentences.
    return extract_key_by_embedding(passage, top_k=extract_top_k) or passage


def summarize_reviews(text, summarizer, tokenizer, max_input_tokens=800, extract_top_k=None):
    """Summarize review text, chunking long inputs (optionally hybrid extractive-abstractive).

    Whitespace is collapsed first. Texts of at most 8 sentences are summarized
    in one pass. Longer texts are split with ``chunk_text``, each chunk is
    summarized on its own, and the chunk summaries are then summarized together
    to produce the final result.

    Args:
        text: Raw review text.
        summarizer: Summarization pipeline; called with the prompt string plus
            generation keyword arguments, and expected to return a list whose
            first item has a ``'summary_text'`` key.
        tokenizer: Tokenizer passed to ``chunk_text`` for long inputs.
        max_input_tokens: Maximum tokens per chunk for long inputs.
        extract_top_k: If given, each passage is first reduced to that many key
            sentences with ``extract_key_by_embedding`` before the abstractive
            pass. ``None`` (default) feeds the passage to the summarizer as-is.

    Returns:
        The summary string, or ``""`` when ``text`` contains no sentences.

    Note:
        Generation uses sampling, so repeated calls can return different
        summaries unless the random seed is fixed by the caller.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = split_sentences(text)
    if not sentences:
        return ""

    # Short text → one-shot summarize
    if len(sentences) <= 8:
        return _abstractive_summary(_prepare_passage(text, extract_top_k), summarizer)

    # Long text → chunk, summarize each chunk, then summarize the summaries
    chunks = chunk_text(text, tokenizer, max_tokens=max_input_tokens)
    chunk_summaries = [
        # Each pass must see its own chunk, not the whole text.
        _abstractive_summary(_prepare_passage(chunk, extract_top_k), summarizer)
        for chunk in chunks
    ]

    # A single chunk is already fully summarized; a second pass would only
    # re-summarize the summary.
    if len(chunk_summaries) == 1:
        return chunk_summaries[0]

    combined_text = ' '.join(chunk_summaries)
    return _abstractive_summary(combined_text, summarizer)

In [6]:
import pandas as pd

In [7]:
# Load the review table from a CSV in the notebook's working directory.
# Later cells read the 'parent_asin', 'product_title' and 'review_text' columns from it.
df = pd.read_csv("grouped_reviews.csv")

In [1]:
# Rank rows by the character length of their review text, longest first.
# Built with assign() so the shared `df` is not mutated and the cell can be re-run safely.
# NOTE: this cell needs `df`, so it must run after the cell that loads the CSV.
df_sorted = (
    df.assign(review_length=df['review_text'].astype(str).str.len())
      .sort_values(by='review_length', ascending=False)
)
df_sorted[['parent_asin', 'product_title', 'review_length']].head(50)

NameError: name 'df' is not defined

In [22]:
# Choose a product ID: take the ASIN of the row at position 5 ...
sample_asin = df.iloc[5]['parent_asin']
# ... then display the review text of the first row carrying that ASIN
# (this is the row at position 5 only if no earlier row shares the ASIN).
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"I am an avid reader of historical fiction, with the emphasis on the historical rather than the upthrust breast variety. This catastrophe fails at both. I have read many many entries in this genre from the extraordinary to the putrid. Lady of Hay falls somewhere below this last category. As other readers have commented, this book is about 200 pages too long. The author has managed to confuse tedium with suspense by dragging out the conclusion long past the point of reader exhaustion. Ms. Erskine's attempts to interweave the present day plot with historical narrative is a complete failure, as she only manages to disrupt both story lines resulting in a herkyjerky narrative flow that became quite annoying. Compounding the problem, not a single character is the slightest bit sympathetic, and although I began the read with curiosity and interest, I ended feeling frustrated and cheated. There are many red herrings dropped along the way that kept me reading, but I got no satisfaction at the e

In [24]:
# Summarize the review text of the first row matching `sample_asin`.
# The result is stored in the notebook-global `summary`, which later cells overwrite and reuse.
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
# Bare expression so the notebook displays the summary.
summary

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


'The author has managed to confuse tedium with suspense by dragging out the conclusion long past the point of reader exhaustion. There are too many superfluous characters, seemingly all of whom are interested in and experienced with past life regression. I became so exasperated with their antics and cruel misdeeds that I actually found myself rooting for them all to be sent to the pokey.'

In [26]:
# Choose a product ID
sample_asin = df.iloc[6]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"Kate Kennedy has broken up with her boyfriend and is left without a place to stay. She is in the middle of writing a book and decides to rent an isolated country cottage. Once there she is plagued by mysterious and supernatural events. Out of all of Erskine's books that I have read, I probably enjoyed this one the least. What I've enjoyed in the past was her characters in the present day and their connection or experience with historical characters. Although some of that was present here, it read more like a ghost story than an historical novel."

In [None]:
summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)

In [27]:
# Choose a product ID
sample_asin = df.iloc[7]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"This whimsical character actorwho Stars in Doctor Who tell's his life's story in morbidly dark humor makingas he adeptly does the nonsensical sound believable. The seriousness of his voice is in contrast to the silliness of his tellng. These events did happen and they were bad, but he tells it as if laughing at these peoplein the face who wronged him those many years ago. Lalla Ward, another playphsyco was the perfect match and now I know why. This making the obsurd sound like normal occurance is always present in the tone of these two's voice of reason. It is a game to them. I was on medication for ten years for making the obsurd sound belevalbeeven though I was kidding, apparently mymoronic doctor thought I was serious.I didn't give him a signal, GD. he said after the lawsuit.. Ha! The reason Americans like Baker so much as Doctor Who is this obsurdity that he portrays in contrast to what we lack so severely in American conservativePC culture. Bill Cosby or Tom Baker, you decide, he

In [29]:
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
summary

'This was to be quite frank the most depressing autobiography I have ever read. We read of a sordid life full of selfloathing and bumbling from one insanity to another. The attempts at humor are so dark to be totally unfunny. Neither is their any depth, what are described are a series of befuddled mixed up scenes which go nowhere to a answer the question of the title.'

# Improving the abstractive part of summaries

In [None]:
# Choose a product ID
sample_asin = df.iloc[57]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

In [None]:
summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)

# Pros and cons generation

In [10]:
# Load a lightweight sentiment-analysis model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def extract_pros_cons_transformer(summary):
    """
    Sort the sentences of a summary into pros and cons with a transformer sentiment model.

    The summary is split into sentences, every sentence is classified by
    ``sentiment_analyzer``, and sentences labelled ``"POSITIVE"`` go to the pros
    list while every other sentence goes to the cons list.

    Args:
        summary: Summary text to analyse.

    Returns:
        A ``(pros, cons)`` tuple of lists of sentence strings. If the summary
        contains no sentences at all, each list holds a single placeholder message.

    NOTE(review): any label other than "POSITIVE" lands in cons, so a neutral or
    merely descriptive sentence is reported as a con unless the model labels it
    positive — confirm this is acceptable for the use case.
    """
    pros, cons = [], []

    # Split into sentences on any whitespace after ., ! or ? (same boundary rule as
    # split_sentences), dropping blank fragments.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]

    if sentences:
        # One batched call instead of one pipeline call per sentence;
        # each result looks like {'label': 'POSITIVE', 'score': 0.99}.
        results = sentiment_analyzer(sentences)
        for sent, result in zip(sentences, results):
            if result['label'] == "POSITIVE":
                pros.append(sent)
            else:
                cons.append(sent)

    # If still empty, mark as neutral
    if not pros and not cons:
        pros.append("No clear positive points detected.")
        cons.append("No clear negative points detected.")

    return pros, cons


Device set to use cuda:0


In [15]:
# Choose a product ID
sample_asin = df.iloc[57]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"I enjoyed this book very much. It is a feel good story and perfect for holiday reading. Jenny and her daughter April move into the local coffee shop cottage. She has split up with her cheating husband and has left a very large villa behind. Noah the owner of the coffee shop has given Jenny a job. Jenny and April share the cottage with him and Elle, Noah's friend who works in the coffee shop too. Jenny worries about April because she can't keep her in the style that they used to live in. Can she and her daughter find happiness? Read on. When I picked this up I was expecting a light hearted, easy read. What I got was a heart warming tale of second chances. And I loved it.br The Coffee Club is the best coffee place for miles around, run by Noah and Elle. When Jennys marriage to Zak, the owner of a local fashion house, falls apart after some shocking revelations from the man himself, Jenny finds herself and her 10 yr old daughter April sharing a cottage with the Coffee Club owners. What a

In [16]:
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
summary

"Jenny and her daughter April move into the local coffee shop cottage. She has split up with her cheating husband and has left a very large villa behind. The Coffee Club is the best coffee place for miles around. Jenny and April share the cottage with him and Elle, Noah's friend who works in the coffee shop too."

In [17]:
# Choose a product ID
sample_asin = df.iloc[62]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"I decided to read the book instead of seeing the movie. I should have waited for the movie. There are those works of fiction that make better movies instead of books, this is one of them. The story while interesting was not very compelling. The book offered very little in depth to the true aspect to the emotion of grief. What I thought might be a profound testment to the human heart was really nothing more than a story of grief in the form of total abandonment of her other two children and her own selfishness. I was ready to put it down but after I had invested the time in reading over 200 pages, I trodded along to the end, which left me very unimpressed. The point at which this story ended really should have been it's climax. Like I said I should have waited for the movie."

In [20]:
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
summary

"The book offered very little in depth to the true aspect to the emotion of grief. I was ready to put it down but after I had invested the time in reading over 200 pages, I trodded along to the end, which left me very unimpressed. The point at which this story ended really should have been it's climax."

In [12]:
# Choose a product ID
sample_asin = df.iloc[85]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"First, I should say that, unlike many people reading this book, I'd heard about the ending of the series before I even started reading. However, I think that spoiler is allowing me to enjoy the series more than I would have otherwise. Backlash builds the story very slowly. It's not even clear what the larger story arch is about. In retrospect, the Fate of the Jedi series probably takes too long to actually set up the larger threat. As such, much of Backlash might seem pointless. However, knowing what's coming ahead, I can tell you there is a point.br br With that caveat, how is the book? For me, Backlash is where the Fate of the Jedi series began to lose its way. While some of the previous books were a bit slower, they did have some interesting character moments or plot points. I loved the relationship between Luke and Ben in Outcast and Omen. The Mind Walkers provided some great moments through reviving the ghosts of the past.br br By contrast, Backlash takes place primarily on the p

In [13]:
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
summary

"Backlash builds the story very slowly. It's not even clear what the larger story arch is about. Aaron Allston skims over characterization, even to the extent that he leaves major plot holes. For an absurdly large part of the book, Luke and Han take part in the Dathomiri equivalent of the olympics."

In [30]:
# NOTE(review): `summary` is whatever the most recently *executed* summarize cell produced.
# The saved output of this cell (execution count 30) matches the summary shown for
# df.iloc[7] (execution count 29), not the df.iloc[85] summary displayed just above.
# Restart the kernel and run all cells top to bottom to refresh it.
extract_pros_cons_transformer(summary)

([],
 ['This was to be quite frank the most depressing autobiography I have ever read.',
  'We read of a sordid life full of selfloathing and bumbling from one insanity to another.',
  'The attempts at humor are so dark to be totally unfunny.',
  'Neither is their any depth, what are described are a series of befuddled mixed up scenes which go nowhere to a answer the question of the title.'])

In [33]:
# Choose a product ID
sample_asin = df.iloc[3]['parent_asin']
df[df['parent_asin'] == sample_asin].iloc[0]['review_text']

"perhaps the most memorable of the Border Trilogy..Cormac McCarthy takes a little getting used to as far as his literary style but once you find the rhythm of his words sit back and enjoy being transported to a world of masterful prose..John Grady Cole is 16, parents divorced and his grandfather, the only adult he's close to dies then his world changes..deprived of life on his grandfather's ranch after it's sold..he decides to cross the border into Mexico..he's joined on the trip by an old friend, Lacey Rawlins..along the way they meet another teenager, Jimmy Blevins, most likely riding a stolen horse..the three amigos ride into Mexico..and what awaits is a journey that turns them from kids into men..in a harsh Western way..beautifully told...splendid imagery.. I've only just finished Moby Dick before reading ATPH so cryptic style was not a problem, if you follow me, shipmates? I have been aware of Mc Carthy for some time, but have been reluctant to read him, I think due to his artisti

In [34]:
summary = summarize_reviews(df[df['parent_asin'] == sample_asin].iloc[0]['review_text'], summarizer, tokenizer)
summary

"Cormac McCarthy takes a little getting used to as far as his literary style but once you find the rhythm of his words sit back and enjoy being transported to a world of masterful prose. I've only just finished Moby Dick before reading ATPH so cryptic style was not a problem, if you follow me, shipmates? I have been aware of Mc Carthy for some time, but have been reluctant to read him, I think due to his artistic reputation."

In [35]:
extract_pros_cons_transformer(summary)

(['Cormac McCarthy takes a little getting used to as far as his literary style but once you find the rhythm of his words sit back and enjoy being transported to a world of masterful prose.'],
 ["I've only just finished Moby Dick before reading ATPH so cryptic style was not a problem, if you follow me, shipmates?",
  'I have been aware of Mc Carthy for some time, but have been reluctant to read him, I think due to his artistic reputation.'])