In [1]:
# Extractive summarization imports
import nltk
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

# nltk.download('punkt')

In [2]:
# Read ASINs as text: some are all-digit with leading zeros (e.g. '0007159854'),
# and the lookups in later cells compare this column against string IDs.
df = pd.read_csv("grouped_reviews.csv", dtype={"parent_asin": str})
df.head()

Unnamed: 0,parent_asin,product_title,average_rating,review_text,avg_user_rating,helpful_vote
0,1046314,A Woman of Substance,4.6,Great for a quick tape of the best Bradford bo...,4.0,0
1,1046519,The Importance of Being Earnest Complete & Una...,4.5,"Oscar Wilde's masterpiece, this play has many,...",5.0,1
2,1048236,The Sherlock Holmes Audio Collection,3.6,Just as advertised,5.0,0
3,1048252,All the Pretty Horses,4.3,perhaps the most memorable of the Border Trilo...,4.5,2
4,1048791,"The Crucible Performed by Stuart Pankin, Jerom...",4.6,Completely unabridged audio version of The Cru...,5.0,0


In [15]:
def extractive_summary(text, top_percent=0.25, similarity_threshold=0.8):
    """Return an extractive summary of ``text`` using TF-IDF similarity and PageRank.

    The text is split into sentences, each sentence is TF-IDF vectorized
    (English stop words removed), and PageRank is run on the graph built
    from the pairwise cosine-similarity matrix. The highest-scoring
    sentences are kept, restored to their original order, and filtered so
    that no kept sentence is too similar to one kept before it.

    Parameters
    ----------
    text : str
        Text to summarize. Non-string values (e.g. ``None`` or the ``NaN``
        pandas uses for missing cells) yield an empty string.
    top_percent : float, default 0.25
        Fraction of sentences to select before de-duplication; at least one
        sentence is always selected.
    similarity_threshold : float, default 0.8
        A selected sentence is dropped when its cosine similarity to any
        already-kept sentence is greater than or equal to this value.

    Returns
    -------
    str
        The kept sentences joined by single spaces. The input is returned
        unchanged when it has fewer than three sentences or when no TF-IDF
        vocabulary can be built from it.
    """
    # Missing values are not strings; there is nothing to tokenize or summarize.
    if not isinstance(text, str):
        return ""
    # Blank text has no sentences, so it takes the same "too short" exit.
    if not text.strip():
        return text

    # Split text into sentences
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        return text  # too short to summarize

    # Vectorize sentences using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. every sentence consists
        # only of stop words or punctuation); nothing can be ranked.
        return text

    # Build similarity matrix (row i / column j = similarity of sentences i and j)
    sim_matrix = cosine_similarity(X, X)

    # Apply TextRank (PageRank) on the similarity graph
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Rank sentences by score, highest first
    ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

    # Select top N% of sentences, then restore document order
    top_n = max(1, int(len(sentences) * top_percent))
    selected = sorted(ranked_sentences[:top_n], key=lambda x: x[2])

    # Remove near-duplicate sentences. The pairwise similarities are already
    # in sim_matrix, so look them up instead of recomputing them per pair.
    final_sentences = []
    kept_indices = []
    for _, sent, idx in selected:
        if all(sim_matrix[idx, j] < similarity_threshold for j in kept_indices):
            final_sentences.append(sent)
            kept_indices.append(idx)

    # Join back to form the summary
    return " ".join(final_sentences)

In [11]:
# Character count of each review; values are cast to str so len() applies to every entry
df['review_length'] = df['review_text'].astype(str).map(len)

# Longest reviews first
df_sorted = df.sort_values('review_length', ascending=False)

# Display the 20 longest reviews alongside their product identifiers
preview_columns = ['parent_asin', 'product_title', 'review_length']
df_sorted[preview_columns].head(20)

Unnamed: 0,parent_asin,product_title,review_length
4029,B00003CXKT,Test Big Data 1737,593541
5289,B000058983,Forever Changes,186706
1678,B000002TYZ,Anthology 2,98753
1677,B000002TYX,Anthology 1,97616
5759,B00005JOKM,"Good Night, and Good Luck.",76602
424,1401923119,The Biology of Belief: Unleashing the Power of...,75969
5845,B00005M97A,Willa Was Here,65522
22339,B0015FQZ94,Ghosts I - IV,56411
23491,B001B71NOI,The Slip,54938
5760,B00005JPAM,The Queen,54499


In [5]:
# Use the ASIN of the row at position 29 as the sample product
sample_asin = df['parent_asin'].iloc[29]
# Review text of the first row whose ASIN matches the sample
df.loc[df['parent_asin'] == sample_asin, 'review_text'].iloc[0]

"Screwtape is a great read and I thought that it would make a great audio book to listen to in the car. Joss Ackland is however an unfortunate choice as a narrator for this work. His reading style is so dry that it makes this lively book a real chore to listen to. I imagine that John Cleese would be a much better choice to narrate this work. I just wish that his narration was still available. I have long loved this book and this audio performance is almost as good as the one John Cleese did years ago. A great Story to go on a trip with I bought this audiobook to listen to in my car during my commute, and I'm very glad I did. I think Joss Ackland was a great choice for a narrator, his voice is really demony! It's a great book, and a great reading!"

In [6]:
# ASIN stored in the row at position 29
df['parent_asin'].iloc[29]

'0007159854'

In [8]:
# Product to summarize: set to any parent_asin present in df (e.g., 'B00X4WHP5E')
product_id = "0007159854"
matching_rows = df.loc[df["parent_asin"] == product_id]
product_data = matching_rows.iloc[0]  # first row found for this product

print("Product:", product_data["product_title"])
print("\nFull Review Text (first 500 chars):\n")
review_text = product_data["review_text"]
print(review_text[:500], "...\n")

# top_percent=0.25 selects the top 25% of ranked sentences (at least one)
summary = extractive_summary(review_text, top_percent=0.25)
print("Extractive Summary:\n")
print(summary)


Product: The Screwtape Letters Complete and Unabridged

Full Review Text (first 500 chars):

Screwtape is a great read and I thought that it would make a great audio book to listen to in the car. Joss Ackland is however an unfortunate choice as a narrator for this work. His reading style is so dry that it makes this lively book a real chore to listen to. I imagine that John Cleese would be a much better choice to narrate this work. I just wish that his narration was still available. I have long loved this book and this audio performance is almost as good as the one John Cleese did years ...

Extractive Summary:

Screwtape is a great read and I thought that it would make a great audio book to listen to in the car. It's a great book, and a great reading!


In [12]:
# Product ID used in the following cells: B002X3XR70

In [13]:
# Full review text of the first row for product B002X3XR70
df.loc[df['parent_asin'] == 'B002X3XR70', 'review_text'].iloc[0]

"For anyone who says this program is a scam or a complete waste of money has also not read the entire Bible, nor can they be a Christian. I doubt many, if any, caught this. That's not a bad thing, it's just a fact because every single principle outlined and discussed in the program actually expands on what the Bible says about prayer, the golden rule, perseverance, positive thinking, being grateful and thankful, listening and believing to the right people, how to get your prayers answered, ect....if anyone whether they are a believer or not has taken the time to actually read the Bible in it's entirety then they simply cannot deny this. If you are a Christian follower, then I believe this course actually clarifies and helps explain a little better some of the concepts in the Bible.br br It also agrees with many, if not all, of the so called conspiracy theories I've read about and heard about concerning the real goals behind governments, The IMF, The World Bank, ect....when it comes to 

In [14]:
# Product to summarize: set to any parent_asin present in df
product_id = "B002X3XR70"
matching_rows = df.loc[df["parent_asin"] == product_id]
product_data = matching_rows.iloc[0]  # first row found for this product

print("📦 Product:", product_data["product_title"])
print("\n📝 Full Review Text (first 500 chars):\n")
review_text = product_data["review_text"]
print(review_text[:500], "...\n")

# top_percent=0.25 selects the top 25% of ranked sentences (at least one)
summary = extractive_summary(review_text, top_percent=0.25)
print("📚 Extractive Summary:\n")
print(summary)


📦 Product: Your Wish Is Your Command Series Player

📝 Full Review Text (first 500 chars):

For anyone who says this program is a scam or a complete waste of money has also not read the entire Bible, nor can they be a Christian. I doubt many, if any, caught this. That's not a bad thing, it's just a fact because every single principle outlined and discussed in the program actually expands on what the Bible says about prayer, the golden rule, perseverance, positive thinking, being grateful and thankful, listening and believing to the right people, how to get your prayers answered, ect... ...

📚 Extractive Summary:

If you are a Christian follower, then I believe this course actually clarifies and helps explain a little better some of the concepts in the Bible.br br It also agrees with many, if not all, of the so called conspiracy theories I've read about and heard about concerning the real goals behind governments, The IMF, The World Bank, ect....when it comes to the ruling elite class an