#Data Preprocessing


In [1]:
import pandas as pd
url = "https://raw.githubusercontent.com/Pontakorn-Wich/Mini_project/master/data/books_1250_above_reviews.csv"
df = pd.read_csv(url)
df.head()

print(df.head())

    book_id                           user_id  \
0  13526165  8842281e1d1347389f2ab93d60773d4d   
1   9938498  8842281e1d1347389f2ab93d60773d4d   
2   2767052  8842281e1d1347389f2ab93d60773d4d   
3    136251  8842281e1d1347389f2ab93d60773d4d   
4  15507958  7504b2aee1ecb5b2872d3da381c6c91e   

                          review_id  rating  \
0  51fe3e46c7f8eb39f5623d1bd8bbbbfc       5   
1  bff5654c639c7b008571c3d4398d930a       4   
2  248c011811e945eca861b5c31a549291       5   
3  132eab4c9a3724493204cc083e0e2ecc       5   
4  63ff74279e46b247cb1754313b160006       4   

                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                       date_added                    date_updated  n_votes  \
0  Thu 

## Remove non-english reviews

In [2]:
get_ipython().system('pip install langdetect')

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/981.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=da75063ae166efad9800cdf2b40c5f2b730f5d2c0870010dec9d1971fa0960ae
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
from langdetect import detect, LangDetectException

def detect_language_safe(text):
    if isinstance(text, str) and text.strip():
        try:
            return detect(text)
        except LangDetectException:
            return 'unknown'
    return 'empty'

df['language'] = df['review_text'].apply(detect_language_safe)
print(df[['review_text', 'language']].head())

                                         review_text language
0  My wife suggested I read this book, and I resi...       en
1  Great story of the US Ambassador to Germany an...       en
2  I cracked and finally picked this up. Very enj...       en
3  Loved every minute. So sad there isn't another...       en
4  I finished reading this days ago and cant get ...       en


In [4]:
df_english = df[df['language'] == 'en'].copy()
print(df_english[['review_text', 'language']].head())

                                         review_text language
0  My wife suggested I read this book, and I resi...       en
1  Great story of the US Ambassador to Germany an...       en
2  I cracked and finally picked this up. Very enj...       en
3  Loved every minute. So sad there isn't another...       en
4  I finished reading this days ago and cant get ...       en


In [5]:
df = df_english

##Normalization


In [6]:
import re

df['normalized_text'] = df['review_text'].str.lower()
df['normalized_text'] = df['normalized_text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

print("Text normalized (lowercase and whitespace removal).")
print(df[['review_text', 'normalized_text']].head())

Text normalized (lowercase and whitespace removal).
                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                                     normalized_text  
0  my wife suggested i read this book, and i resi...  
1  great story of the us ambassador to germany an...  
2  i cracked and finally picked this up. very enj...  
3  loved every minute. so sad there isn't another...  
4  i finished reading this days ago and cant get ...  


###Sampling

In [7]:
sampled = (
    df.groupby("book_id")
      .apply(lambda x: x.sample(1200, random_state=42))
      .reset_index(drop=True)
)

  .apply(lambda x: x.sample(1200, random_state=42))


##Special Characters Removal


In [None]:
def remove_noise(text):
    # Remove characters that are not letters or spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

sampled['cleaned_review_text'] = sampled['normalized_text'].apply(remove_noise)
print(sampled[['normalized_text', 'cleaned_review_text']].head())

                                     normalized_text  \
0  it was okay. i didn't really care for the ending.   
1  what a great story! very thought provoking and...   
2  i must admit, i wasn't interested in reading t...   
3  i loved this book. very interesting story and ...   
4  it made me think of a society that satan would...   

                                 cleaned_review_text  
0     it was okay i didnt really care for the ending  
1  what a great story very thought provoking and ...  
2  i must admit i wasnt interested in reading thi...  
3  i loved this book very interesting story and l...  
4  it made me think of a society that satan would...  


##Stopword Removal

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return ' '.join(filtered_sentence)

sampled['review_text_nostopword'] = sampled['cleaned_review_text'].apply(remove_stopwords)
print(sampled[['cleaned_review_text', 'review_text_nostopword']].head())

                                 cleaned_review_text  \
0     it was okay i didnt really care for the ending   
1  what a great story very thought provoking and ...   
2  i must admit i wasnt interested in reading thi...   
3  i loved this book very interesting story and l...   
4  it made me think of a society that satan would...   

                              review_text_nostopword  
0                      okay didnt really care ending  
1  great story thought provoking made feel gratef...  
2  must admit wasnt interested reading book proba...  
3  loved book interesting story louis lowry reall...  
4                made think society satan would like  


Tokenization


In [None]:
# from nltk.tokenize import word_tokenize

# def tokenize_text(text):
#     return word_tokenize(text)

# sampled['tokenized_text'] = sampled['review_text_nostopword'].apply(tokenize_text)

# print(sampled[['review_text_nostopword', 'tokenized_text']].head())

In [None]:
get_ipython().system('pip install spacy')



##Lemmatization

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    # Extract lemmas for each token and join them into a string
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

sampled['lemmatized_text'] = sampled['review_text_nostopword'].apply(lemmatize_text)
print(sampled[['review_text_nostopword', 'lemmatized_text']].head())

KeyboardInterrupt: 

POS Tagging

In [None]:
# def pos_tag_text(text):
#     doc = nlp(text)
#     # Extract POS tag for each token and join them into a string
#     pos_tags = [(token.text, token.pos_) for token in doc]
#     return pos_tags

# sampled['pos_tagged_text'] = sampled['lemmatized_text'].apply(pos_tag_text)

# print(sampled[['lemmatized_text', 'pos_tagged_text']].head())

Named Entity Recognition

In [None]:
# def extract_named_entities(text):
#     doc = nlp(text)
#     # Extract named entities
#     entities = [(ent.text, ent.label_) for ent in doc.ents]
#     return entities

# sampled['named_entities'] = sampled['lemmatized_text'].apply(extract_named_entities)
# print(sampled[['lemmatized_text', 'named_entities']].head())

In [None]:
# print(sampled[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())

In [None]:
# # Sort the DataFrame by the length of the 'review_text' column
# df_sorted_by_review_length = df.copy()
# df_sorted_by_review_length['review_text_length'] = df_sorted_by_review_length['review_text'].apply(len)
# df_sorted_by_review_length = df_sorted_by_review_length.sort_values(by='review_text_length', ascending=True)
# print(df_sorted_by_review_length[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())

#Topic Modeling


# New Section

##BERTopic



In [10]:
!pip install bertopic sentence-transformers umap-learn hdbscan transformers

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


###Text Embedding

In [51]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import umap.umap_ as umap
import hdbscan
embedding_model = SentenceTransformer("intfloat/e5-small-v2")


###UMAP

In [106]:
umap_model = umap.UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)



###HDBSCAN

In [107]:
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=100,
    min_samples=30,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

###Stopword removal by CountVectorizer

In [58]:
vectorizer_model = CountVectorizer(stop_words="english")

###cTF-IDF

In [59]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

###MMR to diversify words

In [60]:
representation_model = MaximalMarginalRelevance(diversity=0.2)


###Zero-shot topic labeling

In [None]:
# candidate_topics = [
#     "romance", "mystery", "fantasy", "science fiction", "thriller",
#     "self-help", "psychology", "memoir", "history", "children",
#     "philosophy", "religion", "politics", "crime", "biography",
#     "adventure", "young adult", "graphic novel", "family drama"
# ]

# label_model = ZeroShotClassification(
#     model="facebook/bart-large-mnli",
#     candidate_topics=candidate_topics,
# )



###Build BERTopic

In [61]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [18]:
# import pandas as pd
# model = BERTopic.load("sampled_model")

# results = pd.read_csv("bertopic_results.csv")

In [20]:
# docs = results["review_text"].tolist()
# topics = results["topic"].tolist()
# probs = results["topic_probability"].tolist()

In [108]:
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True
)

docs = sampled['normalized_text'].tolist()

topics, probs = model.fit_transform(docs)

topic_info = model.get_topic_info()




2025-11-19 15:14:07,722 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/375 [00:00<?, ?it/s]

2025-11-19 15:14:34,490 - BERTopic - Embedding - Completed ✓
2025-11-19 15:14:34,491 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-19 15:14:50,976 - BERTopic - Dimensionality - Completed ✓
2025-11-19 15:14:50,978 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-19 15:14:51,489 - BERTopic - Cluster - Completed ✓
2025-11-19 15:14:51,494 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-19 15:14:52,346 - BERTopic - Representation - Completed ✓


In [112]:
model.save("sampled_model")



In [None]:
model.visualize_topics()

In [110]:
model.visualize_term_rank()


In [111]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1894,-1_read_loved_series_stars,"[read, loved, series, stars, love, characters,...",[i finally done it! i've finally finished this...
1,0,2899,0_read_great_ending_characters,"[read, great, ending, characters, books, love,...","[i laughed i cried, what more could you want. ..."
2,1,1105,1_creepy_scary_illustrations_horror,"[creepy, scary, illustrations, horror, emily, ...",[oh. my. goodness. holymolyholymolyholymolyhol...
3,2,1032,2_vance_hillbilly_memoir_poverty,"[vance, hillbilly, memoir, poverty, appalachia...",[j.d. vance grew up poor with a mother who bat...
4,3,996,3_dodd_germany_larson_ambassador,"[dodd, germany, larson, ambassador, martha, hi...","[what was it really like, living in berlin in ..."
5,4,898,4_rupi_milk_healing_words,"[rupi, milk, healing, words, poem, breaking, t...",[this collection of poetry has sky-rocketed in...
6,5,740,5_flavia_luce_chemistry_11,"[flavia, luce, chemistry, 11, murder, poisons,...",[flavia de luce is a complex little girl. unca...
7,6,578,6_rowling_voldemort_snape_hallows,"[rowling, voldemort, snape, hallows, dumbledor...",[spoilers ahead folks. harry potter is one of ...
8,7,494,7_bernadette_semple_characters_satire,"[bernadette, semple, characters, satire, hilar...",[a synopsis: things going against bernadette f...
9,8,421,8_lou_louisa_jojo_quadriplegic,"[lou, louisa, jojo, quadriplegic, love, trayno...","[this is not a fluffy, easy read. it is deeply..."


In [113]:
fig = model.visualize_barchart(top_n_topics=12)
fig.show()



In [115]:
sampled["topic"] = topics
sampled["topic_probability"] = probs
sampled.to_csv("bertopic_results.csv", index=False)

In [None]:
# model = BERTopic.load("sampled_model")
# dir(model)


In [None]:
# model.probabilities_

In [None]:
# fig = model.visualize_barchart()
# fig.show()


**Reduced the number of topics because the original model produced too many small or irrelevant topics**

In [None]:
# candidates = [20, 15, 10, 5]
# reduced_models = {}

# for k in candidates:
#     base = BERTopic.load("original_model")
#     reduced = base.reduce_topics(docs, nr_topics=k)


In [None]:
# # for n in candidates:
#     display(reduced_models[n].get_topic_info())

In [None]:
# best = sorted(results, key=lambda x: (x[1], x[2]), reverse=True)[0]
# print("Best Topic Model Size:")
# print(f"Topics = {best[0]}")
# print(f"Coherence = {best[1]:.4f}")
# print(f"Diversity = {best[2]:.4f}")


In [None]:
# model.reduce_topics(docs, nr_topics=5)
# reduced_topic_info = model.get_topic_info()
# print(reduced_topic_info)

##LDA

In [None]:
!pip install gensim

In [None]:
from gensim.utils import simple_preprocess
sampled["lemmatized_text"] = sampled["normalized_text"].apply(lemmatize_text)
texts = sampled["lemmatized_text"].apply(simple_preprocess)
sampled.shape

In [None]:
from gensim import corpora

dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
!pip install pyLDAvis

In [None]:
from gensim import models
n_topics = 7

lda_model = models.LdaModel(corpus=corpus, num_topics=n_topics)
lda_model.print_topics()

In [None]:
Import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
from gensim import models
n_topics = 10

lda_model = models.LdaModel(corpus=corpus, num_topics=n_topics)
lda_model.print_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
n_topics = 15

lda_model = models.LdaModel(corpus=corpus, num_topics=n_topics)
lda_model.print_topics()

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
from gensim import models

n_topics = 20

lda_model = models.LdaModel(corpus=corpus, num_topics=n_topics)
lda_model.print_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
print(dictionary.token2id)
print(dictionary.id2token)


#Summarization

### Extractive Summarization - LexRank

**Install and import libraries**

In [28]:
!pip install lexrank path.py

import pandas as pd
from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
from nltk.tokenize import sent_tokenize
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

Collecting lexrank
  Downloading lexrank-0.1.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path.py
  Downloading path.py-12.5.0-py3-none-any.whl.metadata (1.3 kB)
Collecting pyrsistent>=0.14.0 (from lexrank)
  Downloading pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting urlextract>=0.7 (from lexrank)
  Downloading urlextract-1.9.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path (from path.py)
  Downloading path-17.1.1-py3-none-any.whl.metadata (6.5 kB)
Collecting uritools (from urlextract>=0.7->lexrank)
  Downloading uritools-5.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading lexrank-0.1.0-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.8/69.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading path.py-12.5.0-py3-none-any.whl (2.3 kB)
Downloading pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (122 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Load BERTopic results and inspect topics**

In [116]:
df = pd.read_csv("bertopic_results.csv")
df.head()

Unnamed: 0,book_id,user_id,review_id,rating,review_text,date_added,date_updated,n_votes,n_comments,language,normalized_text,topic,topic_probability
0,3636,2bc509b5178d5cb4f1cbb2688014069f,2ce5398c9dbb7d28864306d89cfb250f,4,Easy read and cute story if you are into dysti...,Mon Jun 11 21:27:55 -0700 2012,Mon Jun 11 21:34:37 -0700 2012,0,0,en,easy read and cute story if you are into dysti...,0,1.0
1,3636,d412ea805a486ce0d16c97df1b053827,e2c5ea07ea90987eedbaf318cf4f4131,3,"Very interesting. A little 'Orwell', a little ...",Mon Oct 24 04:56:36 -0700 2011,Tue Jan 03 05:21:44 -0800 2012,0,0,en,"very interesting. a little 'orwell', a little ...",0,1.0
2,3636,41ec02777adaa8b57555e7c55015ec08,ee5186ccad19cf5edd833194d4cf9689,5,This was the best book that I have read in a l...,Mon Oct 13 18:24:47 -0700 2014,Mon Oct 27 13:50:47 -0700 2014,0,0,en,this was the best book that i have read in a l...,0,0.972148
3,3636,e03f562917ffd87d5f17402815c40a7f,aff7abee3861b24dc3f904b0dff11488,5,"Outstanding. Truly amazing. Thought provoking,...",Fri May 20 14:20:25 -0700 2011,Fri May 20 14:23:29 -0700 2011,0,0,en,"outstanding. truly amazing. thought provoking,...",0,0.972148
4,3636,4e959b1e2a791afa91d235a4126fbb83,892f3cff39f2d84afc6564b285df2f52,4,Well that was a quick read. I find myself read...,Sat Apr 14 18:09:39 -0700 2012,Sun Apr 15 15:08:55 -0700 2012,0,0,en,well that was a quick read. i find myself read...,0,1.0


In [117]:
# remove topic = -1 (outlier)
df = df[df["topic"] != -1]


print("Total rows after this step:", len(df))

Total rows after this step: 10106


**Prepare the LexRank corpus**

In [118]:
# Create corpus
all_docs = []

for text in df["review_text"].dropna().tolist():
    sentences = sent_tokenize(str(text))
    if len(sentences) > 0:
        all_docs.append(sentences)

lexrank = LexRank(all_docs, stopwords=STOPWORDS["en"])

print("Number of documents in LexRank corpus:", len(all_docs))

Number of documents in LexRank corpus: 10106


**Summarization function**

In [119]:
def summarize_topic(texts, summary_size=3):
    """
    texts: list ของข้อความรีวิว (string) ภายใน topic นั้น
    summary_size: จำนวนประโยคที่อยากให้สรุปออกมา

    return: list ของประโยคที่เป็น summary
    """
    sentences = []
    for t in texts:
        sentences.extend(sent_tokenize(str(t)))

    if len(sentences) < 3:
        return sentences[:summary_size]

    try:
        summary = lexrank.get_summary(
            sentences,
            summary_size=min(summary_size, len(sentences)),
            threshold=0.1
        )
        return summary
    except ValueError:
        return sentences[:summary_size]


**Summaries for each topic**

In [121]:
topic_summaries = {}

for t in sorted(df["topic"].unique()):
    subset = df[df["topic"] == t]
    texts = subset["review_text"].dropna().tolist()

    if not texts:
        continue

    summary_sentences = summarize_topic(texts, summary_size=3)

    topic_summaries[t] = {
        "n_reviews": len(subset),
        "summary_sentences": summary_sentences,
        "summary_text": " ".join(summary_sentences)
    }

print("Number of topics summarized:", len(topic_summaries))
print("Topic IDs summarized:", list(topic_summaries.keys()))

Number of topics summarized: 12
Topic IDs summarized: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11)]


**Display the summaries**

In [122]:
for t, info in topic_summaries.items():
    print(f"\n=== Topic {t} | n_reviews = {info['n_reviews']} ===")
    for i, s in enumerate(info["summary_sentences"], 1):
        print(f"{i}. {s}")


=== Topic 0 | n_reviews = 2899 ===
1. Best book I've read in a while.
2. Everyone should read this book!
3. Read this book myself when it first came out.

=== Topic 1 | n_reviews = 1105 ===
1. This graphic novel has 5 different creepy stories, I liked 3 out of the 5 stories which is pretty awesome.
2. Creepy, beautiful art and stories
3. Creepy stories.

=== Topic 2 | n_reviews = 1032 ===
1. Vance is a hillbilly.
2. I still don't know how I feel about Vance and his "memoir".
3. What Vance is able to say in this book is, I think, what many of us think but feel like isn't ours to say.

=== Topic 3 | n_reviews = 996 ===
1. In the Garden of Beasts looks at the rise of Hitler's Germany through the eyes of the American ambassador to Germany, William E. Dodd, and his daughter, Martha Dodd.
2. This book looks at the rise of the Nazi part in Germany at the end of World War I through the eyes of the American ambassador to Germany, William Dodd and his daughter Martha.
3. Larson tells the dark s

**Convert results to a DataFrame**

In [123]:
rows = []
for t, info in topic_summaries.items():
    rows.append({
        "topic": t,
        "n_reviews": info["n_reviews"],
        "summary": info["summary_text"]
    })

summary_df = pd.DataFrame(rows).sort_values("topic")
summary_df

Unnamed: 0,topic,n_reviews,summary
0,0,2899,Best book I've read in a while. Everyone shoul...
1,1,1105,This graphic novel has 5 different creepy stor...
2,2,1032,Vance is a hillbilly. I still don't know how I...
3,3,996,In the Garden of Beasts looks at the rise of H...
4,4,898,"Some poetry I understand, a few poems I love a..."
5,5,740,I love Flavia and the mystery. Loved Flavia an...
6,6,578,The end of the Harry Potter series. I read thi...
7,7,494,"I'm not sure I believe in Bernadette, but I lo..."
8,8,421,This is a story about life and love and you sh...
9,9,414,Katniss and Peeta find themselves competing in...


### Abstractive Summarization

**Install and import libraries**

In [124]:
!pip install transformers sentencepiece accelerate -q

In [125]:
from transformers import pipeline
import pandas as pd

**Create an abstractive summarization pipeline using a pre-trained BART model.**

In [126]:
abstractive_summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
    device_map="auto"
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


**Rewrites the extractive summary into an abstractive summarization**

In [127]:
def abstractive_summarize(text, max_length=80, min_length=30):
    """
    รับ summary จาก LexRank (text) แล้วให้โมเดลเขียนสรุปใหม่แบบ abstractive
    """
    if not text or len(text.split()) < 10:
        return text

    result = abstractive_summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False
    )
    return result[0]["summary_text"]

**Abstractive summary for each topic**

In [128]:
abstractive_summaries = []

for i, row in summary_df.iterrows():
    topic_id = row["topic"]
    extractive_summary = row["summary"]

    abs_sum = abstractive_summarize(extractive_summary)

    abstractive_summaries.append(abs_sum)

summary_df["abstractive_summary"] = abstractive_summaries

summary_df

Your max_length is set to 80, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 80, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 80, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 80, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max

Unnamed: 0,topic,n_reviews,summary,abstractive_summary
0,0,2899,Best book I've read in a while. Everyone shoul...,Best book I've read in a while. Everyone shoul...
1,1,1105,This graphic novel has 5 different creepy stor...,This graphic novel has 5 different creepy stor...
2,2,1032,Vance is a hillbilly. I still don't know how I...,"""I still don't know how I feel about Vance and..."
3,3,996,In the Garden of Beasts looks at the rise of H...,In the Garden of Beasts looks at the rise of H...
4,4,898,"Some poetry I understand, a few poems I love a...",Rupi Kaur's Milk and Honey is a collection of ...
5,5,740,I love Flavia and the mystery. Loved Flavia an...,Loved Flavia and will certainly read the next ...
6,6,578,The end of the Harry Potter series. I read thi...,A good ending to the Harry potter series. The ...
7,7,494,"I'm not sure I believe in Bernadette, but I lo...","""I'm not sure I believe in Bernadette, but I l..."
8,8,421,This is a story about life and love and you sh...,This is a story about life and love and you sh...
9,9,414,Katniss and Peeta find themselves competing in...,Katniss and Peeta have to battle the other tri...


In [129]:
summary_df

Unnamed: 0,topic,n_reviews,summary,abstractive_summary
0,0,2899,Best book I've read in a while. Everyone shoul...,Best book I've read in a while. Everyone shoul...
1,1,1105,This graphic novel has 5 different creepy stor...,This graphic novel has 5 different creepy stor...
2,2,1032,Vance is a hillbilly. I still don't know how I...,"""I still don't know how I feel about Vance and..."
3,3,996,In the Garden of Beasts looks at the rise of H...,In the Garden of Beasts looks at the rise of H...
4,4,898,"Some poetry I understand, a few poems I love a...",Rupi Kaur's Milk and Honey is a collection of ...
5,5,740,I love Flavia and the mystery. Loved Flavia an...,Loved Flavia and will certainly read the next ...
6,6,578,The end of the Harry Potter series. I read thi...,A good ending to the Harry potter series. The ...
7,7,494,"I'm not sure I believe in Bernadette, but I lo...","""I'm not sure I believe in Bernadette, but I l..."
8,8,421,This is a story about life and love and you sh...,This is a story about life and love and you sh...
9,9,414,Katniss and Peeta find themselves competing in...,Katniss and Peeta have to battle the other tri...


**Inspect a few topics with both extractive and abstractive summaries**

In [132]:
for i, row in summary_df.iterrows():
    print(f"\n=== Topic {row['topic']} | n_reviews = {row['n_reviews']} ===")
    print("Extractive summary (LexRank):")
    print(row["summary"])
    print("\nAbstractive summary (BART):")
    print(row["abstractive_summary"])
    print("-" * 80)


=== Topic 0 | n_reviews = 2899 ===
Extractive summary (LexRank):
Best book I've read in a while. Everyone should read this book! Read this book myself when it first came out.

Abstractive summary (BART):
Best book I've read in a while. Everyone should read this book! Read this book myself when it first came out. It's a great book.
--------------------------------------------------------------------------------

=== Topic 1 | n_reviews = 1105 ===
Extractive summary (LexRank):
This graphic novel has 5 different creepy stories, I liked 3 out of the 5 stories which is pretty awesome. Creepy, beautiful art and stories Creepy stories.

Abstractive summary (BART):
This graphic novel has 5 different creepy stories, I liked 3 out of the 5 stories which is pretty awesome. Creepy, beautiful art and stories.
--------------------------------------------------------------------------------

=== Topic 2 | n_reviews = 1032 ===
Extractive summary (LexRank):
Vance is a hillbilly. I still don't know how

In [131]:
import pandas as pd
df = pd.read_csv("bertopic_results.csv")

print("Columns:", df.columns.tolist())
print("Unique topics in CSV:", sorted(df["topic"].unique()))

Columns: ['book_id', 'user_id', 'review_id', 'rating', 'review_text', 'date_added', 'date_updated', 'n_votes', 'n_comments', 'language', 'normalized_text', 'topic', 'topic_probability']
Unique topics in CSV: [np.int64(-1), np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11)]
