# Data Preprocessing


In [None]:
import pandas as pd

# Read the review dataset straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Pontakorn-Wich/Mini_project/master/data/books_1250_above_reviews.csv"
df = pd.read_csv(url)

# Plain-text preview of the first rows.
print(df.head())

    book_id                           user_id  \
0  13526165  8842281e1d1347389f2ab93d60773d4d   
1   9938498  8842281e1d1347389f2ab93d60773d4d   
2   2767052  8842281e1d1347389f2ab93d60773d4d   
3    136251  8842281e1d1347389f2ab93d60773d4d   
4  15507958  7504b2aee1ecb5b2872d3da381c6c91e   

                          review_id  rating  \
0  51fe3e46c7f8eb39f5623d1bd8bbbbfc       5   
1  bff5654c639c7b008571c3d4398d930a       4   
2  248c011811e945eca861b5c31a549291       5   
3  132eab4c9a3724493204cc083e0e2ecc       5   
4  63ff74279e46b247cb1754313b160006       4   

                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                       date_added                    date_updated  n_votes  \
0  Thu 

### Sampling

In [None]:
# Draw a reproducible random sample of up to REVIEWS_PER_BOOK reviews for
# every book. Capping the sample size at the group size keeps the cell from
# raising ValueError when a book has fewer reviews than requested; books with
# enough reviews are sampled exactly as before (same size, same seed).
REVIEWS_PER_BOOK = 1200

sampled = (
    df.groupby("book_id")
      .apply(lambda x: x.sample(min(len(x), REVIEWS_PER_BOOK), random_state=42))
      .reset_index(drop=True)
)

  .apply(lambda x: x.sample(1200, random_state=42))


## Remove non-English reviews

In [None]:
get_ipython().system('pip install langdetect')

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.1/981.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=e670cb0b1f94c143c4ea249a0abcf59811cbae12716b4830f3e894466f0d6457
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f0

In [None]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is randomized by default, so the same text can be labelled
# differently between runs. Fixing the seed makes the language column (and
# therefore the English-only filter that follows) reproducible.
DetectorFactory.seed = 0


def detect_language_safe(text):
    """Return the language code that langdetect assigns to `text`.

    Returns 'empty' when `text` is not a string or contains only whitespace,
    and 'unknown' when langdetect raises LangDetectException.
    """
    if not isinstance(text, str) or not text.strip():
        return 'empty'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


sampled['language'] = sampled['review_text'].apply(detect_language_safe)
print(sampled[['review_text', 'language']].head())

                                         review_text language
0  It was okay. I didn't really care for the ending.       en
1  What a great story! Very thought provoking and...       en
2  I must admit, I wasn't interested in reading t...       en
3  I loved this book. Very interesting story and ...       en
4  It made me think of a society that Satan would...       en


In [None]:
sampled_english = sampled[sampled['language'] == 'en'].copy()
print(sampled_english[['review_text', 'language']].head())

                                         review_text language
0  It was okay. I didn't really care for the ending.       en
1  What a great story! Very thought provoking and...       en
2  I must admit, I wasn't interested in reading t...       en
3  I loved this book. Very interesting story and ...       en
4  It made me think of a society that Satan would...       en


In [None]:
sampled = sampled_english

##Normalization


In [None]:
import re

# Lowercase each review, then squeeze every run of whitespace into a single
# space and trim both ends.
sampled['normalized_text'] = (
    sampled['review_text']
    .str.lower()
    .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
)

print("Text normalized (lowercase and whitespace removal).")
print(sampled[['review_text', 'normalized_text']].head())

Text normalized (lowercase and whitespace removal).
                                         review_text  \
0  It was okay. I didn't really care for the ending.   
1  What a great story! Very thought provoking and...   
2  I must admit, I wasn't interested in reading t...   
3  I loved this book. Very interesting story and ...   
4  It made me think of a society that Satan would...   

                                     normalized_text  
0  it was okay. i didn't really care for the ending.  
1  what a great story! very thought provoking and...  
2  i must admit, i wasn't interested in reading t...  
3  i loved this book. very interesting story and ...  
4  it made me think of a society that satan would...  


##Special Characters Removal


In [None]:
def remove_noise(text):
    """Keep only ASCII letters and whitespace in `text`.

    Every other character is deleted (not replaced by a space), after which
    each whitespace run is collapsed to one space and the ends are trimmed.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

sampled['cleaned_review_text'] = sampled['normalized_text'].apply(remove_noise)
print(sampled[['normalized_text', 'cleaned_review_text']].head())

                                     normalized_text  \
0  it was okay. i didn't really care for the ending.   
1  what a great story! very thought provoking and...   
2  i must admit, i wasn't interested in reading t...   
3  i loved this book. very interesting story and ...   
4  it made me think of a society that satan would...   

                                 cleaned_review_text  
0     it was okay i didnt really care for the ending  
1  what a great story very thought provoking and ...  
2  i must admit i wasnt interested in reading thi...  
3  i loved this book very interesting story and l...  
4  it made me think of a society that satan would...  


##Stopword Removal

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize `text` with `word_tokenize` and drop tokens found in `stop_words`.

    Tokens are lowercased for the membership check only; the surviving tokens
    are returned unchanged, joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

sampled['review_text_nostopword'] = sampled['cleaned_review_text'].apply(remove_stopwords)
print(sampled[['cleaned_review_text', 'review_text_nostopword']].head())

                                 cleaned_review_text  \
0     it was okay i didnt really care for the ending   
1  what a great story very thought provoking and ...   
2  i must admit i wasnt interested in reading thi...   
3  i loved this book very interesting story and l...   
4  it made me think of a society that satan would...   

                              review_text_nostopword  
0                      okay didnt really care ending  
1  great story thought provoking made feel gratef...  
2  must admit wasnt interested reading book proba...  
3  loved book interesting story louis lowry reall...  
4                made think society satan would like  


## Tokenization


In [None]:
# from nltk.tokenize import word_tokenize

# def tokenize_text(text):
#     return word_tokenize(text)

# sampled['tokenized_text'] = sampled['review_text_nostopword'].apply(tokenize_text)

# print(sampled[['review_text_nostopword', 'tokenized_text']].head())

In [None]:
get_ipython().system('pip install spacy')



##Lemmatization

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    """Run `text` through the spaCy pipeline `nlp` and return the lemma of
    every token, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

sampled['lemmatized_text'] = sampled['review_text_nostopword'].apply(lemmatize_text)
print(sampled[['review_text_nostopword', 'lemmatized_text']].head())

                              review_text_nostopword  \
0                      okay didnt really care ending   
1  great story thought provoking made feel gratef...   
2  must admit wasnt interested reading book proba...   
3  loved book interesting story louis lowry reall...   
4                made think society satan would like   

                                     lemmatized_text  
0                        okay do not really care end  
1  great story think provoking make feel grateful...  
2  must admit be not interested reading book prob...  
3  love book interesting story louis lowry really...  
4                make think society satan would like  


## POS Tagging

In [None]:
# def pos_tag_text(text):
#     doc = nlp(text)
#     # Extract POS tag for each token and join them into a string
#     pos_tags = [(token.text, token.pos_) for token in doc]
#     return pos_tags

# sampled['pos_tagged_text'] = sampled['lemmatized_text'].apply(pos_tag_text)

# print(sampled[['lemmatized_text', 'pos_tagged_text']].head())

## Named Entity Recognition

In [None]:
# def extract_named_entities(text):
#     doc = nlp(text)
#     # Extract named entities
#     entities = [(ent.text, ent.label_) for ent in doc.ents]
#     return entities

# sampled['named_entities'] = sampled['lemmatized_text'].apply(extract_named_entities)
# print(sampled[['lemmatized_text', 'named_entities']].head())

In [None]:
# print(sampled[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())

In [None]:
# # Sort the DataFrame by the length of the 'review_text' column
# df_sorted_by_review_length = df.copy()
# df_sorted_by_review_length['review_text_length'] = df_sorted_by_review_length['review_text'].apply(len)
# df_sorted_by_review_length = df_sorted_by_review_length.sort_values(by='review_text_length', ascending=True)
# print(df_sorted_by_review_length[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())

#Topic Modeling


# New Section

##BERTopic



In [None]:
!pip install bertopic sentence-transformers umap-learn hdbscan transformers

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


###Text Embedding

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import umap.umap_ as umap
import hdbscan
embedding_model = SentenceTransformer("intfloat/e5-small-v2")


  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


###UMAP

In [None]:
umap_model = umap.UMAP(
    n_neighbors=30,
    n_components=2,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)



###HDBSCAN

In [None]:
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=80,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom'
)


###Stopword removal by CountVectorizer

In [None]:
vectorizer_model = CountVectorizer(stop_words="english")

###cTF-IDF

In [None]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

###MMR to diversify words

In [None]:
representation_model = MaximalMarginalRelevance(diversity=0.2)


###Zero-shot topic labeling

In [None]:
# candidate_topics = [
#     "romance", "mystery", "fantasy", "science fiction", "thriller",
#     "self-help", "psychology", "memoir", "history", "children",
#     "philosophy", "religion", "politics", "crime", "biography",
#     "adventure", "young adult", "graphic novel", "family drama"
# ]

# label_model = ZeroShotClassification(
#     model="facebook/bart-large-mnli",
#     candidate_topics=candidate_topics,
# )



###Build BERTopic

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [None]:
# Load the previously saved BERTopic model from "sampled_model".
# The following cells refer to `model` and `topic_info`, but the only cell
# that assigns them (the fit_transform cell) is commented out, so on a fresh
# kernel both names would be undefined. Bind them here from the loaded model.
loaded_model = BERTopic.load("sampled_model")
model = loaded_model
topic_info = model.get_topic_info()

In [None]:
# model = BERTopic(
#     embedding_model=embedding_model,
#     umap_model=umap_model,
#     hdbscan_model=hdbscan_model,
#     vectorizer_model=vectorizer_model,
#     ctfidf_model=ctfidf_model,
#     representation_model=representation_model,
#     verbose=True
# )

# docs = sampled['normalized_text'].tolist()

# topics, probs = model.fit_transform(docs)

# topic_info = model.get_topic_info()

# # model.save("sampled_model")

In [None]:
topic_info

In [None]:
model.visualize_topics()

In [None]:
fig = model.visualize_barchart(top_n_topics=10)
fig.show()

In [None]:
# fig = model.visualize_barchart()
# fig.show()

**The number of topics was reduced because the original model produced too many small or irrelevant topics.**

In [None]:
# candidates = [20, 15, 10, 5]
# reduced_models = {}

# for k in candidates:
#     base = BERTopic.load("original_model")
#     reduced = base.reduce_topics(docs, nr_topics=k)


In [None]:
# # for n in candidates:
#     display(reduced_models[n].get_topic_info())

In [None]:
# best = sorted(results, key=lambda x: (x[1], x[2]), reverse=True)[0]
# print("Best Topic Model Size:")
# print(f"Topics = {best[0]}")
# print(f"Coherence = {best[1]:.4f}")
# print(f"Diversity = {best[2]:.4f}")


In [None]:
# model.reduce_topics(docs, nr_topics=5)
# reduced_topic_info = model.get_topic_info()
# print(reduced_topic_info)

##LDA

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
from gensim.utils import simple_preprocess

# NOTE: this replaces the 'lemmatized_text' column created in the
# Lemmatization section; here the lemmas are computed from 'normalized_text'
# instead of 'review_text_nostopword'.
sampled["lemmatized_text"] = sampled["normalized_text"].map(lemmatize_text)

# One token list per review, produced by gensim's simple_preprocess.
texts = sampled["lemmatized_text"].map(simple_preprocess)
sampled.shape

(11315, 14)

In [None]:
from gensim import corpora

# Build the vocabulary from the token lists, then prune it with
# filter_extremes (no_below=5, no_above=0.5, keep_n=2000).
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)

# Bag-of-words vector for every document.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
from gensim import models

n_topics = 7

# id2word lets print_topics() show real tokens; without it gensim falls back
# to an identity mapping and every topic is printed as numeric token ids.
# random_state makes the trained topics reproducible between runs.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=42,
)
lda_model.print_topics()



KeyboardInterrupt: 

In [None]:
import pyLDAvis

# pyLDAvis renamed its gensim adapter to `gensim_models` (the installed
# 3.4.1 only ships the new name); fall back to the old module name so the
# cell also works on older installations.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

In [None]:
from gensim import models

n_topics = 10

# id2word makes print_topics() show tokens instead of numeric token ids;
# random_state makes the result reproducible.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=42,
)
lda_model.print_topics()



[(0,
  '0.032*"98" + 0.026*"25" + 0.021*"100" + 0.020*"103" + 0.018*"160" + 0.014*"27" + 0.014*"108" + 0.014*"18" + 0.013*"263" + 0.013*"3"'),
 (1,
  '0.033*"103" + 0.026*"78" + 0.022*"18" + 0.018*"3" + 0.018*"27" + 0.018*"69" + 0.017*"61" + 0.016*"4" + 0.013*"72" + 0.013*"108"'),
 (2,
  '0.024*"18" + 0.024*"4" + 0.024*"3" + 0.021*"78" + 0.018*"121" + 0.016*"1" + 0.014*"69" + 0.013*"108" + 0.012*"25" + 0.012*"166"'),
 (3,
  '0.023*"18" + 0.022*"1877" + 0.019*"98" + 0.016*"102" + 0.016*"3" + 0.015*"96" + 0.015*"27" + 0.014*"1715" + 0.012*"308" + 0.012*"93"'),
 (4,
  '0.082*"169" + 0.057*"359" + 0.027*"18" + 0.018*"78" + 0.015*"356" + 0.015*"69" + 0.015*"61" + 0.013*"3" + 0.013*"27" + 0.012*"286"'),
 (5,
  '0.030*"4" + 0.027*"61" + 0.025*"27" + 0.025*"78" + 0.024*"108" + 0.022*"1" + 0.016*"24" + 0.014*"18" + 0.013*"3" + 0.012*"65"'),
 (6,
  '0.043*"4" + 0.029*"271" + 0.026*"108" + 0.026*"1" + 0.016*"39" + 0.015*"65" + 0.014*"18" + 0.012*"27" + 0.011*"98" + 0.011*"78"'),
 (7,
  '0.041*"27

In [None]:
import pyLDAvis

# The gensim adapter is called `gensim_models` in current pyLDAvis releases;
# the old `pyLDAvis.gensim` name is kept only as a fallback.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

  return datetime.utcnow().replace(tzinfo=utc)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
from gensim import models  # imported here so the cell runs on its own

n_topics = 15

# id2word makes print_topics() show tokens instead of numeric token ids;
# random_state makes the result reproducible.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=42,
)
lda_model.print_topics()

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

[(0,
  '0.046*"103" + 0.043*"4" + 0.029*"78" + 0.022*"18" + 0.019*"108" + 0.019*"1" + 0.018*"25" + 0.015*"138" + 0.014*"61" + 0.013*"3"'),
 (1,
  '0.033*"78" + 0.028*"98" + 0.026*"108" + 0.023*"24" + 0.022*"4" + 0.015*"11" + 0.015*"61" + 0.013*"120" + 0.013*"1" + 0.012*"102"'),
 (2,
  '0.028*"605" + 0.026*"27" + 0.025*"18" + 0.014*"102" + 0.014*"239" + 0.013*"3" + 0.013*"214" + 0.011*"1003" + 0.011*"61" + 0.011*"4"'),
 (3,
  '0.040*"78" + 0.038*"3" + 0.021*"18" + 0.020*"69" + 0.019*"108" + 0.017*"4" + 0.012*"25" + 0.012*"16" + 0.012*"1429" + 0.011*"239"'),
 (4,
  '0.036*"61" + 0.025*"18" + 0.016*"24" + 0.014*"108" + 0.014*"102" + 0.014*"78" + 0.013*"25" + 0.012*"27" + 0.012*"98" + 0.011*"3"'),
 (5,
  '0.123*"160" + 0.054*"25" + 0.036*"78" + 0.025*"1372" + 0.023*"59" + 0.023*"707" + 0.022*"205" + 0.018*"807" + 0.017*"61" + 0.016*"107"'),
 (6,
  '0.038*"18" + 0.024*"359" + 0.017*"103" + 0.016*"102" + 0.015*"149" + 0.014*"61" + 0.014*"39" + 0.014*"27" + 0.012*"182" + 0.011*"78"'),
 (7,
  

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
import pyLDAvis

# Import the adapter in this cell instead of relying on an earlier cell, and
# prefer the current module name `gensim_models` over the old `gensim`.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
from gensim import models

n_topics = 20

# id2word makes print_topics() show tokens instead of numeric token ids;
# random_state makes the result reproducible.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=42,
)
lda_model.print_topics()

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

[(0,
  '0.033*"65" + 0.028*"25" + 0.024*"1539" + 0.023*"6" + 0.022*"27" + 0.022*"18" + 0.019*"78" + 0.018*"108" + 0.015*"12" + 0.014*"976"'),
 (1,
  '0.039*"149" + 0.031*"97" + 0.026*"18" + 0.021*"356" + 0.021*"359" + 0.017*"25" + 0.017*"78" + 0.016*"233" + 0.016*"3" + 0.015*"263"'),
 (2,
  '0.045*"359" + 0.027*"18" + 0.026*"169" + 0.025*"27" + 0.017*"25" + 0.015*"182" + 0.013*"61" + 0.012*"102" + 0.011*"4" + 0.011*"166"'),
 (3,
  '0.054*"1222" + 0.053*"160" + 0.052*"69" + 0.041*"78" + 0.039*"1306" + 0.036*"30" + 0.034*"1085" + 0.028*"164" + 0.024*"1426" + 0.023*"72"'),
 (4,
  '0.149*"103" + 0.037*"33" + 0.023*"138" + 0.022*"4" + 0.020*"201" + 0.020*"3" + 0.018*"78" + 0.017*"102" + 0.015*"108" + 0.011*"18"'),
 (5,
  '0.046*"78" + 0.038*"4" + 0.028*"108" + 0.026*"3" + 0.022*"1" + 0.019*"61" + 0.017*"27" + 0.017*"18" + 0.013*"87" + 0.012*"65"'),
 (6,
  '0.032*"4" + 0.028*"1" + 0.021*"27" + 0.018*"61" + 0.017*"3" + 0.014*"78" + 0.014*"103" + 0.014*"102" + 0.014*"25" + 0.013*"39"'),
 (7,
 

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
import pyLDAvis

# The gensim adapter is called `gensim_models` in current pyLDAvis releases;
# the old `pyLDAvis.gensim` name is kept only as a fallback.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Dictionary.id2token is filled lazily by gensim, so printing it directly
# shows an empty dict. Build the reverse mapping from token2id instead.
# Only a short preview is printed to avoid dumping the whole vocabulary.
PREVIEW_SIZE = 20
id2token = {token_id: token for token, token_id in dictionary.token2id.items()}

print(dict(list(dictionary.token2id.items())[:PREVIEW_SIZE]))
print(dict(list(id2token.items())[:PREVIEW_SIZE]))


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


{}


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


# Summarization

## Extractive Summarization - LexRank (Ver.1)

In [None]:
!pip install lexrank path.py

Collecting lexrank
  Downloading lexrank-0.1.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path.py
  Downloading path.py-12.5.0-py3-none-any.whl.metadata (1.3 kB)
Collecting pyrsistent>=0.14.0 (from lexrank)
  Downloading pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting urlextract>=0.7 (from lexrank)
  Downloading urlextract-1.9.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path (from path.py)
  Downloading path-17.1.1-py3-none-any.whl.metadata (6.5 kB)
Collecting uritools (from urlextract>=0.7->lexrank)
  Downloading uritools-5.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading lexrank-0.1.0-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.8/69.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading path.py-12.5.0-py3-none-any.whl (2.3 kB)
Downloading pyrsistent-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (122 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Load bertopic_results.csv**

In [None]:
import pandas as pd

df = pd.read_csv("bertopic_results.csv")
df.columns.tolist()

['book_id',
 'user_id',
 'review_id',
 'rating',
 'review_text',
 'date_added',
 'date_updated',
 'n_votes',
 'n_comments',
 'language',
 'normalized_text',
 'cleaned_review_text',
 'review_text_nostopword',
 'lemmatized_text',
 'topic',
 'topic_probability']

In [None]:
import pandas as pd
from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
from nltk.tokenize import sent_tokenize
import nltk

nltk.download('punkt')

In [None]:
# Keep only rows that are not in the outlier topic (-1) and whose
# topic probability is at least 0.30.
df = df[(df["topic"] != -1) & (df["topic_probability"] >= 0.30)]

print("Unique topics:", sorted(df["topic"].unique()))
print("Total rows after filtering:", len(df))

# Build the LexRank corpus from all remaining reviews: one list of sentences
# per review, skipping reviews that yield no sentences.
tokenized_reviews = (sent_tokenize(str(text)) for text in df["review_text"].dropna().tolist())
all_docs = [sentences for sentences in tokenized_reviews if sentences]

lexrank = LexRank(all_docs, stopwords=STOPWORDS["en"])

#  Helper function: summarize one topic
def summarize_topic(texts, summary_size=3):
    """Pick summary sentences for one topic using the module-level `lexrank`.

    Parameters
    ----------
    texts : iterable
        Reviews belonging to one topic; each item is converted with str()
        and split into sentences with `sent_tokenize`.
    summary_size : int, default 3
        Number of sentences requested.

    Returns
    -------
    list
        The sentences returned by `lexrank.get_summary`, or the first
        `summary_size` sentences when fewer than three sentences exist or
        LexRank raises ValueError.
    """
    sentences = [
        sentence
        for review in texts
        for sentence in sent_tokenize(str(review))
    ]

    # With fewer than three sentences, return them without running LexRank.
    if len(sentences) < 3:
        return sentences[:summary_size]

    try:
        return lexrank.get_summary(
            sentences,
            summary_size=min(summary_size, len(sentences)),
            threshold=0.1,
        )
    except ValueError:
        # e.g. "documents are not informative" -- fall back to the leading sentences
        return sentences[:summary_size]

#  Loop over topics and summarize
# The per-topic review list is stored in `topic_texts` rather than `texts`,
# so this cell no longer overwrites the tokenized `texts` corpus that the LDA
# cells build their dictionary and bag-of-words from.
topic_summaries = {}

for topic_id in sorted(df["topic"].unique()):
    topic_rows = df[df["topic"] == topic_id]
    topic_texts = topic_rows["review_text"].dropna().tolist()
    if not topic_texts:
        continue

    summary_sentences = summarize_topic(topic_texts, summary_size=3)

    topic_summaries[topic_id] = {
        "n_reviews": len(topic_rows),
        "summary_sentences": summary_sentences,
        "summary_text": " ".join(summary_sentences),
    }

#  Inspect a few topics
for topic_id, info in list(topic_summaries.items())[:5]:
    print(f"\n=== Topic {topic_id} | n_reviews = {info['n_reviews']} ===")
    for rank, sentence in enumerate(info["summary_sentences"], 1):
        print(f"{rank}. {sentence}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unique topics: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)]
Total rows after filtering: 9984

=== Topic 0 | n_reviews = 4247 ===
1. I think this is a book everyone should read.
2. Everyone should read this book!
3. Read this book.

=== Topic 1 | n_reviews = 997 ===
1. Vance is a hillbilly.
2. Vance is a good, not great, author.
3. Hillbilly Elegy is his memoir of his family life, growing up in a poor white American culture.

=== Topic 2 | n_reviews = 991 ===
1. This graphic novel has 5 different creepy stories, I liked 3 out of the 5 stories which is pretty awesome.
2. Creepy, beautiful art and stories
3. Each of the five stories are just creepy.

=== Topic 3 | n_reviews = 975 ===
1. In the Garden of Beasts looks at the rise of Hitler's Germany through the eyes of the American ambassador to Germany, William E. Dodd, and his daughter, Martha Dodd.
2. This book looks at the rise of the Nazi part in Germany at the en

In [None]:
# One row per summarized topic: id, number of reviews, joined summary text.
summary_df = pd.DataFrame(
    [
        {
            "topic": topic_id,
            "n_reviews": entry["n_reviews"],
            "summary": entry["summary_text"],
        }
        for topic_id, entry in topic_summaries.items()
    ]
)
summary_df.head()


Unnamed: 0,topic,n_reviews,summary
0,0,4247,I think this is a book everyone should read. E...
1,1,997,"Vance is a hillbilly. Vance is a good, not gre..."
2,2,991,This graphic novel has 5 different creepy stor...
3,3,975,In the Garden of Beasts looks at the rise of H...
4,4,800,"Some poetry I understand, a few poems I love a..."


ERROR:root:Did not find quickchart key chart-b44ac02c-63fe-4e23-8de5-3d7b9e150373 in chart cache
ERROR:root:Did not find quickchart key chart-e869c6ef-3d09-40bf-892c-380896998d59 in chart cache


## Summarization for 10 topics

### Extractive Summarization - LexRank

**Install and import libraries**

In [None]:
!pip install lexrank path.py

import pandas as pd
from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
from nltk.tokenize import sent_tokenize
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Load BERTopic results and inspect topics**

In [None]:
df = pd.read_csv("bertopic_results.csv")

print("Columns:", df.columns.tolist())
print("Unique topics in CSV:", sorted(df["topic"].unique()))

# Drop the outlier topic (-1); unlike Ver.1, no probability filter is applied here.
df = df.loc[df["topic"].ne(-1)]

print("Total rows after this step:", len(df))

Columns: ['book_id', 'user_id', 'review_id', 'rating', 'review_text', 'date_added', 'date_updated', 'n_votes', 'n_comments', 'language', 'normalized_text', 'cleaned_review_text', 'review_text_nostopword', 'lemmatized_text', 'topic', 'topic_probability']
Unique topics in CSV: [np.int64(-1), np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)]
Total rows after this step: 9998


**Prepare the LexRank corpus**

In [None]:
# Create corpus
# One list of sentences per review; reviews that yield no sentences are skipped.
tokenized_reviews = (sent_tokenize(str(text)) for text in df["review_text"].dropna().tolist())
all_docs = [sentences for sentences in tokenized_reviews if sentences]

lexrank = LexRank(all_docs, stopwords=STOPWORDS["en"])

print("Number of documents in LexRank corpus:", len(all_docs))

Number of documents in LexRank corpus: 9998


**Summarization function**

In [None]:
def summarize_topic(texts, summary_size=3, threshold=0.1):
    """
    Build an extractive summary of one topic's reviews with LexRank.

    Uses the module-level ``lexrank`` object and ``sent_tokenize``.

    Args:
        texts: list of review texts (strings) belonging to the topic.
        summary_size: number of sentences wanted in the summary.
        threshold: value forwarded as the ``threshold`` argument of
            ``lexrank.get_summary``; defaults to 0.1, the value that was
            previously hard-coded.

    Returns:
        list of sentences making up the summary. Empty when ``summary_size``
        is not positive or when ``texts`` yields no sentences.
    """
    # A non-positive size would otherwise end up in the fallback slice below,
    # where a negative value silently returns "all but the last N" sentences.
    if summary_size <= 0:
        return []

    sentences = []
    for t in texts:
        sentences.extend(sent_tokenize(str(t)))

    # Fewer than three sentences: hand them back unranked.
    if len(sentences) < 3:
        return sentences[:summary_size]

    try:
        return lexrank.get_summary(
            sentences,
            summary_size=min(summary_size, len(sentences)),
            threshold=threshold,
        )
    except ValueError:
        # Best-effort fallback: keep the leading sentences when ranking is rejected.
        return sentences[:summary_size]


**Summaries for each topic**

In [None]:
# Map each topic id to its review count and LexRank summary.
topic_summaries = {}
topic_column = df["topic"]

for topic_id in sorted(topic_column.unique()):
    topic_rows = df[topic_column == topic_id]
    topic_texts = topic_rows["review_text"].dropna().tolist()

    # Nothing to summarize when every review text in the topic is missing.
    if len(topic_texts) == 0:
        continue

    top_sentences = summarize_topic(topic_texts, summary_size=3)
    topic_summaries[topic_id] = dict(
        n_reviews=len(topic_rows),
        summary_sentences=top_sentences,
        summary_text=" ".join(top_sentences),
    )

print("Number of topics summarized:", len(topic_summaries))
print("Topic IDs summarized:", list(topic_summaries.keys()))

Number of topics summarized: 9
Topic IDs summarized: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8)]


**Display the summaries**

In [None]:
# Print a header per topic followed by its numbered summary sentences.
for topic_id, details in topic_summaries.items():
    print(f"\n=== Topic {topic_id} | n_reviews = {details['n_reviews']} ===")
    for position, sentence in enumerate(details["summary_sentences"], start=1):
        print(f"{position}. {sentence}")


=== Topic 0 | n_reviews = 4247 ===
1. Everyone should read this book!
2. I think this is a book everyone should read.
3. I now want to read the next book :)

=== Topic 1 | n_reviews = 998 ===
1. Vance is a hillbilly.
2. Vance is a good, not great, author.
3. Vance, however, did make it out.

=== Topic 2 | n_reviews = 991 ===
1. This graphic novel has 5 different creepy stories, I liked 3 out of the 5 stories which is pretty awesome.
2. Creepy, beautiful art and stories
3. The stories are creepy.

=== Topic 3 | n_reviews = 976 ===
1. In the Garden of Beasts looks at the rise of Hitler's Germany through the eyes of the American ambassador to Germany, William E. Dodd, and his daughter, Martha Dodd.
2. This book looks at the rise of the Nazi part in Germany at the end of World War I through the eyes of the American ambassador to Germany, William Dodd and his daughter Martha.
3. Larson tells the dark story of Hitler's rise to power through the perspective of Dodd, the US ambassador to Germ

**Convert results to a DataFrame**

In [None]:
# One record per summarized topic, then a frame ordered by topic id.
rows = [
    {
        "topic": topic_id,
        "n_reviews": details["n_reviews"],
        "summary": details["summary_text"],
    }
    for topic_id, details in topic_summaries.items()
]

summary_df = pd.DataFrame(rows).sort_values("topic")
summary_df

Unnamed: 0,topic,n_reviews,summary
0,0,4247,Everyone should read this book! I think this i...
1,1,998,"Vance is a hillbilly. Vance is a good, not gre..."
2,2,991,This graphic novel has 5 different creepy stor...
3,3,976,In the Garden of Beasts looks at the rise of H...
4,4,800,"Some poetry I understand, a few poems I love a..."
5,5,679,I LOVED Flavia! I loved Flavia! I loved Flavia.
6,6,517,"However, I will say that as with all books/ser..."
7,7,420,"The story of Katniss and Peeta, and the Hunger..."
8,8,370,Read and you'll learn that this book is not ju...


### Abstractive Summarization

**Install and import libraries**

In [None]:
!pip install transformers sentencepiece accelerate -q

In [None]:
from transformers import pipeline
import pandas as pd

**Create an abstractive summarization pipeline using a pre-trained BART model.**

In [None]:
# The same checkpoint name is used for both the model and the tokenizer.
BART_CHECKPOINT = "facebook/bart-large-cnn"

abstractive_summarizer = pipeline(
    task="summarization",
    model=BART_CHECKPOINT,
    tokenizer=BART_CHECKPOINT,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


**Rewrite the extractive summary as an abstractive summary**

In [None]:
def abstractive_summarize(text, max_length=80, min_length=30):
    """
    Rewrite a LexRank extractive summary as an abstractive summary using the
    module-level ``abstractive_summarizer`` pipeline.

    Args:
        text: extractive summary text to rewrite.
        max_length: upper bound on the generated length, passed to the pipeline.
        min_length: lower bound on the generated length, passed to the pipeline.

    Returns:
        The generated summary string. ``text`` is returned unchanged when it is
        not a string (e.g. None or NaN), is empty, or has fewer than 10 words.
    """
    # Non-string values (NaN from a DataFrame cell, None) cannot be split or
    # summarized; pass them through instead of raising AttributeError.
    if not isinstance(text, str) or len(text.split()) < 10:
        return text

    # Requesting more tokens than the input holds (and forcing a min_length
    # above it) makes the model pad the output with content that is not in the
    # source, so shrink both bounds for short inputs. Long inputs keep the
    # caller's values untouched.
    n_input_tokens = len(
        abstractive_summarizer.tokenizer(text, truncation=True)["input_ids"]
    )
    if n_input_tokens < max_length:
        max_length = n_input_tokens
        min_length = min(min_length, max(1, max_length // 2))

    result = abstractive_summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,  # cut over-long inputs to the model's limit instead of failing
    )
    return result[0]["summary_text"]

**Abstractive summary for each topic**

In [None]:
# Run every extractive summary through the abstractive rewriter, in row order.
abstractive_summaries = [
    abstractive_summarize(extractive_text)
    for extractive_text in summary_df["summary"]
]

summary_df["abstractive_summary"] = abstractive_summaries

summary_df

Your max_length is set to 80, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 80, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 80, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 80, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max

Unnamed: 0,topic,n_reviews,summary,abstractive_summary
0,0,4247,Everyone should read this book! I think this i...,I think this is a book everyone should read. I...
1,1,998,"Vance is a hillbilly. Vance is a good, not gre...","Vance is a good, not great, author. Vance is a..."
2,2,991,This graphic novel has 5 different creepy stor...,This graphic novel has 5 different creepy stor...
3,3,976,In the Garden of Beasts looks at the rise of H...,In the Garden of Beasts looks at the rise of H...
4,4,800,"Some poetry I understand, a few poems I love a...",This was my first ever poetry book. Some poetr...
5,5,679,I LOVED Flavia! I loved Flavia! I loved Flavia.,I LOVED Flavia! I loved Flavia! I loved Flavia.
6,6,517,"However, I will say that as with all books/ser...","""What a great ending to the series of Harry Po..."
7,7,420,"The story of Katniss and Peeta, and the Hunger...","The Hunger Games begins with Katniss, she expl..."
8,8,370,Read and you'll learn that this book is not ju...,"Lou, or Lou, or ""Clark"" (as Will calls her) is..."


**Inspect a few topics with both extractive and abstractive summaries**

In [None]:
# Show the extractive and abstractive summaries side by side for the first five rows.
divider = "-" * 80
for _row_label, topic_row in summary_df.head(5).iterrows():
    print(f"\n=== Topic {topic_row['topic']} | n_reviews = {topic_row['n_reviews']} ===")
    print("Extractive summary (LexRank):", topic_row["summary"], sep="\n")
    print("\nAbstractive summary (BART):", topic_row["abstractive_summary"], sep="\n")
    print(divider)


=== Topic 0 | n_reviews = 4247 ===
Extractive summary (LexRank):
Everyone should read this book! I think this is a book everyone should read. I now want to read the next book :)

Abstractive summary (BART):
I think this is a book everyone should read. I now want to read the next book. Everyone should read this book! I think this was a great book and I'm looking forward to the next.
--------------------------------------------------------------------------------

=== Topic 1 | n_reviews = 998 ===
Extractive summary (LexRank):
Vance is a hillbilly. Vance is a good, not great, author. Vance, however, did make it out.

Abstractive summary (BART):
Vance is a good, not great, author. Vance is a hillbilly. Vance, however, did make it out. He's a good author, but not great.
--------------------------------------------------------------------------------

=== Topic 2 | n_reviews = 991 ===
Extractive summary (LexRank):
This graphic novel has 5 different creepy stories, I liked 3 out of the 5 st