In [38]:
import pandas as pd

df = pd.read_csv('ProjectGutenberg-ShortStories-Dataset/stories.csv')

# Inspect dataframe structure
print(df.head())
print(df.columns)

        bookno                                            content
0    51082.txt  *** START OF THIS PROJECT GUTENBERG EBOOK COMI...
1    32243.txt  *** START OF THIS PROJECT GUTENBERG EBOOK CONF...
2    306-0.txt  *** START OF THIS PROJECT GUTENBERG EBOOK EARL...
3    31038.txt  *** START OF THIS PROJECT GUTENBERG EBOOK THE ...
4  28636-8.txt  *** START OF THIS PROJECT GUTENBERG EBOOK THE ...
Index(['bookno', 'content'], dtype='object')


In [39]:
df.iloc[0]['content']

'*** START OF THIS PROJECT GUTENBERG EBOOK COMING ATTRACTION ***\n\n\n\n\n\n\n\n\n\nProduced by Greg Weeks, Mary Meehan and the Online\n\nDistributed Proofreading Team at http://www.pgdp.net\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                           Coming Attraction\n\n\n\n                            BY FRITZ LEIBER\n\n\n\n                       Illustrated by Paul Calle\n\n\n\n           [Transcriber\'s Note: This etext was produced from\n\n                 Galaxy Science Fiction November 1950.\n\n         Extensive research did not uncover any evidence that\n\n         the U.S. copyright on this publication was renewed.]\n\n\n\n\n\n\n\n\n\n           Women will always go on trying to attract men ...\n\n             even when the future seems to have no future!\n\n\n\n\n\nThe coupe with the fishhooks welded to the fender shouldered up over\n\nthe curb like the nose of a nightmare. The girl in its path stood\n\nfrozen, her face probably stiff with fright under her mask. For onc

In [40]:
import re

def clean_gutenberg_text_improved(text):
    # Remove Gutenberg header explicitly
    header_pattern = r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*"
    header_match = re.search(header_pattern, text, re.IGNORECASE | re.DOTALL)

    # Remove Gutenberg footer explicitly
    footer_pattern = r"End of (the|this) Project Gutenberg.*"
    footer_match = re.search(footer_pattern, text, re.IGNORECASE | re.DOTALL)

    if header_match:
        text = text[header_match.end():]

    if footer_match:
        text = text[:footer_match.start()]

    # Remove transcriber notes, production notes, and URLs explicitly
    text = re.sub(r'Produced by.*?(?=\s[A-Z])', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\[Transcriber.*?\]', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'http\S+', '', text)

    # Remove standalone illustrator or author lines (optional, if you don't want them)
    text = re.sub(r'(Illustrated by.*?\.)', '', text, flags=re.IGNORECASE)

    # Normalize escaped newlines and whitespace
    #text = text.replace('\\n', ' ')
    #text = re.sub(r'\s+', ' ', text).strip()

    return text

import pandas as pd

# df = pd.read_csv('stories.csv')

# Apply the improved cleaning
df['clean_content'] = df['content'].apply(clean_gutenberg_text_improved)

# Quick check on the first story
print(df['clean_content'][0][:1500])











 Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 



















                           Coming Attraction



                            BY FRITZ LEIBER



                       Illustrated by Paul Calle



           









           Women will always go on trying to attract men ...

             even when the future seems to have no future!





The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.



The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks f

In [41]:
import pandas as pd

# Load original CSV file
# df = pd.read_csv('stories.csv')

# Extract the top 5 rows
top5_df = df.head(5)

# Save these top 5 rows to a new CSV file
top5_df.to_csv('top5_stories.csv', index=False)

print("Top 5 rows successfully saved to top5_stories.csv")

Top 5 rows successfully saved to top5_stories.csv


In [42]:
import pandas as pd
import re

# Robust function to extract story content
def extract_story(text):
    # Step 1: Remove Gutenberg header/footer explicitly
    text = re.sub(r'\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'End of (the|this) Project Gutenberg.*', '', text, flags=re.I|re.DOTALL)

    # Step 2: Remove production/transcriber notes and URLs
    text = re.sub(r'Produced by.*?(?=[A-Z])', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'\[.*?Transcriber.*?\]', '', text, flags=re.I|re.DOTALL)
    text = re.sub(r'http\S+', '', text)

    # Step 3: Normalize whitespace
    #text = re.sub(r'\n{4,}', '\n\n\n', text)
    #text = text.replace('\\n', ' ')
    #text = re.sub(r'\s+', ' ', text).strip()

    # Step 4: Identify the story's beginning
    sentences = re.split(r'(?<=[.!?]) +', text)
    
    narrative_start = 0
    for i, sentence in enumerate(sentences):
        words = sentence.split()
        # Heuristic: start from first sentence with >=8 words, ends with '.', '!', '?'
        if len(words) >= 8 and sentence[-1] in '.!?':
            narrative_start = i
            break
    
    story_text = ' '.join(sentences[narrative_start:]).strip()

    return story_text

# Load your CSV
df = pd.read_csv('top5_stories.csv')

# Apply extraction to 'content' column
df['story_only'] = df['content'].apply(extract_story)

# Optional: Save to new CSV
df[['story_only']].to_csv('top5_stories_extracted.csv', index=False)

# Quick verification
print(df['story_only'].iloc[0][:1000])

Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 



















                           Coming Attraction



                            BY FRITZ LEIBER



                       Illustrated by Paul Calle



           









           Women will always go on trying to attract men ...

             even when the future seems to have no future!





The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.



The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks flew a

blac

In [43]:
story_one = df['story_only'].iloc[0]
print(story_one)

Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at 



















                           Coming Attraction



                            BY FRITZ LEIBER



                       Illustrated by Paul Calle



           









           Women will always go on trying to attract men ...

             even when the future seems to have no future!





The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare. The girl in its path stood

frozen, her face probably stiff with fright under her mask. For once my

reflexes weren't shy. I took a fast step toward her, grabbed her elbow,

yanked her back. Her black skirt swirled out.



The big coupe shot by, its turbine humming. I glimpsed three faces.

Something ripped. I felt the hot exhaust on my ankles as the big

coupe swerved back into the street. A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks flew a

blac

In [44]:
import nltk
nltk.download('punkt_tab')

# Function to segment text into sentences and paragraphs
def segment_text(text):
    # Segment paragraphs (assuming paragraphs are separated by double newlines)
    paragraphs = text.split('\n\n\n')
    
    # Segment sentences for each paragraph
    segmented_paragraphs = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        segmented_paragraphs.append(sentences)
    
    return segmented_paragraphs

# Segment the text into sentences and paragraphs
segmented_text = segment_text(story_one)

# Display the result
for paragraph in segmented_text:
    print("Paragraph:")
    for sentence in paragraph:
        print(f"  {sentence}")


Paragraph:
  Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at
Paragraph:
Paragraph:
Paragraph:
Paragraph:
Paragraph:
Paragraph:
  

                           Coming Attraction
Paragraph:
  
                            BY FRITZ LEIBER
Paragraph:
  
                       Illustrated by Paul Calle
Paragraph:
Paragraph:
Paragraph:
Paragraph:
  
           Women will always go on trying to attract men ...

             even when the future seems to have no future!
Paragraph:
Paragraph:
  The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare.
  The girl in its path stood

frozen, her face probably stiff with fright under her mask.
  For once my

reflexes weren't shy.
  I took a fast step toward her, grabbed her elbow,

yanked her back.
  Her black skirt swirled out.
Paragraph:
  
The big coupe shot by, its turbine humming.
  I glimpsed three faces.
  Something ripped.
  I felt the hot exhaust on my ankles as the 

[nltk_data] Downloading package punkt_tab to /home/william-
[nltk_data]     chandler/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [32]:
import nltk
nltk.download('punkt')

# Function to segment text into sentences and paragraphs
def segment_text(text):
    # Segment paragraphs (assuming paragraphs are separated by triple newlines)
    raw_paragraphs = text.split('\n\n\n')
    
    # Filter out empty or whitespace-only paragraphs
    paragraphs = [p.strip() for p in raw_paragraphs if p.strip()]
    
    # Segment sentences for each non-empty paragraph
    segmented_paragraphs = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        segmented_paragraphs.append(sentences)
    
    return segmented_paragraphs

# Example usage (replace `story_one` with your actual text variable)
segmented_text = segment_text(story_one)


# Display the result
for paragraph in segmented_text:
    print("Paragraph:")
    for sentence in paragraph:
        print(f"  {sentence}")


Paragraph:
  Greg Weeks, Mary Meehan and the Online

Distributed Proofreading Team at
Paragraph:
  Coming Attraction
Paragraph:
  BY FRITZ LEIBER
Paragraph:
  Illustrated by Paul Calle
Paragraph:
  Women will always go on trying to attract men ...

             even when the future seems to have no future!
Paragraph:
  The coupe with the fishhooks welded to the fender shouldered up over

the curb like the nose of a nightmare.
  The girl in its path stood

frozen, her face probably stiff with fright under her mask.
  For once my

reflexes weren't shy.
  I took a fast step toward her, grabbed her elbow,

yanked her back.
  Her black skirt swirled out.
Paragraph:
  The big coupe shot by, its turbine humming.
  I glimpsed three faces.
  Something ripped.
  I felt the hot exhaust on my ankles as the big

coupe swerved back into the street.
  A thick cloud like a black flower

blossomed from its jouncing rear end, while from the fishhooks flew a

black shimmering rag.
Paragraph:
  "Did they 

[nltk_data] Downloading package punkt to /home/william-
[nltk_data]     chandler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


For Harry Potter txt:

In [5]:
import nltk
from sentence_transformers import SentenceTransformer

nltk.download('punkt')

# 1. Define the segment_text function
def segment_text(text):
    raw_paragraphs = text.split('\n\n')
    paragraphs = [p.strip() for p in raw_paragraphs if p.strip()]
    return paragraphs

# 2. Define the embed_paragraphs function
def embed_paragraphs(paragraphs, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    paragraph_embeddings = model.encode(paragraphs)
    return paragraph_embeddings

# 3. Load your text file
with open('HarryPotterBookOne.txt', 'r', encoding='utf-8') as f:
    text_data = f.read()

# 4. Segment text into paragraphs
paragraphs = segment_text(text_data)

# 5. Get embeddings
embeddings = embed_paragraphs(paragraphs, model_name='all-MiniLM-L6-v2')

for paragraph in paragraphs:
    print("Paragraph:")
    print(paragraph)
    print()  # Optional blank line for clarity


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/william-
[nltk_data]     chandler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Paragraph:
Harry Potter and the Sorcerer's Stone

Paragraph:
CHAPTER ONE

Paragraph:
THE BOY WHO LIVED

Paragraph:
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.

Paragraph:
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.

Paragraph:
The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't
t

In [10]:
import numpy as np
import hdbscan

# 6. Perform HDBSCAN clustering
embeddings_array = np.array(embeddings)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')
cluster_labels = clusterer.fit_predict(embeddings_array)

for i, label in enumerate(cluster_labels):
    print(f"Paragraph {i+1} -> Cluster {label}")

print(f"Number of clusters: {len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)}")


Paragraph 1 -> Cluster -1
Paragraph 2 -> Cluster 2
Paragraph 3 -> Cluster -1
Paragraph 4 -> Cluster -1
Paragraph 5 -> Cluster -1
Paragraph 6 -> Cluster 49
Paragraph 7 -> Cluster -1
Paragraph 8 -> Cluster -1
Paragraph 9 -> Cluster -1
Paragraph 10 -> Cluster 56
Paragraph 11 -> Cluster -1
Paragraph 12 -> Cluster 63
Paragraph 13 -> Cluster -1
Paragraph 14 -> Cluster -1
Paragraph 15 -> Cluster 72
Paragraph 16 -> Cluster -1
Paragraph 17 -> Cluster -1
Paragraph 18 -> Cluster -1
Paragraph 19 -> Cluster -1
Paragraph 20 -> Cluster -1
Paragraph 21 -> Cluster -1
Paragraph 22 -> Cluster -1
Paragraph 23 -> Cluster 49
Paragraph 24 -> Cluster -1
Paragraph 25 -> Cluster -1
Paragraph 26 -> Cluster 72
Paragraph 27 -> Cluster -1
Paragraph 28 -> Cluster -1
Paragraph 29 -> Cluster -1
Paragraph 30 -> Cluster 63
Paragraph 31 -> Cluster 35
Paragraph 32 -> Cluster -1
Paragraph 33 -> Cluster 49
Paragraph 34 -> Cluster 35
Paragraph 35 -> Cluster -1
Paragraph 36 -> Cluster -1
Paragraph 37 -> Cluster 35
Paragraph 3

In [23]:
from bertopic import BERTopic
# 7. Perform topic modeling with BERTopic
topic_model = BERTopic(embedding_model=SentenceTransformer('all-MiniLM-L6-v2'))
topics, _ = topic_model.fit_transform(paragraphs, embeddings_array)

for i, topic in enumerate(topics):
    print(f"Paragraph {i+1} -> Topic {topic}")

# Count the total number of unique topics
unique_topics = set(topics)  # Use a set to get unique topics
total_topics = len(unique_topics) - (1 if -1 in unique_topics else 0)  # Exclude noise (-1)

# Print the total number of topics
print(f"Total number of topics (excluding noise): {total_topics}")

Paragraph 1 -> Topic -1
Paragraph 2 -> Topic 40
Paragraph 3 -> Topic -1
Paragraph 4 -> Topic 22
Paragraph 5 -> Topic 22
Paragraph 6 -> Topic 22
Paragraph 7 -> Topic 22
Paragraph 8 -> Topic 37
Paragraph 9 -> Topic -1
Paragraph 10 -> Topic 22
Paragraph 11 -> Topic 22
Paragraph 12 -> Topic -1
Paragraph 13 -> Topic 1
Paragraph 14 -> Topic 0
Paragraph 15 -> Topic 22
Paragraph 16 -> Topic 22
Paragraph 17 -> Topic 1
Paragraph 18 -> Topic -1
Paragraph 19 -> Topic 22
Paragraph 20 -> Topic 22
Paragraph 21 -> Topic -1
Paragraph 22 -> Topic 22
Paragraph 23 -> Topic 22
Paragraph 24 -> Topic 56
Paragraph 25 -> Topic 56
Paragraph 26 -> Topic 22
Paragraph 27 -> Topic 22
Paragraph 28 -> Topic 22
Paragraph 29 -> Topic -1
Paragraph 30 -> Topic -1
Paragraph 31 -> Topic -1
Paragraph 32 -> Topic -1
Paragraph 33 -> Topic 22
Paragraph 34 -> Topic -1
Paragraph 35 -> Topic 0
Paragraph 36 -> Topic 0
Paragraph 37 -> Topic -1
Paragraph 38 -> Topic 22
Paragraph 39 -> Topic 2
Paragraph 40 -> Topic 22
Paragraph 41 ->

In [24]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

For Gutenberg CSV:

In [47]:
from sentence_transformers import SentenceTransformer
import nltk

# Download the NLTK tokenizer model
nltk.download('punkt')

# Function to segment text into sentences and paragraphs
def segment_text(text):
    # Segment paragraphs (assuming paragraphs are separated by double newlines)
    raw_paragraphs = text.split('\n\n')
    
    # Filter out empty or whitespace-only paragraphs
    paragraphs = [p.strip() for p in raw_paragraphs if p.strip()]
    
    return paragraphs

# Function to embed paragraphs using Sentence-BERT
def embed_paragraphs(paragraphs, model_name='all-MiniLM-L6-v2'):
    # Load the pre-trained Sentence-BERT model
    model = SentenceTransformer(model_name)
    
    # Embed each paragraph individually (no grouping)
    paragraph_embeddings = model.encode(paragraphs)  # Embedding each paragraph
    return paragraph_embeddings

# Segment the text into paragraphs
paragraphs = segment_text(story_one)

# Embed the paragraphs using Sentence-BERT
embeddings = embed_paragraphs(paragraphs)

# Display the resulting embeddings (optional, here we print the shape of embeddings)
for idx, embedding in enumerate(embeddings):
    print(f"Embedding for paragraph group {idx+1}:")
    print(embedding[:10])  # Display the first 10 elements of the embedding for brevity


[nltk_data] Downloading package punkt to /home/william-
[nltk_data]     chandler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Embedding for paragraph group 1:
[-0.12705487 -0.07768417 -0.05324193 -0.03932625  0.0355905   0.09171158
 -0.00305775  0.06588676  0.0497287   0.04944518]
Embedding for paragraph group 2:
[-0.04598238 -0.05160765 -0.04175675 -0.01678063  0.0315697  -0.01619254
 -0.03316135  0.04348642  0.01146646  0.05098707]
Embedding for paragraph group 3:
[-0.03064035  0.00062007 -0.00518435  0.04401003  0.02543898  0.03180334
  0.11151884 -0.03218126  0.08278709  0.01049972]
Embedding for paragraph group 4:
[-0.026724    0.03628696 -0.06489769 -0.02243142 -0.03700282  0.04728059
  0.06263942  0.00857553  0.00702638  0.03676295]
Embedding for paragraph group 5:
[-0.06912449  0.01028547  0.01380999  0.06735839 -0.13995919  0.02184009
  0.01649148  0.09217546  0.0614059   0.00046357]
Embedding for paragraph group 6:
[-0.01914178 -0.04487531  0.05137537  0.02925592 -0.01612964 -0.02888184
  0.00307902 -0.15910424  0.02133381 -0.03053707]
Embedding for paragraph group 7:
[-0.05804767  0.009876    0.007

In [48]:
import hdbscan
import numpy as np
from sentence_transformers import SentenceTransformer

# Assuming you already have the embeddings in the `embeddings` variable

# Step 1: Convert list of embeddings to numpy array (if it's not already)
embeddings_array = np.array(embeddings)

# Step 2: Apply HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')
cluster_labels = clusterer.fit_predict(embeddings_array)

# Step 3: Display the results
for i, label in enumerate(cluster_labels):
    print(f"Scene {i+1}: Cluster {label}")

# Optional: View number of clusters
print(f"Number of clusters: {len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)}")


Scene 1: Cluster 0
Scene 2: Cluster -1
Scene 3: Cluster 0
Scene 4: Cluster 0
Scene 5: Cluster 0
Scene 6: Cluster 0
Scene 7: Cluster 0
Scene 8: Cluster 0
Scene 9: Cluster 0
Scene 10: Cluster 0
Scene 11: Cluster 0
Scene 12: Cluster 0
Scene 13: Cluster 0
Scene 14: Cluster 0
Scene 15: Cluster 0
Scene 16: Cluster 0
Scene 17: Cluster 0
Scene 18: Cluster 0
Scene 19: Cluster 0
Scene 20: Cluster 0
Scene 21: Cluster 0
Scene 22: Cluster 0
Scene 23: Cluster 0
Scene 24: Cluster 0
Scene 25: Cluster 0
Scene 26: Cluster 0
Scene 27: Cluster 0
Scene 28: Cluster 0
Scene 29: Cluster 0
Scene 30: Cluster 0
Scene 31: Cluster 0
Scene 32: Cluster 0
Scene 33: Cluster 0
Scene 34: Cluster 0
Scene 35: Cluster 0
Scene 36: Cluster 0
Scene 37: Cluster 0
Scene 38: Cluster 0
Scene 39: Cluster 0
Scene 40: Cluster 0
Scene 41: Cluster 0
Scene 42: Cluster 0
Scene 43: Cluster 0
Scene 44: Cluster 0
Scene 45: Cluster 0
Scene 46: Cluster 1
Scene 47: Cluster 0
Scene 48: Cluster 0
Scene 49: Cluster 0
Scene 50: Cluster 0
Scene 51

In [44]:
import umap
import matplotlib.pyplot as plt

# Step 4: Visualize the clustering with UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean')
umap_embeddings = umap_model.fit_transform(embeddings_array)

# Step 5: Plot the results
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=cluster_labels, cmap='Spectral')
plt.title('HDBSCAN Clusters Visualized')
plt.show()


AttributeError: module 'umap' has no attribute 'UMAP'

In [50]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Assuming you already have the embeddings in the `embeddings` variable
# You also need the original texts (or paragraphs) to extract topics

# Step 1: Convert list of embeddings to numpy array (if it's not already)
embeddings_array = np.array(embeddings)


# Step 1: Initialize BERTopic with pre-trained embeddings
topic_model = BERTopic(embedding_model=SentenceTransformer('all-MiniLM-L6-v2'))

# Step 2: Fit the model to the embeddings and extract topics
topics, _ = topic_model.fit_transform(paragraphs, embeddings_array)

# Step 3: Display the topics and the resulting scenes
for i, topic in enumerate(topics):
    print(f"Scene {i+1}: Topic {topic}")

# Count the total number of unique topics
unique_topics = set(topics)  # Use a set to get unique topics
total_topics = len(unique_topics) - (1 if -1 in unique_topics else 0)  # Exclude noise (-1)

# Print the total number of topics
print(f"Total number of topics (excluding noise): {total_topics}")

# Optional: Visualize the topic model
topic_model.visualize_topics()


Scene 1: Topic 0
Scene 2: Topic 0
Scene 3: Topic 0
Scene 4: Topic 0
Scene 5: Topic 0
Scene 6: Topic 0
Scene 7: Topic 0
Scene 8: Topic 0
Scene 9: Topic 0
Scene 10: Topic 0
Scene 11: Topic 0
Scene 12: Topic 0
Scene 13: Topic 0
Scene 14: Topic 0
Scene 15: Topic 0
Scene 16: Topic 0
Scene 17: Topic 0
Scene 18: Topic 0
Scene 19: Topic 0
Scene 20: Topic 0
Scene 21: Topic 0
Scene 22: Topic 0
Scene 23: Topic 0
Scene 24: Topic 0
Scene 25: Topic 0
Scene 26: Topic 0
Scene 27: Topic 0
Scene 28: Topic 0
Scene 29: Topic 0
Scene 30: Topic 0
Scene 31: Topic 0
Scene 32: Topic 0
Scene 33: Topic 0
Scene 34: Topic 0
Scene 35: Topic 0
Scene 36: Topic 0
Scene 37: Topic 0
Scene 38: Topic 0
Scene 39: Topic 0
Scene 40: Topic 0
Scene 41: Topic 0
Scene 42: Topic 0
Scene 43: Topic 0
Scene 44: Topic 0
Scene 45: Topic 0
Scene 46: Topic 1
Scene 47: Topic 0
Scene 48: Topic 0
Scene 49: Topic 0
Scene 50: Topic 0
Scene 51: Topic 0
Scene 52: Topic 0
Scene 53: Topic 0
Scene 54: Topic 0
Scene 55: Topic 0
Scene 56: Topic 0
S

ValueError: zero-size array to reduction operation maximum which has no identity