In [None]:
!pip install sentence_transformers nltk
import nltk
nltk.download('punkt_tab')
# This notebook prepares your text corpus for BERTopic by performing sentence splitting, optional text normalisation, and embedding generation using SentenceTransformers.
# It ensures the data is in the correct format for downstream topic modelling.

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

We will begin with a generative example. This is an example of text that we want to chunk into sentences but that is lacking punctuation. Using this small-scale example, we will see how the pipeline works.

In [None]:
import re
def clean_text(text):
    """Normalise a text fragment for chunking.

    Steps, in order: delete every digit, delete every character that is
    neither a word character nor whitespace (i.e. punctuation), collapse
    runs of whitespace into single spaces, trim the ends, and lowercase.

    Note that underscores survive, because ``\\w`` matches them.

    Args:
        text: The raw string to normalise.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Removing every digit already covers stand-alone numbers, so the former
    # second pass with r'\b\d+\b' could never match anything and was dropped.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)           # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()      # Collapse whitespace/newlines
    return text.lower()

In [None]:
import nltk
# Toy paragraph used to demonstrate the splitting step.
text = """
This is a sample paragraph. It includes several sentences, some of which are short.
However, others may be longer or more complex. Therefore, we aim to split this text
into clean, readable chunks using punctuation and connectives.
"""

# Strong connectives that also act as split points (extend the list as needed).
connectives = r'\b(?:however|therefore|moreover|but|although|yet|because|so)\b'

# Either a run of major punctuation marks / newlines, or one of the connectives.
split_regex = '|'.join((r'[;,\.\?!:\-\–\n]+', connectives))

# Cut the paragraph at every match (case-insensitively), then normalise each
# piece that is not blank.
texts = re.compile(split_regex, flags=re.IGNORECASE).split(text)
cleaned_texts = [clean_text(piece) for piece in texts if piece.strip()]


In [None]:
# Greedily merge consecutive cleaned fragments until the running buffer holds
# at least 9 tokens, then emit it as one chunk and start a new buffer.
test_sentences = []
results = []  # token count of every chunk that reached the 9-token minimum
buffer = ""
for fragment in cleaned_texts:
  buffer += fragment + " "
  length = len(nltk.word_tokenize(buffer))
  if length >= 9:
    results.append(length)
    # str.strip() returns a new string; its result used to be discarded, so
    # every stored chunk kept a trailing space. Store the stripped value.
    test_sentences.append(buffer.strip())
    buffer = ""


# Whatever is left never reached 9 tokens; keep it as a final chunk so no text
# is lost. Its length is not recorded in `results`.
if buffer.strip():
    test_sentences.append(buffer.strip())
print(test_sentences)

In [None]:
# Report on the chunk lengths (in tokens) recorded in `results`, to judge how
# effective the pipeline is. We want almost no chunks that are too short or
# too long. An average of around 9-13 is the target, but higher is fine.
import numpy as np

if results:
    print("Total chunks:", len(results))
    print("Average length:", round(np.mean(results),2))
    print("Median length:", np.median(results))
    print("Max length:", max(results))
    print("Min length:", min(results))

    too_short = sum(r < 9 for r in results)
    too_long = sum(r > 40 for r in results)
    print(f"Chunks < 9 words: {too_short}")
    # The threshold counted above is 40; the label previously said 20.
    print(f"Chunks > 40 words: {too_long}")
else:
    # max()/min() raise ValueError on an empty list, so report it explicitly.
    print("Total chunks: 0 (no chunk reached the minimum length)")


Total chunks: 120882
Average length: 12.84
Median length: 12.0
Max length: 53
Min length: 9
Chunks < 9 words: 0
Chunks > 20 words: 27


In [None]:
import os
import sys
import shutil
# This cell decides where the project lives and creates its data/output
# folders. If you prefer to do this manually, set base_dir and data_dir yourself.

# Optionally provide a GitHub token through the environment:
#os.environ["GITHUB_TOKEN"] =

# Running inside Google Colab? (the module is pre-loaded there)
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: point this at your local project folder.
    base_dir = "path/to/your/local/project/folder"

    # Optional: clone the repository locally
    #!git clone https://{token}@github.com/UnbrokenCocoon/OCR-evaluation.git "{base_dir}"
else:
    # Colab run: mount Google Drive and work directly inside it
    # (no extra folder is added below base_dir).
    from google.colab import drive
    drive.mount('/content/drive')

    base_dir = "/content/drive/MyDrive/Bertopic"
    token = os.getenv("GITHUB_TOKEN")

    # Optional: start from a fresh clone of the repository
    #if os.path.exists(base_dir):
     #   shutil.rmtree(base_dir)
    #!git clone https://{token}@github.com/UnbrokenCocoon/OCR-evaluation.git "{base_dir}"


# 'Data' holds the input corpus (and later the pickles); 'output' is created
# alongside it. Both are created if they do not exist yet.
data_dir, output_dir = (os.path.join(base_dir, sub) for sub in ("Data", "output"))
for _folder in (data_dir, output_dir):
    os.makedirs(_folder, exist_ok=True)

print(f"Data folder is located at: {data_dir}")


In [None]:
# Load the real corpus from data_dir and split it into cleaned fragments.
import re

dir_1 = os.path.join(data_dir,'Batch 4.txt')
dir_2 = os.path.join(data_dir,'batches 1 through 3.txt')
# NOTE(review): the files are opened with the platform's default encoding;
# consider passing encoding=... explicitly once the file encoding is confirmed.
with open(dir_1) as f:
  text_1 = f.read()
with open(dir_2) as f:
  text_2 = f.read()

# Join with a newline (itself a split character) so the last word of the first
# file cannot fuse with the first word of the second file.
all_text = text_1 + "\n" + text_2

connectives = r'\b(?:however|therefore|moreover|but|although|yet|because|so)\b'
# Include major punctuation: comma, semicolon, colon, period, dash, newline, question mark, exclamation
split_regex = r'[;,\.\?!:\-\–\n]+' + '|' + connectives

# Split the loaded corpus. This used to split `text` (the small sample
# paragraph) instead of `all_text`, so the real data was never processed.
texts = re.split(split_regex, all_text, flags=re.IGNORECASE)
cleaned_texts = [clean_text(t) for t in texts if len(t.strip()) > 0]

In [None]:
# Greedily merge consecutive cleaned fragments until the running buffer holds
# at least 9 tokens, then emit it as one chunk and start a new buffer.
all_sentences = []
results = []  # token count of every chunk that reached the 9-token minimum
buffer = ""
for fragment in cleaned_texts:
  buffer += fragment + " "
  length = len(nltk.word_tokenize(buffer))
  if length >= 9:
    results.append(length)
    # str.strip() returns a new string; its result used to be discarded, so
    # every stored chunk kept a trailing space. Store the stripped value.
    all_sentences.append(buffer.strip())
    buffer = ""


# Whatever is left never reached 9 tokens; keep it as a final chunk so no text
# is lost. Its length is not recorded in `results`.
if buffer.strip():
    all_sentences.append(buffer.strip())

In [None]:
# Report on the chunk lengths (in tokens) recorded in `results`, to confirm
# the chunking worked as intended.
# The number of long chunks could be lowered by cutting each chunk once it
# reaches a length limit, but that would affect the context of the chunk;
# this method tolerates some long chunks.
import numpy as np

if results:
    print("Total chunks:", len(results))
    print("Average length:", round(np.mean(results),2))
    print("Median length:", np.median(results))
    print("Max length:", max(results))
    print("Min length:", min(results))

    too_short = sum(r < 9 for r in results)
    too_long = sum(r > 40 for r in results)
    print(f"Chunks < 9 words: {too_short}")
    # The threshold counted above is 40; the label previously said 20.
    print(f"Chunks > 40 words: {too_long}")
else:
    # max()/min() raise ValueError on an empty list, so report it explicitly.
    print("Total chunks: 0 (no chunk reached the minimum length)")

Total chunks: 130742
Average length: 12.77
Median length: 12.0
Max length: 54
Min length: 9
Chunks < 9 words: 0
Chunks > 20 words: 31


In [None]:
# Any SentenceTransformers model can be substituted here.
# The one below is quite good for BERTopic, but encoding may take a while.
import sentence_transformers
from sentence_transformers import SentenceTransformer

# Model card: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
MODEL_NAME = "all-mpnet-base-v2"

model = SentenceTransformer(MODEL_NAME)
# Encode all chunks, showing a progress bar while batches are processed.
embeddings = model.encode(all_sentences, show_progress_bar=True)


Batches:   0%|          | 0/3778 [00:00<?, ?it/s]

In [None]:
# Double-check that there is exactly one embedding per sentence before saving.
print(len(embeddings))
print(len(all_sentences))
# Printing alone cannot stop a misaligned pair from being saved, so fail loudly.
assert len(embeddings) == len(all_sentences), (
    "embeddings and all_sentences have different lengths; re-run the chunking "
    "and encoding cells in order before saving"
)

120882
120882


In [None]:
import pickle
# Persist the embeddings and their source sentences as pickles in data_dir,
# embeddings first, one file per object.
for _filename, _payload in (
    ('bc_embedding.pkl', embeddings),
    ('bc_sentences.pkl', all_sentences),
):
  with open(os.path.join(data_dir, _filename), 'wb') as f:
    pickle.dump(_payload, f)