In [8]:
# =================================================================
# STEP 1: INITIALIZATION
# =================================================================
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Configuration
MODEL_NAME = 'all-MiniLM-L6-v2'

In [9]:
# =================================================================
# STEP 2: TEXT SEGMENTATION (INTRA-DOC LOGIC)
# =================================================================
raw_input = """
Photosynthesis is the process by which green plants, algae, and cyanobacteria convert light energy into chemical energy using chlorophyll. During photosynthesis, carbon dioxide and water are transformed into glucose and oxygen through a series of light-dependent and light-independent reactions. Cellular respiration is a complementary process that occurs in the mitochondria of eukaryotic cells, where glucose is broken down in the presence of oxygen to produce ATP, the primary energy currency of the cell. DNA replication is a fundamental biological process that ensures genetic information is accurately copied before cell division, relying on enzymes such as DNA polymerase and helicase. Mutations in DNA can occur due to errors in replication or exposure to environmental factors like radiation, and while many mutations are neutral, some can lead to genetic disorders or evolutionary advantages. The theory of evolution by natural selection, proposed by Charles Darwin, explains how heritable traits that improve survival and reproduction become more common in populations over generations. In human physiology, the nervous system transmits electrical and chemical signals to coordinate movement, sensation, and cognition. The immune system protects organisms from pathogens through innate defenses such as skin barriers and adaptive responses involving antibodies and specialized white blood cells.
"""

# Split by (. ! ?) or Newlines, then filter out empty/tiny fragments
def pdf_sentence_segment(text):
    # 1. Remove line breaks
    text = re.sub(r'\n+', ' ', text)
    # 2. Fix broken words caused by line wrapping
    text = re.sub(r'(?<=\w)\s+(?=\w)', ' ', text)
    # 3. Ensure space after sentence-ending punctuation
    text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
    # 4. Split strictly on sentence boundaries (., !, ?)
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # 5. Final cleanup
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

    return sentences


# ðŸ”½ USE THE NEW LOGIC HERE
corpus = pdf_sentence_segment(raw_input)

print(f"Total segments identified: {len(corpus)}")

Total segments identified: 8


In [10]:
# =================================================================
# STEP 3: SEMANTIC EMBEDDING
# =================================================================
embedder = SentenceTransformer(MODEL_NAME)
embeddings = embedder.encode(corpus)

# Convert to numpy for clustering compatibility
embeddings_np = np.array(embeddings)

In [11]:
embeddings_np

array([[-0.07410745,  0.07730395, -0.08091968, ...,  0.02026472,
         0.09107607,  0.06423632],
       [-0.05470695,  0.07336466, -0.06498743, ...,  0.02376426,
         0.08041412, -0.02585154],
       [-0.06594134,  0.03318985, -0.05179024, ...,  0.0355609 ,
         0.08534993, -0.02678361],
       ...,
       [-0.0753957 ,  0.02794126,  0.01401788, ...,  0.12738834,
         0.08204874,  0.04642491],
       [ 0.02501933, -0.04113683, -0.01588627, ...,  0.12887022,
         0.02198967, -0.093583  ],
       [-0.08186902,  0.07080593, -0.00403837, ...,  0.00621335,
         0.08481239,  0.0519124 ]], shape=(8, 384), dtype=float32)

In [12]:
# =================================================================
# STEP 4: CLUSTERING & LOGICAL LABELING
# =================================================================
def get_logical_clusters(data_list, vectors, n_clusters):
    # Safety: Ensure we don't ask for more clusters than we have sentences
    actual_n = min(len(data_list), n_clusters)
    
    if actual_n < 2:
        return [0], {0: "General Content"}, {0: data_list}

    # Clustering
    model = AgglomerativeClustering(n_clusters=actual_n)
    labels = model.fit_predict(vectors)
    
    # Grouping and Automatic Keyword Labeling
    groups = {}
    logical_names = {}
    
    for text, label in zip(data_list, labels):
        groups.setdefault(label, []).append(text)
        
    for label, texts in groups.items():
        combined = " ".join(texts)
        try:
            tfidf = TfidfVectorizer(stop_words='english', max_features=10)
            tfidf.fit([combined])
            # Pick the most important keyword as the label
            keywords = tfidf.get_feature_names_out()
            logical_names[label] = keywords[0].capitalize() if len(keywords) > 0 else f"Topic_{label}"
        except:
            logical_names[label] = f"Cluster_{label}"
            
    return labels, logical_names, groups

# Set desired number of clusters
N = 2
labels, auto_labels, clustered_groups = get_logical_clusters(corpus, embeddings_np, N)

In [13]:
# =================================================================
# STEP 5: RESULTS VISUALIZATION
# =================================================================
for cid, sentences in clustered_groups.items():
    print(f"ðŸ“‚ CATEGORY: {auto_labels[cid]}")
    for s in sentences:
        print(f"  - {s}")
    print("-" * 40)

ðŸ“‚ CATEGORY: Algae
  - Photosynthesis is the process by which green plants, algae, and cyanobacteria convert light energy into chemical energy using chlorophyll.
  - During photosynthesis, carbon dioxide and water are transformed into glucose and oxygen through a series of light-dependent and light-independent reactions.
  - Cellular respiration is a complementary process that occurs in the mitochondria of eukaryotic cells, where glucose is broken down in the presence of oxygen to produce ATP, the primary energy currency of the cell.
----------------------------------------
ðŸ“‚ CATEGORY: Accurately
  - DNA replication is a fundamental biological process that ensures genetic information is accurately copied before cell division, relying on enzymes such as DNA polymerase and helicase.
  - Mutations in DNA can occur due to errors in replication or exposure to environmental factors like radiation, and while many mutations are neutral, some can lead to genetic disorders or evolutionary a