In [1]:
# === 1) IMPORTS ===
import re
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import hdbscan
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from pathlib import Path
import json
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# === 2) EXPERIENTIAL TERMS & SPACY SETUP ===

experiential_terms = {
    'emotional': [
        'felt','feeling','emotion','joy','fear','anxiety','bliss','love','terror','peace','calm',
        'excited','overwhelmed','gratitude','euphoria','sadness','longing','crying','ecstasy','relief',
        'compassion','grief','awe','anger','release','hope','despair','serenity','agitation','comfort',
        'purging','vulnerability','intimacy','empathy','tension','melancholy','abandon','appreciation'
    ],
    'sensory': [
        'visual','hear','sound','color','bright','pattern','geometry','music','taste','smell','see',
        'saw','colors','sounds','shapes','textures','movement','melting','vibrations','pulsing',
        'fractal','echo','flashing','tunnel','fluid','shimmering','sparkling','synesthesia',
        'auditory','trails','glow','hallucination','pulsate','distortion','radiance','static',
        'blurred','lightness','glimmer','resonance','tactile','kaleidoscopic'
    ],
    'cognitive': [
        'thought','mind','consciousness','aware','realize','understand','insight','clarity','confused',
        'clear','thinking','perception','concepts','identity','ego','dissolve','looping','logic',
        'recognition','belief','interpretation','memory','language','narrative','meaning','mindspace',
        'headspace','overthinking','mental','clarification','self-talk','rational','intellect',
        'philosophical','metacognition','rumination','stream of consciousness','inner dialogue',
        'cognitive dissonance','hyperfocus'
    ],
    'physical': [
        'body','skin','breath','heart','energy','vibration','tingling','warm','heavy','light','pressure',
        'sensation','nausea','shaking','sweating','floating','stillness','tightness','spasm','motion',
        'trembling','cold','breathing','heartbeat','twitching','dry mouth','muscles','stiffness',
        'paralysis','numbness','restlessness','chills','sweat','clenching','somatic','bodyload',
        'temperature','digestive','physical release'
    ],
    'mystical': [
        'ego','self','unity','divine','spiritual','transcend','infinite','oneness','god','universe',
        'connected','sacred','eternal','death','rebirth','timeless','interconnected','presence','source',
        'void','light','beyond','higher power','awakening','realm','dimension','truth','immortality',
        'ego death','no-self','nirvana','cosmic','transcendence','pure being','karma','light being',
        'soul','heaven','angelic','time distortion','godlike','divinity','portal','third eye',
        'nondual','dissolution','samsara','infinity','entity','timelessness'
    ],
    'temporal': [
        'onset','peak','comedown','duration','timeline','hours','minutes','start','beginning',
        'after','later','build-up','before','end','wave','early','gradual','suddenly',
        'phase','stage','passed','elapsed','over time','rush','fade','linger','moment',
        'slowly','time passed','time distorted','hour mark','entry','exit'
    ]
}

# Flatten to lowercase set
EXP_TERMS_FLAT = {word.lower() for words in experiential_terms.values() for word in words}

# Load SpaCy
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer", "ner"])
    if "senter" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    print(" SpaCy loaded.")
except OSError:
    raise RuntimeError("SpaCy model not found. Run: python -m spacy download en_core_web_sm")

# Stopwords (lowercase)
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
SPACY_STOPWORDS = {word.lower() for word in SPACY_STOPWORDS}
print(f" {len(SPACY_STOPWORDS)} stopwords loaded.")

 SpaCy loaded.
 326 stopwords loaded.


In [3]:
_whitespace_re = re.compile(r"[ \t\v\f]+")
_newlines_re   = re.compile(r"\s*\n\s*")

def clean_text_basic(txt: str) -> str:
    if not isinstance(txt, str) or pd.isna(txt):
        return ""
    txt = txt.lower()
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = _newlines_re.sub("\n", txt)
    txt = _whitespace_re.sub(" ", txt).strip()
    return txt

def sentencize_and_clean(raw_text: str, nlp):
    raw_text_clean = clean_text_basic(raw_text)
    doc = nlp.make_doc(raw_text_clean)
    if "senter" in nlp.pipe_names:
        nlp.get_pipe("senter")(doc)
    else:
        doc = nlp(raw_text_clean)
    sents = [s.text.strip() for s in doc.sents if s.text.strip()]
    return sents, raw_text_clean

In [None]:
DATA_PATH = Path(r"final_train_900.csv")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
if not {"report_text", "substance"}.issubset(df.columns):
    raise KeyError("CSV must have 'report_text' and 'substance' columns")

def _norm_sub(x):
    if not isinstance(x, str): return "OTHER"
    y = x.strip().upper()
    if y in {"DMT"}: return "DMT"
    if y in {"LSD", "ACID"}: return "LSD"
    if y in {"PSILOCYBIN", "PSILOCYBIN MUSHROOM", "MUSHROOM", "MUSHROOMS", "PSILOCYBE"}:
        return "Psilocybin"
    return y

df["_subst_norm"] = df["substance"].map(_norm_sub)
TARGETS = ["DMT", "LSD", "Psilocybin"]
N_PER_CLASS = 100
RANDOM_SEED = 42

balanced_dfs = []
for s in TARGETS:
    pool = df[df["_subst_norm"] == s]
    n_pick = min(N_PER_CLASS, len(pool))
    if n_pick < N_PER_CLASS:
        print(f" Only {n_pick} available for {s}")
    balanced_dfs.append(pool.sample(n=n_pick, random_state=RANDOM_SEED))

df_sub = pd.concat(balanced_dfs, ignore_index=True).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print(f" Balanced subset: {df_sub.shape}")
print(df_sub['_subst_norm'].value_counts())

 Balanced subset: (300, 8)
_subst_norm
Psilocybin    100
LSD           100
DMT           100
Name: count, dtype: int64


In [5]:
print(" Pre-splitting sentences...")
lsa_cache = []
for text in tqdm(df_sub["report_text"].astype(str).fillna(""), total=len(df_sub)):
    sentences, cleaned = sentencize_and_clean(text, nlp)
    lsa_cache.append({
        "sentences": sentences,
        "doc_clean": cleaned
    })
print(f" Precomputed: {len(lsa_cache)} documents")

✂️ Pre-splitting sentences...


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [00:16<00:00, 17.92it/s]

 Precomputed: 300 documents





In [6]:
_word_re_cache = {}
def _has_term(sentence: str, term: str) -> bool:  # CHANGED
    rx = _word_re_cache.get(term)
    if rx is None:
        # Build a word-boundary regex for phrases (e.g., "ego death")
        term_rx = r"\b" + r"\s+".join(map(re.escape, term.split())) + r"\b"
        rx = re.compile(term_rx)
        _word_re_cache[term] = rx
    return rx.search(sentence) is not None

def _exp_terms_in(sentences):  # CHANGED
    present = set()
    for t in EXP_TERMS_FLAT:
        if any(_has_term(s, t) for s in sentences):
            present.add(t)
    return present

In [7]:
def lsa_hdbscan_summary(
    sentences,
    n_components: int = 48,            # CHANGED (safer default)
    min_cluster_size: int = 5,
    min_samples: int = 3,
    top_k_ratio: float = 0.20,         # CHANGED (closer to LexRank-like length)
    max_clusters: int = 2,
    pos_bias: float = 0.05             # CHANGED (small lead bias helps coherence)
) -> str:
    if not sentences:
        return ""
    sents = [s for s in sentences if s.strip()]
    if len(sents) < 2:
        return " ".join(sents)

    # TF-IDF + safer LSA rank (per-doc)
    tfidf = TfidfVectorizer(
        lowercase=False, stop_words="english",
        max_features=2000, sublinear_tf=True, norm="l2"  # CHANGED
    )
    X_tfidf = tfidf.fit_transform(sents)
    n_features = X_tfidf.shape[1]

    max_rank = max(2, min(n_components, n_features - 1, len(sents) - 1, 64))  # CHANGED
    if max_rank < 2:
        k = max(1, int(round(len(sents) * top_k_ratio)))
        return " ".join(sents[:k])

    svd = TruncatedSVD(n_components=max_rank, random_state=42)
    X_lsa = svd.fit_transform(X_tfidf)
    X_lsa = X_lsa / (np.linalg.norm(X_lsa, axis=1, keepdims=True) + 1e-10)

    # HDBSCAN (euclidean on unit vectors ~ cosine)
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )
    labels = clusterer.fit_predict(X_lsa)

    unique_labels = np.setdiff1d(np.unique(labels), [-1])
    if len(unique_labels) == 0:
        k = max(1, int(round(len(sents) * top_k_ratio)))
        return " ".join(sents[:k])

    # Density proxy: membership probabilities (robust across versions)
    try:                                           # CHANGED
        probs = clusterer.probabilities_
    except Exception:
        probs = np.ones(len(sents))

    # Experiential coverage (phrase-aware) within doc and cluster
    doc_exp_terms = _exp_terms_in(sents)          # CHANGED

    cluster_scores = []
    for lbl in unique_labels:
        idxs = np.where(labels == lbl)[0]
        size = len(idxs)
        if size == 0:
            continue
        prob_mean = float(np.mean(probs[idxs]))   #  dense core (CHANGED)
        sents_in_cluster = [sents[i] for i in idxs]
        exp_in_cluster = _exp_terms_in(sents_in_cluster)
        exp_coverage = (len(exp_in_cluster & doc_exp_terms) / max(1, len(doc_exp_terms))) if doc_exp_terms else 1.0
        # Reward big, dense, experientially rich clusters (CHANGED)
        score = (size**0.7) * (prob_mean**1.2) * (0.6 + 0.4*exp_coverage)
        cluster_scores.append((score, idxs))

    # Pick top clusters
    cluster_scores.sort(key=lambda z: z[0], reverse=True)
    cluster_scores = cluster_scores[:max_clusters]

    # Allocate sentences proportional to cluster weights (not equal split. CHANGED)
    k_final = max(1, int(round(len(sents) * top_k_ratio)))
    weights = np.array([s for s, _ in cluster_scores], dtype=float)
    weights = weights / (weights.sum() + 1e-9)
    quotas = np.maximum(1, np.round(weights * k_final)).astype(int)
    while quotas.sum() > k_final:
        i = np.argmax(quotas)
        quotas[i] -= 1
    while quotas.sum() < k_final:
        i = np.argmin(quotas)
        quotas[i] += 1

    selected = []
    for (score, idxs), q in zip(cluster_scores, quotas):
        centroid = np.mean(X_lsa[idxs], axis=0)
        centroid /= (np.linalg.norm(centroid) + 1e-10)
        sims = cosine_similarity(X_lsa[idxs], centroid.reshape(1, -1)).ravel()

        # small positional bias toward earlier sentences (CHANGED)
        sims = sims + pos_bias * (1.0 - (idxs / (len(sents) + 1e-9)))

        order = idxs[np.argsort(-sims)]
        selected.extend(order[:q])

    # De-dup & restore narrative order
    selected = sorted(dict.fromkeys(selected))
    return " ".join(sents[i] for i in selected[:k_final])

In [8]:
def _tfidf_cosine(a: str, b: str) -> float:
    if not a.strip() or not b.strip():
        return 0.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform([a, b])
        return float(cosine_similarity(X[0], X[1])[0,0])
    except Exception:
        return 0.0

def score_semantic(summary: str, document: str) -> float:
    return _tfidf_cosine(summary, document)

def score_experiential(summary: str, document: str) -> float:  # CHANGED
    if not summary or not document:
        return 0.0
    # Sentence split for phrase-aware matching
    doc_sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', document) if s.strip()]
    sum_sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    doc_terms = _exp_terms_in(doc_sents)
    if not doc_terms:
        return 1.0
    sum_terms = _exp_terms_in(sum_sents)
    return len(doc_terms & sum_terms) / len(doc_terms)

def score_coherence(summary: str) -> float:
    if not summary.strip():
        return 0.0
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    if len(sents) < 2:
        return 1.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform(sents)
        sims = [float(cosine_similarity(X[i], X[i+1])[0,0]) for i in range(len(sents)-1)]
        return float(np.mean(sims)) if sims else 0.0
    except Exception:
        return 0.0

def custom_score(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    sem = score_semantic(summary, document)
    exp = score_experiential(summary, document)
    coh = score_coherence(summary)
    return 0.5*sem + 0.3*exp + 0.2*coh

In [9]:
N_COMP_CHOICES     = [16, 24, 32, 40, 48, 64]          # CHANGED
MIN_SIZE_CHOICES   = [3, 4, 5, 6, 7, 8]                # CHANGED
MIN_SAMPLE_CHOICES = [1, 2, 3, 4, 5]                   # CHANGED

EVAL_INTERVAL = max(10, len(lsa_cache) // 12)
print(f"🔍 Tuning on {len(lsa_cache)} docs | Pruning every {EVAL_INTERVAL} docs")

def objective(trial):
    n_components   = trial.suggest_categorical("n_components", N_COMP_CHOICES)
    min_cluster_sz = trial.suggest_categorical("min_cluster_size", MIN_SIZE_CHOICES)
    min_samples    = trial.suggest_categorical("min_samples", MIN_SAMPLE_CHOICES)
    top_k_ratio    = trial.suggest_float("top_k_ratio", 0.15, 0.25, step=0.02)  # CHANGED
    pos_bias       = trial.suggest_float("pos_bias", 0.0, 0.08, step=0.02)      # CHANGED

    scores = []
    for i, entry in enumerate(lsa_cache, 1):
        summary = lsa_hdbscan_summary(
            sentences=entry["sentences"],
            n_components=n_components,
            min_cluster_size=min_cluster_sz,
            min_samples=min_samples,
            top_k_ratio=top_k_ratio,
            max_clusters=2,
            pos_bias=pos_bias
        )
        score = custom_score(summary, entry["doc_clean"])
        scores.append(score)

        # MedianPruner heartbeat
        if i % EVAL_INTERVAL == 0:
            trial.report(np.mean(scores), step=i)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return np.mean(scores) if scores else 0.0

# Create study
sampler = TPESampler(seed=RANDOM_SEED)
pruner = MedianPruner(n_startup_trials=10, n_warmup_steps=0, interval_steps=1)

study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
    study_name="LSA_HDBSCAN_Tuned"
)

# Run optimization
study.optimize(objective, n_trials=60, show_progress_bar=True)

# Get best results
best_params = study.best_params
best_score = float(study.best_value)

print("Best Params:")
print(json.dumps(best_params, indent=2))
print("Best Custom Score:", round(best_score, 4))

[I 2025-08-11 13:26:10,937] A new study created in memory with name: LSA_HDBSCAN_Tuned


🔍 Tuning on 300 docs | Pruning every 25 docs


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-08-11 13:26:42,439] Trial 0 finished with value: 0.3700002794852972 and parameters: {'n_components': 24, 'min_cluster_size': 8, 'min_samples': 1, 'top_k_ratio': 0.21, 'pos_bias': 0.04}. Best is trial 0 with value: 0.3700002794852972.
[I 2025-08-11 13:27:09,421] Trial 1 finished with value: 0.3110802654771703 and parameters: {'n_components': 24, 'min_cluster_size': 3, 'min_samples': 4, 'top_k_ratio': 0.16999999999999998, 'pos_bias': 0.0}. Best is trial 0 with value: 0.3700002794852972.
[I 2025-08-11 13:27:39,005] Trial 2 finished with value: 0.374389693791553 and parameters: {'n_components': 64, 'min_cluster_size': 4, 'min_samples': 1, 'top_k_ratio': 0.25, 'pos_bias': 0.0}. Best is trial 2 with value: 0.374389693791553.
[I 2025-08-11 13:28:06,250] Trial 3 finished with value: 0.3632859978833547 and parameters: {'n_components': 64, 'min_cluster_size': 7, 'min_samples': 1, 'top_k_ratio': 0.22999999999999998, 'pos_bias': 0.06}. Best is trial 2 with value: 0.374389693791553.
[I 2025

[I 2025-08-11 13:43:08,815] Trial 49 pruned. 
[I 2025-08-11 13:43:11,341] Trial 50 pruned. 
[I 2025-08-11 13:43:42,181] Trial 51 finished with value: 0.394596988780306 and parameters: {'n_components': 24, 'min_cluster_size': 7, 'min_samples': 2, 'top_k_ratio': 0.25, 'pos_bias': 0.02}. Best is trial 47 with value: 0.40289293550093647.
[I 2025-08-11 13:44:13,594] Trial 52 finished with value: 0.394596988780306 and parameters: {'n_components': 24, 'min_cluster_size': 7, 'min_samples': 2, 'top_k_ratio': 0.25, 'pos_bias': 0.02}. Best is trial 47 with value: 0.40289293550093647.
[I 2025-08-11 13:44:46,458] Trial 53 finished with value: 0.394596988780306 and parameters: {'n_components': 24, 'min_cluster_size': 7, 'min_samples': 2, 'top_k_ratio': 0.25, 'pos_bias': 0.02}. Best is trial 47 with value: 0.40289293550093647.
[I 2025-08-11 13:44:49,517] Trial 54 pruned. 
[I 2025-08-11 13:45:21,304] Trial 55 finished with value: 0.394596988780306 and parameters: {'n_components': 24, 'min_cluster_size

####  Best Params: { "n_components": 24, "min_cluster_size": 7, "min_samples": 1, "top_k_ratio": 0.25, "pos_bias": 0.00}
#### Best Custom Score: 0.4029