In [1]:
# === 1) IMPORTS ===
import re
import pandas as pd
import numpy as np
import spacy
from spacy.util import filter_spans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from pathlib import Path
import json
from tqdm import tqdm
import pickle

print("Imports done.")

Imports done.


In [3]:
# === 2) EXPERIENTIAL TERMS & SPACY LOADING ===

# Experiential terms
experiential_terms = {
    'emotional': [
        'felt','feeling','emotion','joy','fear','anxiety','bliss','love','terror','peace','calm',
        'excited','overwhelmed','gratitude','euphoria','sadness','longing','crying','ecstasy','relief',
        'compassion','grief','awe','anger','release','hope','despair','serenity','agitation','comfort',
        'purging','vulnerability','intimacy','empathy','tension','melancholy','abandon','appreciation'
    ],
    'sensory': [
        'visual','hear','sound','color','bright','pattern','geometry','music','taste','smell','see',
        'saw','colors','sounds','shapes','textures','movement','melting','vibrations','pulsing',
        'fractal','echo','flashing','tunnel','fluid','shimmering','sparkling','synesthesia',
        'auditory','trails','glow','hallucination','pulsate','distortion','radiance','static',
        'blurred','lightness','glimmer','resonance','tactile','kaleidoscopic'
    ],
    'cognitive': [
        'thought','mind','consciousness','aware','realize','understand','insight','clarity','confused',
        'clear','thinking','perception','concepts','identity','ego','dissolve','looping','logic',
        'recognition','belief','interpretation','memory','language','narrative','meaning','mindspace',
        'headspace','overthinking','mental','clarification','self-talk','rational','intellect',
        'philosophical','metacognition','rumination','stream of consciousness','inner dialogue',
        'cognitive dissonance','hyperfocus'
    ],
    'physical': [
        'body','skin','breath','heart','energy','vibration','tingling','warm','heavy','light','pressure',
        'sensation','nausea','shaking','sweating','floating','stillness','tightness','spasm','motion',
        'trembling','cold','breathing','heartbeat','twitching','dry mouth','muscles','stiffness',
        'paralysis','numbness','restlessness','chills','sweat','clenching','somatic','bodyload',
        'temperature','digestive','physical release'
    ],
    'mystical': [
        'ego','self','unity','divine','spiritual','transcend','infinite','oneness','god','universe',
        'connected','sacred','eternal','death','rebirth','timeless','interconnected','presence','source',
        'void','light','beyond','higher power','awakening','realm','dimension','truth','immortality',
        'ego death','no-self','nirvana','cosmic','transcendence','pure being','karma','light being',
        'soul','heaven','angelic','time distortion','godlike','divinity','portal','third eye',
        'nondual','dissolution','samsara','infinity','entity','timelessness'
    ],
    'temporal': [
        'onset','peak','comedown','duration','timeline','hours','minutes','start','beginning',
        'after','later','build-up','before','end','wave','early','gradual','suddenly',
        'phase','stage','passed','elapsed','over time','rush','fade','linger','moment',
        'slowly','time passed','time distorted','hour mark','entry','exit'
    ]
}

# Flatten
EXP_TERMS_FLAT = {word.lower() for words in experiential_terms.values() for word in words}

# Load SpaCy + stopwords
try:
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])
    if "senter" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    print("SpaCy model loaded.")
except OSError:
    raise RuntimeError("SpaCy model not found. Run: python -m spacy download en_core_web_sm")

# Define stopwords
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
print(f"{len(SPACY_STOPWORDS)} stopwords loaded.")

SpaCy model loaded.
326 stopwords loaded.


In [5]:
# === 3) TEXT CLEANING & SENTENCIZATION (No NER, Lowercase Early) ===

_whitespace_re = re.compile(r"[ \t\v\f]+")
_newlines_re   = re.compile(r"\s*\n\s*")

def clean_text_basic(txt: str) -> str:
    """Lowercase and clean whitespace."""
    if not isinstance(txt, str) or pd.isna(txt):
        return ""
    txt = txt.lower()  # ✅ Lowercase early
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = _newlines_re.sub("\n", txt)
    txt = _whitespace_re.sub(" ", txt).strip()
    return txt

def sentencize_and_clean(raw_text: str, nlp, remove_sw_for_graph=True, remove_sw_for_summary=False):
    """Split into sentences. All text is lowercased."""
    raw_text_clean = clean_text_basic(raw_text)
    doc = nlp.make_doc(raw_text_clean)

    # Segment sentences
    if "senter" in nlp.pipe_names:
        nlp.get_pipe("senter")(doc)
    else:
        doc = nlp(raw_text_clean)

    sents = [s.text.strip() for s in doc.sents if s.text.strip()]

    def remove_stopwords(s):
        d = nlp.make_doc(s)
        tokens = [t.text for t in d if t.is_alpha and t.text not in SPACY_STOPWORDS]
        return " ".join(tokens).strip()

    s_graph = [remove_stopwords(s) for s in sents] if remove_sw_for_graph else sents
    s_summary = sents if not remove_sw_for_summary else [remove_stopwords(s) for s in sents]

    return sents, s_graph, s_summary, raw_text_clean  # all lowercase

In [None]:
# === 4) LOAD DATA & BALANCED SUBSET ===

DATA_PATH = Path(r"final_train_900.csv")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
REQUIRED_COLS = {"report_text", "substance"}
missing = REQUIRED_COLS - set(df.columns)
if missing:
    raise KeyError(f"Missing columns: {missing}")

def _norm_sub(x):
    if not isinstance(x, str): return "OTHER"
    y = x.strip().upper()
    if y in {"DMT"}: return "DMT"
    if y in {"LSD", "ACID"}: return "LSD"
    if y in {"PSILOCYBIN", "PSILOCYBIN MUSHROOM", "MUSHROOM", "MUSHROOMS", "PSILOCYBE"}:
        return "Psilocybin"
    return y

df["_subst_norm"] = df["substance"].map(_norm_sub)
TARGETS = ["DMT", "LSD", "Psilocybin"]
N_PER_CLASS = 100
RANDOM_SEED = 42

dfs = []
for s in TARGETS:
    pool = df[df["_subst_norm"] == s]
    n_pick = min(N_PER_CLASS, len(pool))
    if n_pick < N_PER_CLASS:
        print(f" Only {n_pick} available for {s}")
    dfs.append(pool.sample(n=n_pick, random_state=RANDOM_SEED))

df_sub = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print(f"Balanced subset: {df_sub.shape}")
print(df_sub['_subst_norm'].value_counts())

Balanced subset: (300, 8)
_subst_norm
Psilocybin    100
LSD           100
DMT           100
Name: count, dtype: int64


In [8]:
# === 5) PRECOMPUTE SENTENCES + COSINE MATRICES ===

def _precompute_doc(text, nlp):
    sents, s_graph, s_summary, cleaned = sentencize_and_clean(
        text, nlp,
        remove_sw_for_graph=True,
        remove_sw_for_summary=False
    )
    if len(sents) == 0:
        return [], np.zeros((0,0), dtype=np.float32), cleaned

    vec = TfidfVectorizer(lowercase=False,  
                          stop_words="english",
                          max_features=4000)
    X = vec.fit_transform(s_graph)
    sim = cosine_similarity(X).astype(np.float32)
    np.fill_diagonal(sim, 0.0)
    return s_summary, sim, cleaned  # s_summary = original sentences (with stopwords, lowercase)

print("🧠 Precomputing sentences & similarity matrices...")
cache = []
for text in tqdm(df_sub["report_text"].astype(str).fillna(""), total=len(df_sub)):
    s_render, sim, cleaned = _precompute_doc(text, nlp)
    cache.append({
        "s_render": s_render,
        "sim": sim,
        "doc_clean": cleaned
    })

print(f"Precomputed: {len(cache)} documents")

🧠 Precomputing sentences & similarity matrices...


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [00:18<00:00, 16.32it/s]

Precomputed: 300 documents





In [9]:
# === 6) FAST PAGE RANK & SUMMARY FROM CACHE ===

def _pagerank(P: np.ndarray, damping: float = 0.85, eps: float = 1e-6, max_iter: int = 100) -> np.ndarray:
    n = P.shape[0]
    if n == 0:
        return np.array([])
    v = np.ones(n, dtype=np.float32) / n
    teleport = np.ones(n, dtype=np.float32) / n
    for _ in range(max_iter):
        v_new = damping * P.T.dot(v) + (1 - damping) * teleport
        if np.linalg.norm(v_new - v, ord=1) < eps:
            return v_new
        v = v_new
    return v

def _summary_from_cache(entry, compression_ratio: float, similarity_threshold: float, damping_factor: float):
    s_render = entry["s_render"]
    sim = entry["sim"]
    if len(s_render) == 0 or sim.size == 0:
        return ""

    A = np.where(sim >= similarity_threshold, sim, 0.0).astype(np.float32)
    row_sums = A.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    P = A / row_sums

    scores = _pagerank(P, damping=damping_factor)
    k = max(1, int(round(len(s_render) * compression_ratio)))
    top_idx = np.argsort(-scores)[:k]
    return " ".join(s_render[i] for i in sorted(top_idx)).strip()

In [10]:
# === 7) SCORING FUNCTIONS (No .lower() needed — all text is lowercase) ===

def _tfidf_cosine(a: str, b: str) -> float:
    if not a.strip() or not b.strip():
        return 0.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform([a, b])
        return float(cosine_similarity(X[0], X[1])[0,0])
    except:
        return 0.0

def score_semantic(summary: str, document: str) -> float:
    return _tfidf_cosine(summary, document)

def score_experiential(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    doc_terms = {w for w in EXP_TERMS_FLAT if w in document}  # ✅ No .lower()!
    if not doc_terms:
        return 1.0
    sum_terms = {w for w in EXP_TERMS_FLAT if w in summary}  # ✅ No .lower()!
    return len(doc_terms & sum_terms) / len(doc_terms)

def score_coherence(summary: str) -> float:
    if not summary.strip():
        return 0.0
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    if len(sents) < 2:
        return 1.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform(sents)
        sims = [float(cosine_similarity(X[i], X[i+1])[0,0]) for i in range(len(sents)-1)]
        return float(np.mean(sims)) if sims else 0.0
    except:
        return 0.0

def custom_score(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    sem = score_semantic(summary, document)
    exp = score_experiential(summary, document)
    coh = score_coherence(summary)
    return 0.5*sem + 0.3*exp + 0.2*coh

In [11]:
# === 8) OPTUNA TUNING ===

CR_CHOICES = [0.22, 0.25, 0.28, 0.30]
SIM_CHOICES = [0.15, 0.25, 0.35]
DF_CHOICES = [0.85, 0.90]

EVAL_INTERVAL = max(10, len(cache) // 12)
print(f"Tuning on {len(cache)} docs | Pruning every {EVAL_INTERVAL} docs")

def objective(trial):
    cr = trial.suggest_categorical("compression_ratio", CR_CHOICES)
    sim = trial.suggest_categorical("similarity_threshold", SIM_CHOICES)
    df = trial.suggest_categorical("damping_factor", DF_CHOICES)

    scores = []
    for i, entry in enumerate(cache, 1):
        summary = _summary_from_cache(entry, cr, sim, df)
        score = custom_score(summary, entry["doc_clean"])
        scores.append(score)

        if i % EVAL_INTERVAL == 0:
            trial.report(np.mean(scores), step=i)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return np.mean(scores) if scores else 0.0

sampler = TPESampler(seed=RANDOM_SEED)
pruner = MedianPruner(n_startup_trials=8, n_warmup_steps=0, interval_steps=1)

study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
    study_name="LexRank_M1_cached_subset"
)

study.optimize(objective, n_trials=80, show_progress_bar=True)

best_params = study.best_params
best_score = float(study.best_value)
print("Best params:", best_params)
print("Best score:", round(best_score, 4))

[I 2025-08-10 21:20:20,307] A new study created in memory with name: LexRank_M1_cached_subset


Tuning on 300 docs | Pruning every 25 docs


  0%|          | 0/80 [00:00<?, ?it/s]

[I 2025-08-10 21:20:25,764] Trial 0 finished with value: 0.4285565307919664 and parameters: {'compression_ratio': 0.25, 'similarity_threshold': 0.15, 'damping_factor': 0.85}. Best is trial 0 with value: 0.4285565307919664.
[I 2025-08-10 21:20:31,835] Trial 1 finished with value: 0.4498540462281225 and parameters: {'compression_ratio': 0.28, 'similarity_threshold': 0.15, 'damping_factor': 0.9}. Best is trial 1 with value: 0.4498540462281225.
[I 2025-08-10 21:20:37,535] Trial 2 finished with value: 0.4107405257508243 and parameters: {'compression_ratio': 0.28, 'similarity_threshold': 0.35, 'damping_factor': 0.85}. Best is trial 1 with value: 0.4498540462281225.
[I 2025-08-10 21:20:43,472] Trial 3 finished with value: 0.42813820900630123 and parameters: {'compression_ratio': 0.3, 'similarity_threshold': 0.35, 'damping_factor': 0.85}. Best is trial 1 with value: 0.4498540462281225.
[I 2025-08-10 21:20:49,350] Trial 4 finished with value: 0.4234385439530432 and parameters: {'compression_rat

[I 2025-08-10 21:24:03,082] Trial 46 pruned. 
[I 2025-08-10 21:24:03,626] Trial 47 pruned. 
[I 2025-08-10 21:24:09,823] Trial 48 finished with value: 0.4651380985266547 and parameters: {'compression_ratio': 0.3, 'similarity_threshold': 0.15, 'damping_factor': 0.85}. Best is trial 35 with value: 0.4651380985266547.
[I 2025-08-10 21:24:10,274] Trial 49 pruned. 
[I 2025-08-10 21:24:16,549] Trial 50 finished with value: 0.4651380985266547 and parameters: {'compression_ratio': 0.3, 'similarity_threshold': 0.15, 'damping_factor': 0.85}. Best is trial 35 with value: 0.4651380985266547.
[I 2025-08-10 21:24:22,834] Trial 51 finished with value: 0.4651380985266547 and parameters: {'compression_ratio': 0.3, 'similarity_threshold': 0.15, 'damping_factor': 0.85}. Best is trial 35 with value: 0.4651380985266547.
[I 2025-08-10 21:24:29,109] Trial 52 finished with value: 0.4651380985266547 and parameters: {'compression_ratio': 0.3, 'similarity_threshold': 0.15, 'damping_factor': 0.85}. Best is trial 3

### Best params: {'compression_ratio': 0.3, 'similarity_threshold': 0.15, 'damping_factor': 0.85}
#### Best score: 0.4651