In [1]:
# === 0) INSTALL REQUIRED LIBRARIES ===
!pip install sentence-transformers spacy scikit-learn optuna pandas numpy tqdm

# Download SpaCy model
!python -m spacy download en_core_web_sm

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.

In [2]:
# === 1) IMPORTS ===
import re
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from pathlib import Path
import json
from tqdm import tqdm
import spacy
import math

print(" Libraries imported.")

 Libraries imported.


In [3]:
# === 2) UPLOAD OR MOUNT DATA ===

# Option A: Upload your CSV
from google.colab import files
uploaded = files.upload()  # Upload final_train_900.csv

# Load it
df = pd.read_csv("final_train_900.csv")  # change if filename differs

Saving final_train_900.csv to final_train_900.csv


In [4]:
# === 3) BALANCED SUBSET: 100 x DMT, LSD, Psilocybin ===

# Normalize substance names
def _norm_sub(x):
    if not isinstance(x, str): return "OTHER"
    y = x.strip().upper()
    if y in {"DMT"}: return "DMT"
    if y in {"LSD", "ACID"}: return "LSD"
    if y in {"PSILOCYBIN", "PSILOCYBIN MUSHROOM", "MUSHROOM", "MUSHROOMS", "PSILOCYBE"}:
        return "Psilocybin"
    return y

df["_subst_norm"] = df["substance"].map(_norm_sub)

# Select targets
TARGETS = ["DMT", "LSD", "Psilocybin"]
N_PER_CLASS = 100
RANDOM_SEED = 42

dfs = []
for s in TARGETS:
    pool = df[df["_subst_norm"] == s]
    n_pick = min(N_PER_CLASS, len(pool))
    if n_pick < N_PER_CLASS:
        print(f" Only {n_pick} available for {s}")
    dfs.append(pool.sample(n=n_pick, random_state=RANDOM_SEED))

df_sub = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print(f"Balanced subset: {len(df_sub)} reports")
print(df_sub['_subst_norm'].value_counts())

Balanced subset: 300 reports
_subst_norm
Psilocybin    100
LSD           100
DMT           100
Name: count, dtype: int64


In [5]:
# === 4) EXPERIENTIAL TERMS (for scoring) ===

experiential_terms = {
    'emotional': [
        'felt','feeling','emotion','joy','fear','anxiety','bliss','love','terror','peace','calm',
        'excited','overwhelmed','gratitude','euphoria','sadness','longing','crying','ecstasy','relief',
        'compassion','grief','awe','anger','release','hope','despair','serenity','agitation','comfort',
        'purging','vulnerability','intimacy','empathy','tension','melancholy','abandon','appreciation'
    ],
    'sensory': [
        'visual','hear','sound','color','bright','pattern','geometry','music','taste','smell','see',
        'saw','colors','sounds','shapes','textures','movement','melting','vibrations','pulsing',
        'fractal','echo','flashing','tunnel','fluid','shimmering','sparkling','synesthesia',
        'auditory','trails','glow','hallucination','pulsate','distortion','radiance','static',
        'blurred','lightness','glimmer','resonance','tactile','kaleidoscopic'
    ],
    'cognitive': [
        'thought','mind','consciousness','aware','realize','understand','insight','clarity','confused',
        'clear','thinking','perception','concepts','identity','ego','dissolve','looping','logic',
        'recognition','belief','interpretation','memory','language','narrative','meaning','mindspace',
        'headspace','overthinking','mental','clarification','self-talk','rational','intellect',
        'philosophical','metacognition','rumination','stream of consciousness','inner dialogue',
        'cognitive dissonance','hyperfocus'
    ],
    'physical': [
        'body','skin','breath','heart','energy','vibration','tingling','warm','heavy','light','pressure',
        'sensation','nausea','shaking','sweating','floating','stillness','tightness','spasm','motion',
        'trembling','cold','breathing','heartbeat','twitching','dry mouth','muscles','stiffness',
        'paralysis','numbness','restlessness','chills','sweat','clenching','somatic','bodyload',
        'temperature','digestive','physical release'
    ],
    'mystical': [
        'ego','self','unity','divine','spiritual','transcend','infinite','oneness','god','universe',
        'connected','sacred','eternal','death','rebirth','timeless','interconnected','presence','source',
        'void','light','beyond','higher power','awakening','realm','dimension','truth','immortality',
        'ego death','no-self','nirvana','cosmic','transcendence','pure being','karma','light being',
        'soul','heaven','angelic','time distortion','godlike','divinity','portal','third eye',
        'nondual','dissolution','samsara','infinity','entity','timelessness'
    ],
    'temporal': [
        'onset','peak','comedown','duration','timeline','hours','minutes','start','beginning',
        'after','later','build-up','before','end','wave','early','gradual','suddenly',
        'phase','stage','passed','elapsed','over time','rush','fade','linger','moment',
        'slowly','time passed','time distorted','hour mark','entry','exit'
    ]
}

# Flatten to lowercase set
EXP_TERMS_FLAT = {word.lower() for words in experiential_terms.values() for word in words}
print(f" Loaded {len(EXP_TERMS_FLAT)} experiential terms.")

 Loaded 240 experiential terms.


In [6]:
# === 5) TEXT CLEANING & SENTENCE UTILS ===

_whitespace_re = re.compile(r"[ \t\v\f]+")
_newlines_re   = re.compile(r"\s*\n\s*")

def clean_text_basic(txt: str) -> str:
    if not isinstance(txt, str) or pd.isna(txt):
        return ""
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = _newlines_re.sub("\n", txt)
    txt = _whitespace_re.sub(" ", txt).strip()
    return txt.lower()

def tokenize(text: str) -> list:
    return re.findall(r'\b\w+\b', text)

def detokenize(tokens: list) -> str:
    return " ".join(tokens)

def chunk_text_by_tokens(text: str, max_len: int = 512, stride: int = 384):
    tokens = tokenize(text)
    if len(tokens) <= max_len:
        return [text]
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_len
        chunk_tokens = tokens[start:end]
        chunks.append(detokenize(chunk_tokens))
        if end >= len(tokens):
            break
        start += stride
    return chunks

# Load SpaCy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])
if "senter" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

def extract_sentences(text: str) -> list:
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def reassemble_sentences(chunks: list) -> list:
    seen = set()
    sentences = []
    for chunk in chunks:
        sents = extract_sentences(chunk)
        for sent in sents:
            if sent not in seen:
                seen.add(sent)
                sentences.append(sent)
    return sentences

In [7]:
# === 6) PRE-SPLIT SENTENCES WITH CHUNKING ===

print(" Pre-splitting sentences with 512-token chunks (384 stride)...")
sents_cache = []

for text in tqdm(df_sub["report_text"].astype(str).fillna(""), total=len(df_sub)):
    cleaned = clean_text_basic(text)
    chunks = chunk_text_by_tokens(cleaned, max_len=512, stride=384)
    sentences = reassemble_sentences(chunks)
    sents_cache.append({
        "sentences": sentences,
        "doc_clean": cleaned
    })

print(f" Built sentence cache: {len(sents_cache)} reports")

 Pre-splitting sentences with 512-token chunks (384 stride)...


100%|██████████| 300/300 [00:20<00:00, 14.39it/s]

 Built sentence cache: 300 reports





In [8]:
# === 7) LOAD SBERT MODEL (Uses GPU if available) ===

print("📥 Loading SBERT model: all-mpnet-base-v2...")
model = SentenceTransformer("all-mpnet-base-v2")
print(f"✅ SBERT loaded. Using device: {model.device}")

📥 Loading SBERT model: all-mpnet-base-v2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ SBERT loaded. Using device: cuda:0


In [9]:
# === 8) SBERT-BASED MMR SUMMARY ===

def sbert_mmr_summary(
    sentences: list,
    relevance_weight: float = 0.7,
    diversity_weight: float = 0.3,
    position_bias: float = 0.1,
    similarity_threshold: float = 0.3,
    top_k_ratio: float = 0.25,
    agg: str = "mean"
) -> str:
    if len(sentences) == 0:
        return ""
    if len(sentences) == 1:
        return sentences[0]

    embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False)

    # Document-level query
    if agg == "mean":
        query_vec = embeddings.mean(axis=0, keepdims=True)
    elif agg == "first":
        query_vec = embeddings[:1]
    else:
        query_vec = embeddings.mean(axis=0, keepdims=True)

    relevance = cosine_similarity(embeddings, query_vec).flatten()

    # Position bias
    pos_weights = np.exp(-position_bias * np.arange(len(sentences)))
    relevance = relevance * pos_weights

    # Filter low-similarity sentences
    high_rel_mask = relevance >= similarity_threshold
    if not high_rel_mask.any():
        k = max(1, int(round(len(sentences) * top_k_ratio)))
        top_idx = np.argsort(-relevance)[:k]
    else:
        candidates = np.where(high_rel_mask)[0]
        k = max(1, int(round(len(sentences) * top_k_ratio)))
        k = min(k, len(candidates))

        selected = []
        candidate_list = candidates.tolist()

        while len(selected) < k and candidate_list:
            if not selected:
                best_idx = candidate_list[np.argmax(relevance[candidate_list])]
            else:
                sel_embs = embeddings[selected]
                diversity = np.min(cosine_similarity(embeddings[candidate_list], sel_embs), axis=1)
                mmr_score = relevance[candidate_list] * relevance_weight - diversity * diversity_weight
                best_local = np.argmax(mmr_score)
                best_idx = candidate_list[best_local]
            selected.append(best_idx)
            candidate_list.remove(best_idx)
        top_idx = sorted(selected)

    return " ".join(sentences[i] for i in top_idx)

In [10]:
# === 9) SBERT-BASED SCORING ===

def score_semantic(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    emb_s = model.encode([summary], convert_to_numpy=True)
    emb_d = model.encode([document], convert_to_numpy=True)
    return float(cosine_similarity(emb_s, emb_d)[0,0])

def score_experiential(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    doc_terms = {w for w in EXP_TERMS_FLAT if w in document}
    if not doc_terms:
        return 1.0
    sum_terms = {w for w in EXP_TERMS_FLAT if w in summary}
    return len(doc_terms & sum_terms) / len(doc_terms)

def score_coherence(summary: str) -> float:
    if not summary.strip():
        return 0.0
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    if len(sents) < 2:
        return 1.0
    emb = model.encode(sents, convert_to_numpy=True)
    sims = [cosine_similarity(emb[i:i+1], emb[i+1:i+2])[0,0] for i in range(len(sents)-1)]
    return float(np.mean(sims)) if sims else 0.0

def custom_score(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    sem = score_semantic(summary, document)
    exp = score_experiential(summary, document)
    coh = score_coherence(summary)
    return 0.5*sem + 0.3*exp + 0.2*coh

In [11]:
# === 10) OPTUNA TUNING ===

RELEVANCE_WEIGHT = [0.6, 0.7, 0.8]
DIVERSITY_WEIGHT = [0.2, 0.3, 0.4, 0.5]
POSITION_BIAS = [0.05, 0.10, 0.15, 0.20]
SIM_THRESHOLD = [0.25, 0.35, 0.45, 0.55, 0.60]
AGG_CHOICES = ["mean", "first"]

EVAL_INTERVAL = max(10, len(sents_cache) // 12)
print(f" Tuning on {len(sents_cache)} reports | Pruning every {EVAL_INTERVAL} docs")

def objective(trial):
    rel_weight = trial.suggest_categorical("relevance_weight", RELEVANCE_WEIGHT)
    div_weight = trial.suggest_categorical("diversity_weight", DIVERSITY_WEIGHT)
    pos_bias = trial.suggest_categorical("position_bias", POSITION_BIAS)
    sim_thresh = trial.suggest_categorical("similarity_threshold", SIM_THRESHOLD)
    agg = trial.suggest_categorical("agg", AGG_CHOICES)

    scores = []
    for i, entry in enumerate(sents_cache, 1):
        summary = sbert_mmr_summary(
            sentences=entry["sentences"],
            relevance_weight=rel_weight,
            diversity_weight=div_weight,
            position_bias=pos_bias,
            similarity_threshold=sim_thresh,
            top_k_ratio=0.25,
            agg=agg
        )
        score = custom_score(summary, entry["doc_clean"])
        scores.append(score)

        if i % EVAL_INTERVAL == 0:
            trial.report(np.mean(scores), step=i)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return np.mean(scores) if scores else 0.0

# Study
sampler = TPESampler(seed=42)
pruner = MedianPruner(n_startup_trials=10, n_warmup_steps=0, interval_steps=1)

study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
    study_name="SBERT_MMR_Tuned"
)

study.optimize(objective, n_trials=60, show_progress_bar=True)

best_params = study.best_params
best_score = float(study.best_value)

print(" Best Params:")
print(json.dumps(best_params, indent=2))
print(" Best Custom Score:", round(best_score, 4))

[I 2025-08-10 17:27:36,234] A new study created in memory with name: SBERT_MMR_Tuned


 Tuning on 300 reports | Pruning every 25 docs


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-08-10 17:28:23,143] Trial 0 finished with value: 0.8564371340904783 and parameters: {'relevance_weight': 0.7, 'diversity_weight': 0.2, 'position_bias': 0.05, 'similarity_threshold': 0.25, 'agg': 'first'}. Best is trial 0 with value: 0.8564371340904783.
[I 2025-08-10 17:29:12,077] Trial 1 finished with value: 0.83603723294594 and parameters: {'relevance_weight': 0.8, 'diversity_weight': 0.5, 'position_bias': 0.05, 'similarity_threshold': 0.6, 'agg': 'mean'}. Best is trial 0 with value: 0.8564371340904783.
[I 2025-08-10 17:30:01,058] Trial 2 finished with value: 0.8358090278177349 and parameters: {'relevance_weight': 0.8, 'diversity_weight': 0.4, 'position_bias': 0.05, 'similarity_threshold': 0.55, 'agg': 'mean'}. Best is trial 0 with value: 0.8564371340904783.
[I 2025-08-10 17:30:49,930] Trial 3 finished with value: 0.8478560475393718 and parameters: {'relevance_weight': 0.7, 'diversity_weight': 0.5, 'position_bias': 0.1, 'similarity_threshold': 0.6, 'agg': 'mean'}. Best is tria

 Best Params:
{
  "relevance_weight": 0.6,
  "diversity_weight": 0.2,
  "position_bias": 0.05,
  "similarity_threshold": 0.25,
  "agg": "first"
}
 Best Custom Score: 0.8565

In [13]:
#!mkdir -p /content/drive/MyDrive/sbert-models
#model.save("/content/drive/MyDrive/sbert-models/sbert-mpnet-psychoactive-v1")