In [1]:
!pip install sentence-transformers spacy scikit-learn optuna pandas numpy tqdm matplotlib seaborn
!python -m spacy download en_core_web_sm

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.

In [2]:
import re
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files
import json
from tqdm import tqdm
import spacy

In [3]:
uploaded = files.upload()  # Upload: dmt_test_100.csv, lsd_test_100.csv, mushroom_test_100.csv

Saving dmt_test_100.csv to dmt_test_100.csv
Saving lsd_test_100.csv to lsd_test_100.csv
Saving mushroom_test_100.csv to mushroom_test_100.csv


In [4]:
# List of expected files
expected_files = {
    "DMT": "dmt_test_100.csv",
    "LSD": "lsd_test_100.csv",
    "Psilocybin": "mushroom_test_100.csv"
}

# Check if all files are uploaded
missing = [name for name in expected_files.values() if name not in uploaded]
if missing:
    raise FileNotFoundError(f"Missing files: {missing}")
else:
    print(" All test files uploaded.")

 All test files uploaded.


In [5]:
experiential_terms = {
    'emotional': [
        'felt','feeling','emotion','joy','fear','anxiety','bliss','love','terror','peace','calm',
        'excited','overwhelmed','gratitude','euphoria','sadness','longing','crying','ecstasy','relief',
        'compassion','grief','awe','anger','release','hope','despair','serenity','agitation','comfort',
        'purging','vulnerability','intimacy','empathy','tension','melancholy','abandon','appreciation'
    ],
    'sensory': [
        'visual','hear','sound','color','bright','pattern','geometry','music','taste','smell','see',
        'saw','colors','sounds','shapes','textures','movement','melting','vibrations','pulsing',
        'fractal','echo','flashing','tunnel','fluid','shimmering','sparkling','synesthesia',
        'auditory','trails','glow','hallucination','pulsate','distortion','radiance','static',
        'blurred','lightness','glimmer','resonance','tactile','kaleidoscopic'
    ],
    'cognitive': [
        'thought','mind','consciousness','aware','realize','understand','insight','clarity','confused',
        'clear','thinking','perception','concepts','identity','ego','dissolve','looping','logic',
        'recognition','belief','interpretation','memory','language','narrative','meaning','mindspace',
        'headspace','overthinking','mental','clarification','self-talk','rational','intellect',
        'philosophical','metacognition','rumination','stream of consciousness','inner dialogue',
        'cognitive dissonance','hyperfocus'
    ],
    'physical': [
        'body','skin','breath','heart','energy','vibration','tingling','warm','heavy','light','pressure',
        'sensation','nausea','shaking','sweating','floating','stillness','tightness','spasm','motion',
        'trembling','cold','breathing','heartbeat','twitching','dry mouth','muscles','stiffness',
        'paralysis','numbness','restlessness','chills','sweat','clenching','somatic','bodyload',
        'temperature','digestive','physical release'
    ],
    'mystical': [
        'ego','self','unity','divine','spiritual','transcend','infinite','oneness','god','universe',
        'connected','sacred','eternal','death','rebirth','timeless','interconnected','presence','source',
        'void','light','beyond','higher power','awakening','realm','dimension','truth','immortality',
        'ego death','no-self','nirvana','cosmic','transcendence','pure being','karma','light being',
        'soul','heaven','angelic','time distortion','godlike','divinity','portal','third eye',
        'nondual','dissolution','samsara','infinity','entity','timelessness'
    ],
    'temporal': [
        'onset','peak','comedown','duration','timeline','hours','minutes','start','beginning',
        'after','later','build-up','before','end','wave','early','gradual','suddenly',
        'phase','stage','passed','elapsed','over time','rush','fade','linger','moment',
        'slowly','time passed','time distorted','hour mark','entry','exit'
    ]
}

EXP_TERMS_FLAT = {word.lower() for words in experiential_terms.values() for word in words}
print(f" Loaded {len(EXP_TERMS_FLAT)} experiential terms.")

 Loaded 240 experiential terms.


In [6]:
_whitespace_re = re.compile(r"[ \t\v\f]+")
_newlines_re   = re.compile(r"\s*\n\s*")

def clean_text_basic(txt: str) -> str:
    if not isinstance(txt, str) or pd.isna(txt):
        return ""
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = _newlines_re.sub("\n", txt)
    txt = _whitespace_re.sub(" ", txt).strip()
    return txt.lower()

def tokenize(text: str) -> list:
    return re.findall(r'\b\w+\b', text)

def detokenize(tokens: list) -> str:
    return " ".join(tokens)

def chunk_text_by_tokens(text: str, max_len: int = 512, stride: int = 384):
    tokens = tokenize(text)
    if len(tokens) <= max_len:
        return [text]
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_len
        chunk_tokens = tokens[start:end]
        chunks.append(detokenize(chunk_tokens))
        if end >= len(tokens):
            break
        start += stride
    return chunks

# Load SpaCy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])
if "senter" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

def extract_sentences(text: str) -> list:
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def reassemble_sentences(chunks: list) -> list:
    seen = set()
    sentences = []
    for chunk in chunks:
        sents = extract_sentences(chunk)
        for sent in sents:
            if sent not in seen:
                seen.add(sent)
                sentences.append(sent)
    return sentences

In [7]:
model = SentenceTransformer("all-mpnet-base-v2")
print(f" SBERT loaded. Using device: {model.device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 SBERT loaded. Using device: cuda:0


In [8]:
def sbert_mmr_summary(
    sentences: list,
    relevance_weight: float = 0.6,
    diversity_weight: float = 0.2,
    position_bias: float = 0.05,
    similarity_threshold: float = 0.25,
    top_k_ratio: float = 0.25,
    agg: str = "first"
) -> str:
    if len(sentences) == 0:
        return ""
    if len(sentences) == 1:
        return sentences[0]

    embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False)

    # Document-level query
    if agg == "mean":
        query_vec = embeddings.mean(axis=0, keepdims=True)
    elif agg == "first":
        query_vec = embeddings[:1]  # Use first sentence as query
    else:
        query_vec = embeddings.mean(axis=0, keepdims=True)

    relevance = cosine_similarity(embeddings, query_vec).flatten()

    # Position bias
    pos_weights = np.exp(-position_bias * np.arange(len(sentences)))
    relevance = relevance * pos_weights

    # Filter low-similarity sentences
    high_rel_mask = relevance >= similarity_threshold
    if not high_rel_mask.any():
        k = max(1, int(round(len(sentences) * top_k_ratio)))
        top_idx = np.argsort(-relevance)[:k]
    else:
        candidates = np.where(high_rel_mask)[0]
        k = max(1, int(round(len(sentences) * top_k_ratio)))
        k = min(k, len(candidates))

        selected = []
        candidate_list = candidates.tolist()

        while len(selected) < k and candidate_list:
            if not selected:
                best_idx = candidate_list[np.argmax(relevance[candidate_list])]
            else:
                sel_embs = embeddings[selected]
                diversity = np.min(cosine_similarity(embeddings[candidate_list], sel_embs), axis=1)
                mmr_score = relevance[candidate_list] * relevance_weight - diversity * diversity_weight
                best_local = np.argmax(mmr_score)
                best_idx = candidate_list[best_local]
            selected.append(best_idx)
            candidate_list.remove(best_idx)
        top_idx = sorted(selected)

    return " ".join(sentences[i] for i in top_idx)

In [9]:
def score_semantic(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    emb_s = model.encode([summary], convert_to_numpy=True)
    emb_d = model.encode([document], convert_to_numpy=True)
    return float(cosine_similarity(emb_s, emb_d)[0,0])

def score_experiential(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    doc_terms = {w for w in EXP_TERMS_FLAT if w in document}
    if not doc_terms:
        return 1.0
    sum_terms = {w for w in EXP_TERMS_FLAT if w in summary}
    return len(doc_terms & sum_terms) / len(doc_terms)

def score_coherence(summary: str) -> float:
    if not summary.strip():
        return 0.0
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    if len(sents) < 2:
        return 1.0
    emb = model.encode(sents, convert_to_numpy=True)
    sims = [cosine_similarity(emb[i:i+1], emb[i+1:i+2])[0,0] for i in range(len(sents)-1)]
    return float(np.mean(sims)) if sims else 0.0

def custom_score(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    sem = score_semantic(summary, document)
    exp = score_experiential(summary, document)
    coh = score_coherence(summary)
    return 0.5*sem + 0.3*exp + 0.2*coh


In [10]:
best_params = {
    "relevance_weight": 0.6,
    "diversity_weight": 0.2,
    "position_bias": 0.05,
    "similarity_threshold": 0.25,
    "agg": "first",
    "top_k_ratio": 0.25
}


In [11]:
results = []

for substance, filename in expected_files.items():
    print(f"\n Processing {substance} from {filename}...")
    df = pd.read_csv(filename)

    if "report_text" not in df.columns:
        raise KeyError(f"Missing 'report_text' in {filename}")

    print(f"Loaded {len(df)} reports for {substance}")

    summaries = []
    semantic_scores = []
    experiential_scores = []
    coherence_scores = []
    final_scores = []

    for text in tqdm(df["report_text"].astype(str).fillna(""), total=len(df), desc=f"Summarizing {substance}"):
        cleaned = clean_text_basic(text)
        chunks = chunk_text_by_tokens(cleaned, max_len=512, stride=384)
        sentences = reassemble_sentences(chunks)

        summary = sbert_mmr_summary(sentences, **best_params)
        summaries.append(summary)

        sem = score_semantic(summary, cleaned)
        exp = score_experiential(summary, cleaned)
        coh = score_coherence(summary)
        final = custom_score(summary, cleaned)

        semantic_scores.append(sem)
        experiential_scores.append(exp)
        coherence_scores.append(coh)
        final_scores.append(final)

    # Add summary column
    df["summary"] = summaries
    output_filename = filename.replace(".csv", "_with_summary.csv")
    df.to_csv(output_filename, index=False)
    files.download(output_filename)  # Auto-download in Colab
    print(f" Saved and downloaded: {output_filename}")

    # Aggregate scores
    avg_sem = np.mean(semantic_scores)
    avg_exp = np.mean(experiential_scores)
    avg_coh = np.mean(coherence_scores)
    avg_final = np.mean(final_scores)

    results.append({
        "Model": "M3",
        "Substance": substance,
        "Semantic (TF-IDF/SBERT)": f"{avg_sem:.2f} (SBERT)",
        "Experiential": f"{avg_exp:.2f}",
        "Coherence (TF-IDF/SBERT)": f"{avg_coh:.2f} (SBERT)",
        "Final Score": f"{avg_final:.2f}"
    })

    print(f" {substance} - Avg Final Score: {avg_final:.3f}")


 Processing DMT from dmt_test_100.csv...
Loaded 100 reports for DMT


Summarizing DMT: 100%|██████████| 100/100 [00:26<00:00,  3.83it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 Saved and downloaded: dmt_test_100_with_summary.csv
 DMT - Avg Final Score: 0.884

 Processing LSD from lsd_test_100.csv...
Loaded 100 reports for LSD


Summarizing LSD: 100%|██████████| 100/100 [00:31<00:00,  3.19it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 Saved and downloaded: lsd_test_100_with_summary.csv
 LSD - Avg Final Score: 0.847

 Processing Psilocybin from mushroom_test_100.csv...
Loaded 100 reports for Psilocybin


Summarizing Psilocybin: 100%|██████████| 100/100 [00:32<00:00,  3.06it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 Saved and downloaded: mushroom_test_100_with_summary.csv
 Psilocybin - Avg Final Score: 0.839


In [12]:
results_df = pd.DataFrame(results)
print("\n" + "="*70)
print(" FINAL RESULTS: SBERT+MMR (M3) ON TEST SET")
print("="*70)
print(results_df.to_string(index=False))


📊 FINAL RESULTS: SBERT+MMR (M3) ON TEST SET
Model  Substance Semantic (TF-IDF/SBERT) Experiential Coherence (TF-IDF/SBERT) Final Score
   M3        DMT            0.95 (SBERT)         0.70             0.99 (SBERT)        0.88
   M3        LSD            0.94 (SBERT)         0.59             0.99 (SBERT)        0.85
   M3 Psilocybin            0.95 (SBERT)         0.56             0.99 (SBERT)        0.84


In [13]:
# Save and download results
results_df.to_csv("sbert_mmr_test_scores.csv", index=False)
files.download("sbert_mmr_test_scores.csv")
print(f"\n Results table downloaded: sbert_mmr_test_scores.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Results table downloaded: sbert_mmr_test_scores.csv
