In [1]:
import re
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import pickle
from tqdm import tqdm

print("Imports done.")

Imports done.


In [2]:
experiential_terms = {
    'emotional': [
        'felt','feeling','emotion','joy','fear','anxiety','bliss','love','terror','peace','calm',
        'excited','overwhelmed','gratitude','euphoria','sadness','longing','crying','ecstasy','relief',
        'compassion','grief','awe','anger','release','hope','despair','serenity','agitation','comfort',
        'purging','vulnerability','intimacy','empathy','tension','melancholy','abandon','appreciation'
    ],
    'sensory': [
        'visual','hear','sound','color','bright','pattern','geometry','music','taste','smell','see',
        'saw','colors','sounds','shapes','textures','movement','melting','vibrations','pulsing',
        'fractal','echo','flashing','tunnel','fluid','shimmering','sparkling','synesthesia',
        'auditory','trails','glow','hallucination','pulsate','distortion','radiance','static',
        'blurred','lightness','glimmer','resonance','tactile','kaleidoscopic'
    ],
    'cognitive': [
        'thought','mind','consciousness','aware','realize','understand','insight','clarity','confused',
        'clear','thinking','perception','concepts','identity','ego','dissolve','looping','logic',
        'recognition','belief','interpretation','memory','language','narrative','meaning','mindspace',
        'headspace','overthinking','mental','clarification','self-talk','rational','intellect',
        'philosophical','metacognition','rumination','stream of consciousness','inner dialogue',
        'cognitive dissonance','hyperfocus'
    ],
    'physical': [
        'body','skin','breath','heart','energy','vibration','tingling','warm','heavy','light','pressure',
        'sensation','nausea','shaking','sweating','floating','stillness','tightness','spasm','motion',
        'trembling','cold','breathing','heartbeat','twitching','dry mouth','muscles','stiffness',
        'paralysis','numbness','restlessness','chills','sweat','clenching','somatic','bodyload',
        'temperature','digestive','physical release'
    ],
    'mystical': [
        'ego','self','unity','divine','spiritual','transcend','infinite','oneness','god','universe',
        'connected','sacred','eternal','death','rebirth','timeless','interconnected','presence','source',
        'void','light','beyond','higher power','awakening','realm','dimension','truth','immortality',
        'ego death','no-self','nirvana','cosmic','transcendence','pure being','karma','light being',
        'soul','heaven','angelic','time distortion','godlike','divinity','portal','third eye',
        'nondual','dissolution','samsara','infinity','entity','timelessness'
    ],
    'temporal': [
        'onset','peak','comedown','duration','timeline','hours','minutes','start','beginning',
        'after','later','build-up','before','end','wave','early','gradual','suddenly',
        'phase','stage','passed','elapsed','over time','rush','fade','linger','moment',
        'slowly','time passed','time distorted','hour mark','entry','exit'
    ]
}

EXP_TERMS_FLAT = {word.lower() for words in experiential_terms.values() for word in words}

# Load SpaCy
try:
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "ner"])
    if "senter" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    print("SpaCy model loaded.")
except OSError:
    raise RuntimeError("SpaCy model not found. Run: python -m spacy download en_core_web_sm")

from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS

SpaCy model loaded.


In [3]:
_whitespace_re = re.compile(r"[ \t\v\f]+")
_newlines_re   = re.compile(r"\s*\n\s*")

def clean_text_basic(txt: str) -> str:
    if not isinstance(txt, str) or pd.isna(txt):
        return ""
    txt = txt.lower()
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    txt = _newlines_re.sub("\n", txt)
    txt = _whitespace_re.sub(" ", txt).strip()
    return txt

def sentencize_and_clean(raw_text: str, nlp, remove_sw_for_graph=True, remove_sw_for_summary=False):
    raw_text_clean = clean_text_basic(raw_text)
    doc = nlp.make_doc(raw_text_clean)
    if "senter" in nlp.pipe_names:
        nlp.get_pipe("senter")(doc)
    else:
        doc = nlp(raw_text_clean)
    sents = [s.text.strip() for s in doc.sents if s.text.strip()]

    def remove_stopwords(s):
        d = nlp.make_doc(s)
        tokens = [t.text for t in d if t.is_alpha and t.text not in SPACY_STOPWORDS]
        return " ".join(tokens).strip()

    s_graph = [remove_stopwords(s) for s in sents] if remove_sw_for_graph else sents
    s_summary = sents if not remove_sw_for_summary else [remove_stopwords(s) for s in sents]

    return sents, s_graph, s_summary, raw_text_clean

In [4]:
def _precompute_doc(text, nlp):
    sents, s_graph, s_summary, cleaned = sentencize_and_clean(
        text, nlp,
        remove_sw_for_graph=True,
        remove_sw_for_summary=False
    )
    if len(sents) == 0:
        return [], np.zeros((0,0), dtype=np.float32), cleaned

    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    X = vec.fit_transform(s_graph)
    sim = cosine_similarity(X).astype(np.float32)
    np.fill_diagonal(sim, 0.0)
    return s_summary, sim, cleaned

In [5]:
def _pagerank(P: np.ndarray, damping: float = 0.85, eps: float = 1e-6, max_iter: int = 100) -> np.ndarray:
    n = P.shape[0]
    if n == 0:
        return np.array([])
    v = np.ones(n, dtype=np.float32) / n
    teleport = np.ones(n, dtype=np.float32) / n
    for _ in range(max_iter):
        v_new = damping * P.T.dot(v) + (1 - damping) * teleport
        if np.linalg.norm(v_new - v, ord=1) < eps:
            return v_new
        v = v_new
    return v

def _summary_from_cache(entry, compression_ratio: float, similarity_threshold: float, damping_factor: float):
    s_render = entry["s_render"]
    sim = entry["sim"]
    if len(s_render) == 0 or sim.size == 0:
        return ""

    A = np.where(sim >= similarity_threshold, sim, 0.0).astype(np.float32)
    row_sums = A.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    P = A / row_sums

    scores = _pagerank(P, damping=damping_factor)
    k = max(1, int(round(len(s_render) * compression_ratio)))
    top_idx = np.argsort(-scores)[:k]
    return " ".join(s_render[i] for i in sorted(top_idx)).strip()

In [6]:
def _tfidf_cosine(a: str, b: str) -> float:
    if not a.strip() or not b.strip():
        return 0.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform([a, b])
        return float(cosine_similarity(X[0], X[1])[0,0])
    except:
        return 0.0

def score_semantic(summary: str, document: str) -> float:
    return _tfidf_cosine(summary, document)

def score_experiential(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    doc_terms = {w for w in EXP_TERMS_FLAT if w in document}
    if not doc_terms:
        return 1.0
    sum_terms = {w for w in EXP_TERMS_FLAT if w in summary}
    return len(doc_terms & sum_terms) / len(doc_terms)

def score_coherence(summary: str) -> float:
    if not summary.strip():
        return 0.0
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
    if len(sents) < 2:
        return 1.0
    vec = TfidfVectorizer(lowercase=False, stop_words="english", max_features=4000)
    try:
        X = vec.fit_transform(sents)
        sims = [float(cosine_similarity(X[i], X[i+1])[0,0]) for i in range(len(sents)-1)]
        return float(np.mean(sims)) if sims else 0.0
    except:
        return 0.0

def custom_score(summary: str, document: str) -> float:
    if not summary or not document:
        return 0.0
    sem = score_semantic(summary, document)
    exp = score_experiential(summary, document)
    coh = score_coherence(summary)
    return 0.5*sem + 0.3*exp + 0.2*coh

In [7]:
best_params = {
    'compression_ratio': 0.3,
    'similarity_threshold': 0.15,
    'damping_factor': 0.85
}

In [None]:
DATA_DIR = Path(r"")
test_files = {
    "DMT": DATA_DIR / "dmt_test_100.csv",
    "LSD": DATA_DIR / "lsd_test_100.csv",
    "Psilocybin": DATA_DIR / "mushroom_test_100.csv"
}

# Normalize substance names
def _norm_sub(x):
    if not isinstance(x, str): return "OTHER"
    y = x.strip().upper()
    if y in {"DMT"}: return "DMT"
    if y in {"LSD", "ACID"}: return "LSD"
    if y in {"PSILOCYBIN", "PSILOCYBIN MUSHROOM", "MUSHROOM", "MUSHROOMS", "PSILOCYBE"}:
        return "Psilocybin"
    return y

results = []

for substance, file_path in test_files.items():
    print(f"\n Processing {substance}...")
    if not file_path.exists():
        print(f" File not found: {file_path}")
        continue

    df = pd.read_csv(file_path)
    if "report_text" not in df.columns:
        raise KeyError(f"Missing 'report_text' in {file_path}")

    df["_subst_norm"] = df["substance"].map(_norm_sub)
    df = df[df["_subst_norm"] == substance].copy()
    print(f"Loaded {len(df)} reports for {substance}")

    # Precompute cache
    cache = []
    for text in df["report_text"].astype(str).fillna("").tolist():
        s_render, sim, cleaned = _precompute_doc(text, nlp)
        cache.append({
            "s_render": s_render,
            "sim": sim,
            "doc_clean": cleaned
        })

    # Generate summaries
    summaries = []
    semantic_scores = []
    experiential_scores = []
    coherence_scores = []
    final_scores = []

    for entry in tqdm(cache, desc=f"Summarizing {substance}", total=len(cache)):
        summary = _summary_from_cache(entry, **best_params)
        summaries.append(summary)

        sem = score_semantic(summary, entry["doc_clean"])
        exp = score_experiential(summary, entry["doc_clean"])
        coh = score_coherence(summary)
        final = custom_score(summary, entry["doc_clean"])

        semantic_scores.append(sem)
        experiential_scores.append(exp)
        coherence_scores.append(coh)
        final_scores.append(final)

    # Add summary to df
    df["summary"] = summaries

    # Save updated df
    output_path = file_path.with_name(file_path.stem + "_with_summary.csv")
    df.to_csv(output_path, index=False)
    print(f"✅ Saved summaries to {output_path}")

    # Aggregate scores
    avg_sem = np.mean(semantic_scores)
    avg_exp = np.mean(experiential_scores)
    avg_coh = np.mean(coherence_scores)
    avg_final = np.mean(final_scores)

    results.append({
        "Model": "M1",  # LexRank
        "Substance": substance,
        "Semantic (TF-IDF/SBERT)": f"{avg_sem:.2f} (TF-IDF)",
        "Experiential": f"{avg_exp:.2f}",
        "Coherence (TF-IDF/SBERT)": f"{avg_coh:.2f} (TF-IDF)",
        "Final Score": f"{avg_final:.2f}"
    })

    print(f"✔️ {substance} - Avg Final Score: {avg_final:.3f}")


📊 Processing DMT...
Loaded 100 reports for DMT


Summarizing DMT: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 21.37it/s]


✅ Saved summaries to D:\GitHub\Psychedelics Summary\Dataset\Train Test\dmt_test_100_with_summary.csv
✔️ DMT - Avg Final Score: 0.443

📊 Processing LSD...
Loaded 100 reports for LSD


Summarizing LSD: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 24.09it/s]


✅ Saved summaries to D:\GitHub\Psychedelics Summary\Dataset\Train Test\lsd_test_100_with_summary.csv
✔️ LSD - Avg Final Score: 0.461

📊 Processing Psilocybin...
Loaded 100 reports for Psilocybin


Summarizing Psilocybin: 100%|████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 21.76it/s]

✅ Saved summaries to D:\GitHub\Psychedelics Summary\Dataset\Train Test\mushroom_test_100_with_summary.csv
✔️ Psilocybin - Avg Final Score: 0.469





In [9]:
results_df = pd.DataFrame(results)
print("\n" + "="*60)
print(" FINAL RESULTS TABLE (FOR REPORTING)")
print("="*60)
print(results_df.to_string(index=False))


 FINAL RESULTS TABLE (FOR REPORTING)
Model  Substance Semantic (TF-IDF/SBERT) Experiential Coherence (TF-IDF/SBERT) Final Score
   M1        DMT           0.63 (TF-IDF)         0.37            0.09 (TF-IDF)        0.44
   M1        LSD           0.65 (TF-IDF)         0.40            0.08 (TF-IDF)        0.46
   M1 Psilocybin           0.66 (TF-IDF)         0.41            0.08 (TF-IDF)        0.47


In [10]:
results_df.to_csv(DATA_DIR / "lexrank_test_scores.csv", index=False)
print(f"\n Results saved to {DATA_DIR / 'lexrank_test_scores.csv'}")


 Results saved to D:\GitHub\Psychedelics Summary\Dataset\Train Test\lexrank_test_scores.csv
