In [2]:
!pip install datasets pandas spacy contractions tqdm

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m 

In [28]:
import pandas as pd
import re
import contractions
import spacy
from datasets import load_dataset
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

In [29]:
def clean_text(text):
    if pd.isna(text) or text == "": return ""
    text = contractions.fix(str(text))
    text = text.replace("\\n", "\n")
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(user|assistant|bot|therapist|counselor|client)\s*:', '', text, flags=re.I)
    return text.strip()

def decompose_text(text):
    """Splits into atomic chunks so the RAG finds specific thoughts."""
    doc = nlp(clean_text(text))
    atoms = []
    for sent in doc.sents:
        current_chunk = []
        for token in sent:
            if token.dep_ == "cc" and token.text.lower() in ["and", "but", "so"]:
                if current_chunk: atoms.append(" ".join(current_chunk).strip())
                current_chunk = []
            else:
                current_chunk.append(token.text)
        if current_chunk: atoms.append(" ".join(current_chunk).strip())
    return [a for a in atoms if len(a) > 15]

In [30]:
def get_conversational_data():
    print("Loading Conversational Sets...")

    ds_shen = load_dataset("ShenLab/MentalChat16K", split='train')
    df_shen = pd.DataFrame(ds_shen)[['input', 'output']].rename(columns={'input': 'q', 'output': 'a'})

    ds_amod = load_dataset("Amod/mental_health_counseling_conversations", split='train')
    df_amod = pd.DataFrame(ds_amod)[['Context', 'Response']].rename(columns={'Context': 'q', 'Response': 'a'})

    combined = pd.concat([df_shen, df_amod], ignore_index=True)
    rows = []

    for _, row in tqdm(combined.iterrows(), total=len(combined)):
        if "tough one" in str(row['a']).lower() or len(str(row['a'])) < 20:
            continue

        atoms = decompose_text(row['q'])
        for atom in atoms:
            rows.append({
                "text": atom,
                "source": "general_counseling",
                "technique": "empathy-base",
                "response": clean_text(row['a'])
            })
    return pd.DataFrame(rows)

In [31]:
def get_expert_data():
    print("Loading CBT-Bench & Cactus...")
    ds_cbt = load_dataset("Psychotherapy-LLM/CBT-Bench", "core_fine_seed", split="train")
    ds_cactus = load_dataset("LangAGI-Lab/cactus", split="train")

    expert_rows = []

    for entry in ds_cbt:
        content = f"{entry['situation']} {entry['thoughts']}"
        atoms = decompose_text(content)
        insight = f"**Core Belief Identified:** {', '.join(entry['core_belief_fine_grained'])}"
        for atom in atoms:
            expert_rows.append({"text": atom, "source": "CBT-Bench", "technique": "Core-Belief", "response": insight})

    for entry in ds_cactus:
        expert_rows.append({
            "text": clean_text(entry['thought']),
            "source": "Cactus-Strategy",
            "technique": entry['cbt_technique'],
            "response": f"**CBT Plan:**\n{entry['cbt_plan']}"
        })
        diag = entry['dialogue']
        for i in range(len(diag)-1):
            if "Client:" in diag[i] and "Counselor:" in diag[i+1]:
                expert_rows.append({
                    "text": clean_text(diag[i]),
                    "source": "Cactus-Dialogue",
                    "technique": entry['cbt_technique'],
                    "response": clean_text(diag[i+1])
                })

    return pd.DataFrame(expert_rows)

In [32]:
def get_protocols():
    protocols = []
    p5_steps = ["Presenting Problem", "Predisposing", "Precipitating", "Perpetuating", "Protective"]
    for i, p in enumerate(p5_steps, 1):
        protocols.append({
            "text": f"How to do 5Ps Step {i}: {p}",
            "source": "CBT-Manual",
            "technique": "5Ps-Formulation",
            "response": f"**5Ps Step {i}: {p}**. Let's identify this together."
        })
    col7_steps = ["Situation", "Mood", "Auto-Thought", "Evidence For", "Evidence Against", "Alt Perspective", "Rerate Mood"]
    for i, c in enumerate(col7_steps, 1):
        protocols.append({
            "text": f"Thought Record Column {i}: {c}",
            "source": "CBT-Manual",
            "technique": "7-Column-Record",
            "response": f"**7-Column Step {i}: {c}**. Please describe this part of your experience."
        })
    return pd.DataFrame(protocols)

In [33]:
df1 = get_conversational_data()
df2 = get_expert_data()
df3 = get_protocols()

final_kb = pd.concat([df1, df2, df3], ignore_index=True)

final_kb['response'] = final_kb['response'].apply(lambda x: x.replace("\\n", "\n") if isinstance(x, str) else x)
final_kb = final_kb.drop_duplicates(subset=['text']).reset_index(drop=True)
final_kb['chunk_id'] = range(len(final_kb))

print(f"Final Count: {len(final_kb)}")
final_kb.to_csv("clinical_kb.csv", index=False)

Loading Conversational Sets...


100%|██████████| 19596/19596 [10:11<00:00, 32.04it/s]


Loading CBT-Bench & Cactus...
Final Count: 140289


In [34]:
final_kb

Unnamed: 0,text,source,technique,response,chunk_id
0,I have been struggling with my mental health f...,general_counseling,empathy-base,I understand that you have been dealing with a...,0
1,I can not seem to find a way to cope with it .,general_counseling,empathy-base,I understand that you have been dealing with a...,1
2,"I have tried visualization , positive thinking ,",general_counseling,empathy-base,I understand that you have been dealing with a...,2
3,"even medication ,",general_counseling,empathy-base,I understand that you have been dealing with a...,3
4,nothing seems to work .,general_counseling,empathy-base,I understand that you have been dealing with a...,4
...,...,...,...,...,...
140284,Thought Record Column 3: Auto-Thought,CBT-Manual,7-Column-Record,**7-Column Step 3: Auto-Thought**. Please desc...,140284
140285,Thought Record Column 4: Evidence For,CBT-Manual,7-Column-Record,**7-Column Step 4: Evidence For**. Please desc...,140285
140286,Thought Record Column 5: Evidence Against,CBT-Manual,7-Column-Record,**7-Column Step 5: Evidence Against**. Please ...,140286
140287,Thought Record Column 6: Alt Perspective,CBT-Manual,7-Column-Record,**7-Column Step 6: Alt Perspective**. Please d...,140287


In [35]:
final_kb["technique"].unique()

array(['empathy-base', 'Core-Belief', 'Decatastrophizing',
       'Alternative Perspective', 'Reality Testing',
       'Evidence-Based Questioning', 'Behavior Experiment',
       'Changing Rules to Wishes', 'Efficiency Evaluation',
       'Problem-Solving Skills Training', 'Pros and Cons Analysis',
       'Activity Scheduling', 'Thought Experiment',
       'Self-Assertiveness Training', 'Continuum Technique',
       'Reframes: Alternative Perspective',
       '-Decatastrophizing \n-Alternative Perspective \n-Thought Experiment',
       'Alternative perspective', '5Ps-Formulation', '7-Column-Record'],
      dtype=object)