In [50]:
# 🧠 Step 1: Setup & Imports
import json
import random
from pathlib import Path
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import ClassifierMixin

In [33]:
# Step 2: pull the training split of MentalChat16K from the Hugging Face Hub
# and show which columns each row provides.
ds = load_dataset(path="ShenLab/MentalChat16K", split="train")
print(ds.column_names)

['instruction', 'input', 'output']


In [40]:
def load_conversation_data_from_hf(ds):
    """Convert dataset rows into (context, response) pairs plus a parallel tag list.

    Every row of ``ds`` is converted; no rows are skipped.

    Args:
        ds: Iterable of mapping-like rows exposing the keys ``"instruction"``,
            ``"input"`` and ``"output"``.

    Returns:
        A ``(pairs, tags)`` tuple. ``pairs`` is a list of
        ``(context, response)`` string tuples, where ``context`` is the
        stripped ``"input"`` or, when that is empty or missing, the stripped
        ``"instruction"``. ``tags`` holds the generic tag ``"mental_health"``
        once per pair.
    """

    def _clean(value):
        # A missing value must count as empty: str(None) would yield the
        # truthy text "None" and defeat the fallback to "instruction".
        return "" if value is None else str(value).strip()

    pairs = []
    for row in ds:
        # If "input" is empty, use "instruction" as context
        context = _clean(row["input"]) or _clean(row["instruction"])
        pairs.append((context, _clean(row["output"])))

    tags = ["mental_health"] * len(pairs)  # generic tag, one per pair
    return pairs, tags

# `ds` is the Hugging Face dataset loaded earlier in the notebook; every row is
# converted into a (context, response) pair, none are skipped.
pairs, tags = load_conversation_data_from_hf(ds)
print(f"Loaded {len(pairs)} conversation pairs.")

Loaded 16084 conversation pairs (after skipping first 100).


In [45]:
# 🧬 Step 3: Embed Patterns & Store in Chroma DB

def chunkify(lst, chunk_size):
    """Lazily yield consecutive slices of ``lst`` holding up to ``chunk_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` yields nothing.
    """
    total = len(lst)
    yield from (
        lst[start:start + chunk_size]
        for start in range(0, total, chunk_size)
    )

# Records are written in chunks of this size rather than in a single call.
# NOTE(review): presumably chosen to stay under Chroma's per-call batch limit — confirm.
batch_size = 5400

# The conversation context is what gets embedded; the reply and tag travel as metadata.
texts = [context for context, _ in pairs]
metadatas = [
    {"response": response, "tag": tag}
    for (_, response), tag in zip(pairs, tags)
]
# Stable, position-based ids: the collection is persisted on disk, so without
# explicit ids every re-run of this cell would append a full duplicate copy of
# the dataset. With fixed ids a re-run overwrites the same records instead.
ids = [f"pair-{index}" for index in range(len(texts))]

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

chroma_db = Chroma(
    collection_name="pandora_conversations",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db"
)

for text_batch, meta_batch, id_batch in zip(
    chunkify(texts, batch_size),
    chunkify(metadatas, batch_size),
    chunkify(ids, batch_size),
):
    chroma_db.add_texts(texts=text_batch, metadatas=meta_batch, ids=id_batch)

print(f"✅ Added all embeddings in batches of {batch_size} to Chroma database.")


✅ Added all embeddings in batches of 5400 to Chroma database.


In [46]:
# 🔍 Step 4: Test Semantic Query
def semantic_response(query, db):
    """Return the stored reply attached to the single closest match for ``query``.

    ``db`` must offer ``similarity_search(query, k=...)`` returning documents
    whose ``metadata`` carries a ``'response'`` entry. A fixed fallback
    sentence is returned when the search produces no match.
    """
    matches = db.similarity_search(query, k=1)
    if not matches:
        return "I'm not sure how to respond to that."
    best_match = matches[0]
    return best_match.metadata['response']

# Smoke test: look up the stored reply for the closest matching conversation.
semantic_response(
    query="I've been very tense recently but heard about meditation for relaxation. Should I try it?",
    db=chroma_db,
)


"Your upbringing and past experiences with meditation have undoubtedly influenced your current relationship with the practice. It's essential to acknowledge the emotional baggage that comes with it and approach meditation with compassion and curiosity. The frustration and agitation you feel when you meditate may be a manifestation of past experiences or unmet expectations. It's important to remember that meditation is not about achieving a particular state or outcome but rather about being present with your thoughts and emotions."

In [53]:
# Step 5: load the tokenizer and the causal LM from the same Phi-3 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    # Half precision reduces the memory needed for the weights.
    torch_dtype=torch.float16,
)

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:04<00:00,  2.42s/it]


In [62]:
# 📦 Step 6: Retrieve Context & Generate Response
# Substrings that mark a query as a possible crisis disclosure. The "suicid"
# stem matches both "suicide" and "suicidal".
_CRISIS_KEYWORDS = ("suicid", "self-harm", "self harm", "kill myself", "end my life")


def _render_system_prompt(contexts):
    """Build the system prompt with the retrieved contexts listed as bullet points."""
    context = "\n".join([f"- {ctx}" for ctx in contexts])
    return f"""You are a compassionate mental health assistant. 
    Consider these insights from similar situations (adapt appropriately):
    {context}
    
    Key principles:
    1. Be wise, empathetic and understanding
    2. Suggest practical steps
    3. Recommend professional help when appropriate
    4. Maintain loving, caring, but direct tone"""


def _build_chat_inputs(query, contexts, max_prompt_tokens):
    """Tokenize the chat prompt, dropping trailing contexts until it fits the token budget."""
    remaining = list(contexts)
    while True:
        messages = [
            {"role": "system", "content": _render_system_prompt(remaining)},
            {"role": "user", "content": query},
        ]
        # return_dict=True makes the tokenizer hand back its own attention mask,
        # so none has to be reconstructed from the pad token id.
        encoded = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        if encoded["input_ids"].shape[1] <= max_prompt_tokens or not remaining:
            return encoded
        # Too long: discard the last (lowest-priority) context and retry. The
        # user's question and the assistant tag at the end of the prompt are
        # never cut, unlike with plain right-side truncation.
        remaining.pop()


def generate_response(query, max_new_tokens=400, temperature=0.7, max_prompt_tokens=512):
    """Generate a reply to ``query`` using similar stored conversations as context.

    Relies on the notebook-level ``chroma_db``, ``tokenizer`` and ``model``.

    Args:
        query: The user's message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        max_prompt_tokens: Token budget for the prompt. Retrieved contexts are
            dropped from the end of the list until the prompt fits. If the
            prompt is still longer with no contexts left, it is sent as is.

    Returns:
        The decoded model reply. A crisis-hotline notice is appended when the
        query contains one of ``_CRISIS_KEYWORDS``.
    """
    results = chroma_db.similarity_search(query, k=3)
    # dict.fromkeys removes duplicates while keeping retrieval order; a set
    # would shuffle the contexts differently from run to run.
    unique_contexts = list(dict.fromkeys(doc.page_content for doc in results))

    encoded = _build_chat_inputs(query, unique_contexts, max_prompt_tokens).to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # eos_token_id is deliberately not overridden, so the stop tokens from the
    # model's own generation config apply.
    # NOTE(review): the Phi-3 instruct checkpoint is expected to list its
    # end-of-turn token there, while tokenizer.eos_token_id alone would not
    # stop at it — confirm against the checkpoint's generation_config.json.
    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,  # Nucleus sampling
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens, not the echoed prompt.
    full_response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Post-processing for clinical safety
    lowered_query = query.lower()
    if any(keyword in lowered_query for keyword in _CRISIS_KEYWORDS):
        disclaimer = "\n\n[Important] If you're having thoughts of harming yourself, please call the National Suicide Prevention Lifeline at 988 (US) or your local crisis hotline immediately."
        return full_response + disclaimer

    return full_response
# Example usage: generate a reply for a sample message and show it.
response = generate_response(
    query="I hate going to lunch at my school. While I stand in line, people call me out of my name and disrespect me."
)
print(response)


It is truly distressful that you feel this way about your experience during daily activities such as having lunch with peers—it’s crucial for everyone involved herein feels respected regardless where or how we engage socially within our environments; schools serve not only academic purposes by fostering growth through education-based learning opportunities alongside interpersonal development amongst students themselves too! Here some suggestions based on key guiding principals mentioned previously which might aid better handling instances occurring while standing queues amidst classroom settings whilst awaitng one`S turn against time constraint imposed thereupon :  👉 **Firstly** – try stepping away momentarily whenever possible without drawing undue attention towards yourself doing so if feasible since isolating oneself occasionally could reduce exposure levels thereby limit incidents happening subsequently… *(Implement gentle deflection)* Second Step*– Remember Self Care 