In [None]:
import hashlib
import os
import pickle
import re
from concurrent.futures import ThreadPoolExecutor

import faiss
import gradio as gr
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline

In [None]:
# === Load and clean dataset ===
# Read the archive, keep only rows where every listed column is present,
# renumber the rows from zero, and add a single "title - body" string per row
# (this is the text that gets embedded for retrieval further down).
df = (
    pd.read_csv("20220401_counsel_chat.zip")
    .dropna(subset=["questionText", "questionTitle", "answerText", "therapistInfo", "therapistURL"])
    .reset_index(drop=True)
    .assign(
        combinedQuestion=lambda frame: (
            frame["questionTitle"].str.strip() + " - " + frame["questionText"].str.strip()
        )
    )
)
print(df.head())

   questionID                              questionTitle  \
0           0  Do I have too many issues for counseling?   
1           0  Do I have too many issues for counseling?   
2           0  Do I have too many issues for counseling?   
3           0  Do I have too many issues for counseling?   
4           0  Do I have too many issues for counseling?   

                                        questionText  \
0  I have so many issues to address. I have a his...   
1  I have so many issues to address. I have a his...   
2  I have so many issues to address. I have a his...   
3  I have so many issues to address. I have a his...   
4  I have so many issues to address. I have a his...   

                                        questionLink       topic  \
0  https://counselchat.com/questions/do-i-have-to...  depression   
1  https://counselchat.com/questions/do-i-have-to...  depression   
2  https://counselchat.com/questions/do-i-have-to...  depression   
3  https://counselchat.com/que

In [None]:
# Use the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load from local folders (offline): every model path points into "hf_models/".
embed_model = SentenceTransformer("hf_models/all-mpnet-base-v2")
reranker = CrossEncoder("hf_models/ms-marco-MiniLM-L-6-v2", device=device)
summarizer = pipeline(
    "summarization",
    model="hf_models/distilbart-cnn-12-6",
    # The pipeline is given a device ordinal: 0 when CUDA is available, -1 otherwise.
    device=0 if device == "cuda" else -1,
)

print("All models loaded from local cache.")


Device set to use cpu


All models loaded from local cache.


In [None]:
# === Embedding + FAISS ===
EMBED_PATH = "mpnet_embeddings.pt"
INDEX_PATH = "faiss_index.index"
summary_cache = {}
query_cache = {}

# --- Corpus embeddings -------------------------------------------------------
# Reuse the on-disk cache only if it still has one vector per dataset row.
# A cache produced from a different version of the dataset would silently map
# search hits onto the wrong rows of `df` (or index past its end).
# NOTE: only the row count is compared; edits that keep the row count the same
# are not detected -- delete EMBED_PATH / INDEX_PATH by hand in that case.
embeddings = None
if os.path.exists(EMBED_PATH):
    # map_location="cpu" lets a cache that was written on a GPU machine be
    # loaded on a CPU-only one.
    embeddings = torch.load(EMBED_PATH, map_location="cpu")
    if embeddings.shape[0] != len(df):
        embeddings = None

embeddings_rebuilt = embeddings is None
if embeddings_rebuilt:
    embeddings = embed_model.encode(
        df["combinedQuestion"].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    torch.save(embeddings, EMBED_PATH)

dimension = embeddings.shape[1]

# --- FAISS index -------------------------------------------------------------
# The embeddings are normalised, so the inner-product index ranks by cosine
# similarity. A cached index is trusted only when the embeddings were not just
# recomputed and its size and dimensionality agree with them.
index = None
if not embeddings_rebuilt and os.path.exists(INDEX_PATH):
    index = faiss.read_index(INDEX_PATH)
    if index.ntotal != embeddings.shape[0] or index.d != dimension:
        index = None

if index is None:
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings.cpu().numpy())
    faiss.write_index(index, INDEX_PATH)

In [None]:
# === Helpers ===
def hash_text(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text``.

    The string is encoded with ``str.encode()`` defaults before hashing. The
    digest serves as a compact lookup key for the in-memory caches; it is not
    used for anything security related.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()

def build_psychologytoday_url(name: str) -> str:
    """Turn a free-text therapist label into a PsychologyToday search URL.

    The label is tidied in three steps:

    1. a space is inserted wherever a lowercase letter or digit is directly
       followed by an uppercase letter (so ``"DoeLicensed"`` becomes
       ``"Doe Licensed"``);
    2. digits and slashes are removed;
    3. words that start with an ASCII capital letter are collected.

    The first two such words are joined with ``+`` to form the search term.
    When the label contains no capitalised word, the term ``Therapist`` is
    used instead.
    """
    spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', name)
    without_digits = re.sub(r'[\d/]+', '', spaced)
    name_tokens = re.findall(r'\b[A-Z][a-zA-Z\-]*\b', without_digits)

    # An empty slice is falsy, so the fallback applies exactly when no token was found.
    search_terms = name_tokens[:2] or ["Therapist"]
    return "https://www.psychologytoday.com/us/therapists?search=" + '+'.join(search_terms)


In [None]:
# === Answer formatting ===
def format_answer(idx, row, summary):
    """Render one ranked answer as a Markdown card.

    Args:
        idx: Zero-based rank of the answer; displayed as ``idx + 1``.
        row: A row object exposing ``therapistInfo``, ``topic``, ``views``,
            ``upvotes`` and ``answerText`` as attributes (for example a tuple
            produced by ``DataFrame.itertuples()``).
        summary: Short summary text shown above the collapsible full answer.

    Returns:
        A Markdown string with the topic, therapist name, a PsychologyToday
        search link, the summary, the full answer inside a ``<details>``
        element, and the view / upvote counters.
    """
    def _as_count(value):
        # Missing counters are displayed as zero.
        return 0 if pd.isna(value) else int(value)

    therapist = row.therapistInfo.strip()
    topic = "Unknown" if pd.isna(row.topic) else row.topic.strip()
    therapist_url = build_psychologytoday_url(therapist)
    views = _as_count(row.views)
    upvotes = _as_count(row.upvotes)
    answer = row.answerText.strip()

    # Lines that end in two spaces are Markdown hard line breaks; keep them.
    return f"""
💡 *Topic*: `{topic}`

### 🔷 Answer {idx + 1}
👩‍⚕️ **Therapist**: {therapist}  
🔗 [PsychologyToday Profile]({therapist_url})  
⚠️ _We link to public therapist listings for convenience. We do not verify or endorse them._

#### 📝 Summary:
{summary}

<details>
<summary>📖 Click to view full answer</summary>

{answer}

</details>

👁️ **Views**: {views}
👍 **Upvotes**: {upvotes}
"""

In [None]:
def summarize(text):
    """Return a model-generated summary of ``text``, memoised in ``summary_cache``.

    The cache key is the hash of the *full* text, while only the first 512
    characters are handed to the summarisation pipeline.
    """
    cache_key = hash_text(text)
    if cache_key not in summary_cache:
        # Slicing is a no-op for strings of 512 characters or fewer.
        excerpt = text[:512]
        summary_cache[cache_key] = summarizer(excerpt)[0]["summary_text"]
    return summary_cache[cache_key]


In [None]:
# === Main pipeline ===
def process_query(query: str) -> str:
    """Answer a free-text question with the best-matching therapist answers.

    Pipeline:
        1. Embed the query and fetch the nearest stored questions from the
           FAISS index, keeping only hits whose score reaches the threshold.
        2. Re-rank those candidates with the cross-encoder on
           (query, question + answer) pairs; ties fall back to views, then
           upvotes.
        3. Summarise the top answers concurrently, format each one as a
           Markdown card, and append a summary of the summaries.

    Args:
        query: The user's question. ``None``, an empty string and
            whitespace-only input are all treated as "no query".

    Returns:
        A Markdown string: either the formatted answers, a prompt to enter a
        query, or a "no relevant answers" notice. Successful results are
        memoised in ``query_cache``, keyed by the whitespace-trimmed query.
    """
    candidate_pool = 50   # nearest neighbours requested from the index
    min_similarity = 0.4  # minimum index score for a candidate to be kept
    top_n = 3             # number of answers shown to the user

    # Trim first so that "   " is rejected and " foo " shares a cache entry with "foo".
    query = (query or "").strip()
    if not query:
        return "⚠️ Please enter a query..!!"

    query_key = hash_text(query)
    cached = query_cache.get(query_key)
    if cached is not None:
        return cached

    query_embedding = embed_model.encode([query], convert_to_tensor=True, normalize_embeddings=True)
    scores, ids = index.search(query_embedding.cpu().numpy(), k=candidate_pool)
    # FAISS reports -1 as the id of a missing neighbour (index smaller than k);
    # exclude those explicitly so they can never reach df.iloc.
    filtered_indices = [
        int(i) for i, score in zip(ids[0], scores[0])
        if i >= 0 and score >= min_similarity
    ]

    if not filtered_indices:
        return "❌ No relevant answers found."

    top_df = df.iloc[filtered_indices].copy()
    pairs = [[query, f"{row.combinedQuestion} {row.answerText}"] for row in top_df.itertuples()]
    top_df["rerank_score"] = reranker.predict(pairs)
    top_df = top_df.sort_values(
        by=["rerank_score", "views", "upvotes"],
        ascending=[False, False, False],
    ).head(top_n)

    # Materialise the rows once so summaries and cards are paired from the same list.
    top_rows = list(top_df.itertuples())
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, matching top_rows.
        summaries = list(executor.map(summarize, [row.answerText for row in top_rows]))

    results = [
        format_answer(rank, row, sum_text)
        for rank, (row, sum_text) in enumerate(zip(top_rows, summaries))
    ]

    final_summary = summarizer(" ".join(summaries))[0]["summary_text"]
    full_output = "\n\n---\n\n".join(results) + f"\n\n---\n\n🧠 **Final Summary**:\n{final_summary}"
    query_cache[query_key] = full_output
    return full_output

In [None]:
# Persist the in-memory FAISS index to the same location the loader cell reads from.
faiss.write_index(index, INDEX_PATH)

# Additionally export the corpus embeddings as a pickled NumPy array.
with open("mpnet_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings.cpu().numpy(), f)


In [None]:
# === Launch Gradio App ===
# Keep a handle on the interface so the server can be stopped with
# `demo.close()` before this cell is run again.
demo = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(label="❓ Ask your mental health question here", placeholder="e.g. How can I deal with anxiety about work?", lines=2),
    outputs=gr.Markdown(label="🩺 Top Therapist Answers"),
    title="🧘 CounselChat Q&A Assistant",
    description="Explain in brief about your problems.",
    allow_flagging="never"
)
demo.launch()




* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Your max_length is set to 142, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 142, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 142, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
