In [9]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import joblib
import os

In [10]:
# Optional import of sentence-transformers with compatibility message
try:
    from sentence_transformers import SentenceTransformer
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except ImportError:
    embedding_model = None
    print("⚠️ sentence-transformers not installed or incompatible. Please install with:")
    print("pip install -U sentence-transformers huggingface-hub transformers")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NISCHAY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NISCHAY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NISCHAY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Load dataset
df = pd.read_csv(r"D:\chatbot\20200325_counsel_chat.csv", encoding="utf-8")

In [12]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

df['questionText'] = df['questionText'].apply(clean_text)
df['answerText'] = df['answerText'].apply(clean_text)

In [13]:
# Encode labels
X = df['questionText']
y = df['topic']
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, 'label_encoder_toc.joblib')

['label_encoder_toc.joblib']

In [21]:
if embedding_model:
    print("Encoding questions and answers...")
    question_embeddings = embedding_model.encode(X.tolist(), show_progress_bar=True)
    answer_embeddings = embedding_model.encode(df['answerText'].tolist(), show_progress_bar=True)

    # Save embeddings
    os.makedirs("model", exist_ok=True)
    joblib.dump(question_embeddings, 'model/toc_question_embeddings.dump')
    joblib.dump(answer_embeddings, 'model/toc_answer_embeddings.dump')

    # Save topic-answer map
    topic_answer_map = {}
    for topic, answer in zip(df['topic'], df['answerText']):
        topic_answer_map.setdefault(topic, []).append(answer)
    with open("model/toc_topic_answers.pkl", "wb") as f:
        pickle.dump(topic_answer_map, f)

    # Sample retrieval function
    def retrieve_similar_question(user_query):
        print(f"\nUser Query: {user_query}")
        query_embedding = embedding_model.encode([clean_text(user_query)])
        sims = cosine_similarity(query_embedding, question_embeddings)[0]
        top_idx = np.argmax(sims)
        print("\nMost Similar Question:")
        print(f"→ {df.iloc[top_idx]['questionText']}\n→ {df.iloc[top_idx]['answerText']}")

    # Example usage
    retrieve_similar_question("How do I handle depression and anxiety?")
else:
    print("SentenceTransformer model is unavailable. Skipping embedding and retrieval.")


Encoding questions and answers...


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Batches:   0%|          | 0/67 [00:00<?, ?it/s]


User Query: How do I handle depression and anxiety?

Most Similar Question:
→ dealing depression anxiety number year medication lately depression felt worse counseling help
→ thank asking important question find three step getting ready treatment step one expressing interest wanting receiving treatment outcome positive behavioral change congratulation first step showing readiness start counseling asking question second step find counselor specializes treating client anxiety depression therapeutic orientation found helpful treating client anxiety depression combination cognitive behavioral therapy mindfulness solution focused brief therapy receiving meditation symptom part treatment part receiving counseling increase resilience future event research found medication psychotherapy treatment together show effective outcome depression third step increase positive selftalk motivate attend treatment counselor aware anxiety fear associated talking new professional first time however remind i