In [2]:
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer ## using this model to convert text into vector
import faiss
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
warnings.filterwarnings("ignore")
import re

In [3]:
import torch
## Report the installed PyTorch version and whether a CUDA device is usable
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.5.1
True


In [4]:
from datasets import load_dataset

## Load the merged mental-health dataset through the `datasets` loader;
## its 'train' split is converted to a DataFrame in a later cell.
ds = load_dataset("mshojaei77/merged_mental_health_dataset")

## Load the dataset and get a general understanding of it

In [5]:
## Read the local CSV export of the first conversation dataset.
## The path is a raw string so the Windows backslashes stay literal instead of
## relying on "\M", "\D" and "\d" being unrecognised escape sequences
## (which raises a SyntaxWarning on Python 3.12+).
data = pd.read_csv(r"E:\Mental health\Data\data.csv")
print("Data loaded successfully")

Data loaded successfully


In [6]:
## Map the raw instruction/input/response headers onto context/question/answer
data = data.rename(
    columns={
        "instruction": "context",
        "input": "question",
        "response": "answer",
    }
)

In [7]:
## Preview the first five rows to confirm the renamed columns
data.head()

Unnamed: 0,context,question,answer
0,You are an empathetic and supportive AI chatbo...,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,You are an empathetic and supportive AI chatbo...,"I recently got a promotion at work, which I th...",I can understand how it can be overwhelming wh...
2,You are an empathetic and supportive AI chatbo...,"Well, the workload has increased significantly...",It sounds like you're dealing with a lot of pr...
3,You are an empathetic and supportive AI chatbo...,I've been trying to prioritize my tasks and de...,It's great to hear that you're already impleme...
4,You are an empathetic and supportive AI chatbo...,You're right. I haven't really opened up about...,"It's completely normal to feel that way, but r..."


In [8]:
## Summarise the frame: banner, (rows, columns), null count per column, blank line
print("DATA", "-----------------", data.shape, data.isnull().sum(), "", sep="\n")

DATA
-----------------
(797947, 3)
context     0
question    0
answer      0
dtype: int64



In [9]:
## The context column is not needed and could cause problems during training, so drop it
data = data.drop(columns=["context"])

In [10]:
## Preview the frame now that only question/answer remain
data.head()

Unnamed: 0,question,answer
0,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,"I recently got a promotion at work, which I th...",I can understand how it can be overwhelming wh...
2,"Well, the workload has increased significantly...",It sounds like you're dealing with a lot of pr...
3,I've been trying to prioritize my tasks and de...,It's great to hear that you're already impleme...
4,You're right. I haven't really opened up about...,"It's completely normal to feel that way, but r..."


In [11]:
## Report how many fully duplicated rows exist, then remove them.
## The count is printed explicitly because a notebook only displays a cell's
## final expression, and the final statement here is the assignment.
print(f"Duplicate rows in data: {data.duplicated().sum()}")
data = data.drop_duplicates()

In [12]:
## Materialise the 'train' split of the loaded dataset as a pandas DataFrame
data2 = pd.DataFrame(ds['train'])

In [13]:
## Align the second dataset's headers with the question/answer naming used by `data`
data2 = data2.rename(
    columns={
        "Context": "question",
        "Response": "answer",
    }
)

In [14]:
## Preview the first five rows of the second dataset after renaming
data2.head()

Unnamed: 0,question,answer
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [15]:
## Report how many fully duplicated rows exist, then remove them.
## The count is printed explicitly because a notebook only displays a cell's
## final expression, and the final statement here is the assignment.
print(f"Duplicate rows in data2: {data2.duplicated().sum()}")
data2 = data2.drop_duplicates()

In [16]:
## Keep only the rows where both the question and the answer are present
data2 = data2.dropna(subset=["question", "answer"])

In [17]:
## Summarise the frame: banner, (rows, columns), null count per column, blank line
print("DATA2", "-----------------", data2.shape, data2.isnull().sum(), "", sep="\n")

DATA2
-----------------
(838857, 2)
question    0
answer      0
dtype: int64



In [18]:
## Stack the two question/answer frames under a fresh 0..n-1 index,
## then show the combined size and a preview
data = pd.concat(objs=[data, data2], ignore_index=True)
print(data.shape, data.head(), sep="\n")

(1636801, 2)
                                            question  \
0  I've been feeling so sad and overwhelmed latel...   
1  I recently got a promotion at work, which I th...   
2  Well, the workload has increased significantly...   
3  I've been trying to prioritize my tasks and de...   
4  You're right. I haven't really opened up about...   

                                              answer  
0  Hey there, I'm here to listen and support you....  
1  I can understand how it can be overwhelming wh...  
2  It sounds like you're dealing with a lot of pr...  
3  It's great to hear that you're already impleme...  
4  It's completely normal to feel that way, but r...  


In [19]:
## Rich-display preview of the combined frame
data.head()

Unnamed: 0,question,answer
0,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,"I recently got a promotion at work, which I th...",I can understand how it can be overwhelming wh...
2,"Well, the workload has increased significantly...",It sounds like you're dealing with a lot of pr...
3,I've been trying to prioritize my tasks and de...,It's great to hear that you're already impleme...
4,You're right. I haven't really opened up about...,"It's completely normal to feel that way, but r..."


In [20]:
## Shuffle all rows with a fixed seed so the order is reproducible,
## then renumber the index and show the first ten rows
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
data.head(10)

Unnamed: 0,question,answer
0,I think both of us have certain expectations a...,It sounds like you're juggling multiple priori...
1,I think my first goal will be to join a local ...,"That's a fantastic goal, ! Making new connecti..."
2,"I haven't thought about that, but it's worth a...",Taking the initiative can make a big differenc...
3,"Yes,, there is a pattern. Whenever I start fee...",It's insightful that you've identified this pa...
4,"I'm feeling really distracted lately, Alex. It...","I'm so sorry to hear that, Charlie. It sounds ..."
5,I'm grateful for your guidance. It's comforti...,"You're welcome. Remember, you're never alone ..."
6,"Yes, Alex, I have. I've noticed that when we d...","That's wonderful to hear, Charlie. It's amazin..."
7,"I haven't thought about it, to be honest. I su...",A professional mediator might indeed be benefi...
8,"Absolutely, Alex! So, I've come to understand ...",It's remarkable that you've been able to ident...
9,I'm having trouble sleeping at night. What can...,I'm having trouble sleeping at night. What can...


## Data preprocessing and cleaning

In [21]:
## Only light cleaning is done here because a sentence-transformer model will convert the text into vectors

def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            NaN or None) is treated as missing.

    Returns:
        The normalised string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # r'\s+' already matches '\n' and '\r', so no separate newline
    # replacement is needed after the substitution.
    return re.sub(r'\s+', ' ', text).strip()

print("Processing the data")
## Run clean_text over each of the two text columns
for text_column in ("question", "answer"):
    data[text_column] = data[text_column].apply(clean_text)

print("Cleaning completed")

print("Dropping the rows with empty question or answer and remove NUll Values")
## Discard rows where either field is null or blank, then renumber the index
data = data.dropna(subset=["question", "answer"])
data = data[data["question"].str.strip().ne("") & data["answer"].str.strip().ne("")]
data = data.reset_index(drop=True)

print("Processing complete")

Processing the data
Cleaning completed
Dropping the rows with empty question or answer and remove NUll Values
Processing complete


## Load The Transformer Model 

In [None]:
## Load the model
model = SentenceTransformer('BAAI/bge-large-en', device='cuda')

## Embed every question.
## normalize_embeddings=True asks for unit-length vectors, so the inner-product
## search used on these embeddings ranks results by cosine similarity rather
## than being skewed by vector magnitude.
print("Converting the question column to list for processing")
questions = data['question'].tolist()
embeddings = model.encode(
    questions,
    show_progress_bar=True,
    batch_size=64,
    normalize_embeddings=True,
)
print("Embedding completed")

Converting the question column to list for processing


Batches:   0%|          | 0/25575 [00:00<?, ?it/s]

## Let's use FAISS (Facebook AI Similarity Search)

In [12]:
## Build an exact inner-product index over the question embeddings.
## FAISS returns row positions, and the retrieval step resolves them with
## data.iloc, so the embeddings must line up one-to-one with the rows of `data`.
vectors = np.ascontiguousarray(embeddings, dtype="float32")
assert vectors.shape[0] == len(data), (
    f"{vectors.shape[0]} embeddings but {len(data)} rows in data: "
    "re-run the embedding cell so index positions match data.iloc"
)

dimension = vectors.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(vectors)
print(index.ntotal)

797944


In [13]:
## Write the in-memory index to the file "faiss_index.idx" in the working directory
faiss.write_index(index, "faiss_index.idx")

## Perform retrieval for a new query

In [14]:
user = "I am getting bad dreams"
## The query is passed to encode() as a one-element list and cast to float32 for the index search
query_embedding = model.encode([user]).astype("float32")

print("Performing retrieval")

k = 5
distances, indices = index.search(query_embedding, k)

print("retrieval completed")        
## Show the results
for idx in indices[0]:
    # FAISS pads the result with -1 when fewer than k vectors are found;
    # data.iloc[-1] would silently print the last row instead of a real match.
    if idx < 0:
        continue

    # Look the row up once and read both fields from it.
    matched_row = data.iloc[idx]
    matched_question = matched_row['question']
    matched_answer = matched_row['answer']
    
    print(f"Question: {matched_question}")
    print(f"Answer: {matched_answer}")

Performing retrieval
retrieval completed
Question: I'm scared. My dreams seem to be slipping away.
Answer: I can sense the sadness in your words. Addiction can have a powerful hold on our lives, making it difficult to chase our dreams. Can you tell me more about what you're going through?
Question: Well, my dreams have become really intense and disturbing. They are filled with dark imagery, and I always wake up feeling scared and unsettled. I think they are a reflection of the constant worry I have during the day.
Answer: It sounds like your dreams are amplifying your worries and adding to your anxiety. Anxiety can manifest in our dreams, often as a way for our subconscious mind to process our fears. Have you noticed any specific themes or patterns in these dreams?
Question: I'm scared. It feels like my dreams are slipping away.
Answer: I'm here for you. Can you tell me more about why you're feeling scared and how your dreams are being affected?
Question: I'm worried. My dreams lately 

In [15]:
## Disabled Hugging Face login: the code is held in a string literal so it does
## not execute; the cell only echoes the text.
'''from huggingface_hub import login
login()'''

'from huggingface_hub import login\nlogin()'

In [16]:
## Persist the cleaned question/answer frame as CSV, omitting the index column
data.to_csv("cleaned_data.csv", index=False)