In [None]:
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import os

# --- 1. Load the Data ---
print("Loading conversation data...")
# Read the raw CSV and keep only the rows where both the 'Context' and the
# 'Response' columns are present.
df = pd.read_csv('../data/raw/mental_health_conversations.csv').dropna(
    subset=['Context', 'Response']
)
print(f"Loaded {len(df)} conversations.")

# --- 2. Prepare Documents for LangChain ---
# Each conversation becomes one LangChain "Document":
#   * page_content -> the counselor's 'Response' (the text that gets
#     embedded and searched)
#   * metadata     -> the patient's 'Context' plus the response's length in
#     characters, kept so they can be retrieved alongside a match
print("Preparing documents for vector store...")
# Walk the two columns in lockstep instead of df.iterrows(): iterrows()
# constructs a full Series for every row just to read two fields from it.
documents = [
    Document(
        page_content=response,
        metadata={
            'patient_context': context,
            'response_length': len(response),
        },
    )
    for context, response in zip(df['Context'], df['Response'])
]

print(f"Created {len(documents)} documents.")

# --- 3. Create Embeddings and FAISS Vector Store ---
print("Initializing embedding model (this may take a moment)...")
# Per the original note, this is the same model the Streamlit app uses;
# NOTE(review): keep the two in sync — confirm against the app's config.
embedder_options = dict(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},  # run the embedding model on CPU
)
embedding_model = HuggingFaceEmbeddings(**embedder_options)

print("Creating FAISS vector store from documents...")
# Embeds every document with the model above and builds the FAISS index
# from the resulting vectors.
vector_store = FAISS.from_documents(documents, embedding_model)

# --- 4. Save the Vector Store ---
output_path = "../data/faiss_index"
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, replacing the separate exists() check, which left a window for
# the directory to appear between the check and the makedirs() call.
os.makedirs(output_path, exist_ok=True)

print(f"Saving FAISS index to {output_path}...")
vector_store.save_local(output_path)

print("\n🎉 SUCCESS! Your FAISS vector store is ready.")
print("You can now run your Streamlit application.")

Loading conversation data...
Loaded 3508 conversations.
Preparing documents for vector store...
Created 3508 documents.
Initializing embedding model (this may take a moment)...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating FAISS vector store from documents...


  return forward_call(*args, **kwargs)


Saving FAISS index to ../data/faiss_index...

🎉 SUCCESS! Your FAISS vector store is ready.
You can now run your Streamlit application.
