In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS 
from langchain.docstore.document import Document

import logging
import os
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import kagglehub
import faiss
import numpy as np
from langchain.docstore import InMemoryDocstore





def create_faiss_db(
    documents,
    chunk_size=100,
    chunk_overlap=10,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    storage_dir=None,
):
    """Chunk ``documents``, embed the chunks, and persist a FAISS vector store.

    Args:
        documents: Documents to index; handed to the splitter's
            ``split_documents``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.
        model_name: Hugging Face model used to embed the chunks (run on CPU).
        storage_dir: Folder the store is saved into. When ``None`` the store
            goes to ``storage`` under the current working directory.

    Returns:
        The FAISS vector store built from the chunks.

    Raises:
        ValueError: If splitting produces no chunks, i.e. there is nothing
            to index.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = splitter.split_documents(documents)
    if not split_docs:
        # Fail fast with a clear message before loading the embedding model.
        raise ValueError("No document chunks to index: 'documents' produced no content")

    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
    logger.info("Building FAISS index from %d chunks", len(split_docs))
    vector_store = FAISS.from_documents(split_docs, embeddings)

    if storage_dir is None:
        target_dir = Path(os.getcwd()) / 'storage'
    else:
        target_dir = Path(storage_dir)
    vector_store.save_local(target_dir)
    logger.info("FAISS vector store created and saved")
    return vector_store


In [5]:

# Load environment variables from a .env file in the current working directory
route = Path.cwd() / ".env"

# is_file() rather than exists(): a directory named ".env" is not a usable dotenv file.
if route.is_file():
    load_dotenv(dotenv_path=route)
    logger.info(f".env file loaded from {route}")
else:
    # No Colab-secrets lookup is performed in this cell; without the file the
    # only remaining source is whatever is already in the process environment.
    logger.warning(f".env file not found at {route}. Falling back to existing environment variables.")

# Get API key from the process environment
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Surface the missing key here instead of leaving api_key silently None.
    logger.warning("GOOGLE_API_KEY is not set in the environment.")

INFO:__main__:.env file loaded from /Users/ayomideoraegbu/WORK/GreenLife-Mental-Health-Chatbot/backend/.env


In [3]:

# Load existing datasets

# Dataset 1: CSV export of a published Google Sheets document.
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRGbYKgw5x2xteCHSjLhkY5FHTPjtVnvBkN_5m2p6clfdUJK77CLDRSdq5RbPygNygaTFsK3xghrfi4/pub?output=csv'
try:
    df1 = pd.read_csv(url)
    logger.info("Google Sheets dataset loaded")
except Exception as exc:
    logger.error(f"Failed to load Google Sheets dataset: {exc}")
    raise

# Dataset 2: Kaggle dataset fetched through kagglehub.
# Alternative location noted by the author: "/kaggle/input/social-media-usage-and-emotional-well-being"
try:
    # The download is inside the try so a failed fetch is logged the same way
    # as a failed read.
    path_ = kagglehub.dataset_download("emirhanai/social-media-usage-and-emotional-well-being")
    df2 = pd.read_csv(Path(path_) / "train.csv")
    logger.info("Kaggle dataset loaded")
except Exception as exc:
    logger.error(f"Failed to load Kaggle dataset: {exc}")
    raise

# Convert rows to documents
def create_documents(df, content_column=None):
    """Turn each row of ``df`` into a ``Document``.

    Args:
        df: DataFrame whose rows become documents.
        content_column: Column whose value becomes the page content. When
            omitted (or falsy), all of the row's values are joined with
            single spaces.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    documents = []
    for _, row in df.iterrows():
        if content_column:
            content = str(row[content_column])
        else:
            # Separate the values so adjacent fields stay distinguishable;
            # joining with "" would fuse e.g. 25 and "calm" into "25calm".
            content = " ".join(str(value) for value in row)
        documents.append(Document(page_content=content))
    return documents

# Build one document list per dataset, then concatenate them in dataset order.
# A third dataset (df3, with content_column="Comments") is currently disabled.
docs1, docs2 = (create_documents(frame) for frame in (df1, df2))
all_docs = [*docs1, *docs2]
logger.info(f"Created {len(all_docs)} documents from datasets")

INFO:__main__:Google Sheets dataset loaded
INFO:__main__:Kaggle dataset loaded
INFO:__main__:Created 1270 documents from datasets


In [4]:
# Chunk, embed and save every document created above; keep the resulting store.
vector_store = create_faiss_db(documents=all_docs)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


before faiss from dcoument 


INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
INFO:faiss:Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.
INFO:__main__:FAISS vector store created and saved


In [6]:
# CPU-bound sentence-transformers model used to embed the documents below.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})

documents = all_docs

# Embed every document exactly once, in a single batched call. The
# (document, vector) pairs reuse those vectors instead of running the model a
# second time per document.
doc_texts = [doc.page_content for doc in documents]
embeddings = embedding_model.embed_documents(doc_texts)
embedded_docs = list(zip(documents, embeddings))

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [11]:
# Initialize the FAISS index
dimension = len(embeddings[0])  # Assuming all embeddings have the same length
index = faiss.IndexFlatL2(dimension)  # Using L2 (Euclidean) distance
# FAISS expects float32 input, while np.array() on Python floats yields
# float64, so request the dtype explicitly.
index.add(np.array(embeddings, dtype=np.float32))



<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x14cf35200> >

In [13]:
# Wrap the prebuilt index in a FAISS vector store backed by an in-memory docstore.
# Each document is keyed by its position in `documents`, and index row i maps
# to docstore id i.
# NOTE(review): the ids are ints here; confirm this matches what the docstore
# and any downstream id handling expect.
docstore = InMemoryDocstore({position: doc for position, doc in enumerate(documents)})
vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=docstore,
    index_to_docstore_id={position: position for position in range(len(documents))},
)

In [15]:
# Persist the store to the "storage" folder under the current working directory.
# NOTE(review): create_faiss_db saves to this same folder, so this call
# presumably replaces that earlier save; confirm that is intended.
storage_dir = Path(os.getcwd()) / 'storage'
vector_store.save_local(storage_dir)