## Implementation

In [1]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import numpy as np
from utils.embedding_loader import CustomOpenAIEmbeddings

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GoogleGeminiEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings


## Loading the .env file

In [2]:
from dotenv import load_dotenv

# Location of the .env file. Set the DOTENV_PATH environment variable to point
# at a different file on other machines; when it is unset the original
# machine-specific path is used, so existing behaviour is unchanged.
load_dotenv(
    dotenv_path=os.environ.get("DOTENV_PATH", "C:/Users/ADMIN/Documents/venv/.env")
)

True

In [3]:

# Custom embedding classes
class CustomOpenAIEmbeddings(OpenAIEmbeddings):
    """OpenAI embeddings whose document vectors are cut down to a fixed length.

    Note: this definition rebinds the ``CustomOpenAIEmbeddings`` name that the
    import cell already brought in from ``utils.embedding_loader``.
    """

    def embed_documents(self, texts, vector_size: int = 1024):
        """Embed ``texts`` via the parent class and keep a prefix of each vector.

        Args:
            texts: Passed unchanged to ``OpenAIEmbeddings.embed_documents``.
            vector_size: Number of leading components to keep from each
                embedding (default 1024).

        Returns:
            A list with one sliced embedding per embedding the parent returned.
        """
        full_vectors = super().embed_documents(texts)

        # Plain slicing: a vector shorter than ``vector_size`` is kept whole,
        # and no re-normalisation is applied after the cut.
        return [vector[:vector_size] for vector in full_vectors]


In [6]:
def embedding_model(model_type: str = "OPENAI"):
    """Build the embedding backend selected by ``model_type``.

    Args:
        model_type: ``"OPENAI"`` (default) or ``"GEMINI"``. The comparison is
            exact and case-sensitive; any other value falls back to the
            HuggingFace sentence-transformers model.

    Returns:
        The constructed embeddings object for the chosen backend.
    """
    if model_type == "OPENAI":
        # Subclass defined in this notebook that truncates document vectors.
        return CustomOpenAIEmbeddings(
            api_key=os.environ.get("OPENAI_API_KEY"),
            model="text-embedding-3-small",
        )

    if model_type == "GEMINI":
        # NOTE(review): confirm that `GoogleGeminiEmbeddings` is actually
        # exported by langchain_community.embeddings and that
        # "gemini-1.5-flash" is an embedding model (it looks like a chat
        # model name). Left unchanged because fixing it needs a new import.
        return GoogleGeminiEmbeddings(
            api_key=os.environ.get("GEMINI_API_KEY"),
            model="gemini-1.5-flash",
        )

    # Fallback for every other value. HuggingFaceEmbeddings identifies the
    # model via `model_name` and has no `api_key` field, so the previous
    # `api_key=` / `model=` keywords are not passed any more.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )



In [7]:
def load_and_create_chroma_db():
    """Ensure the local DB folder exists and open the ``openai_db`` Chroma collection.

    The collection is persisted under ``./db/chroma_db`` and embeds with the
    default backend returned by ``embedding_model()``.

    NOTE(review): the folder created here (``db/faiss_db_openai``) is not the
    directory Chroma persists to (``./db/chroma_db``) — confirm which one is
    intended.
    """
    db_directory = "db/faiss_db_openai"
    # exist_ok=True replaces the separate os.path.exists check and avoids the
    # check-then-create race.
    os.makedirs(db_directory, exist_ok=True)
    vector_store = Chroma(
        collection_name="openai_db",
        # Chroma needs an embeddings instance, so call the factory instead of
        # passing the function object itself.
        embedding_function=embedding_model(),
        persist_directory="./db/chroma_db",
    )