## Define the LLM Model

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI

# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

# Fail fast with a readable message when the OpenAI key is missing.
# The previous `os.environ[...] = os.getenv(...)` round-trip raised an opaque
# TypeError in that case and was a no-op whenever the key was present.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# NOTE: despite the variable name, this is an OpenAI chat model (ChatOpenAI),
# not a Groq model. The name is kept so later cells that use it keep working.
chat_groq_model = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0.7,
    max_retries=3,
)

## Data Loader

In [50]:
from langchain_community.document_loaders import TextLoader

# Point a text loader at the State of the Union transcript (read as UTF-8).
loader = TextLoader(file_path='data/state_of_the_union.txt', encoding='utf-8')

# Read the file; later cells index the result as loaded_data[0].page_content.
loaded_data = loader.load()

## Data Split (Using CharacterTextSplitter)

In [51]:
from langchain_text_splitters import CharacterTextSplitter

In [6]:
# Character-based splitter: break on blank lines ("\n\n", treated as a literal
# string rather than a regex), target chunks of up to 1000 units as measured
# by len(), and carry 200 units of overlap between neighbouring chunks.
text_spliter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n\n",
    is_separator_regex=False,
    length_function=len,
)

In [21]:
# create_documents() expects a *list* of texts. Passing the bare string made it
# iterate character by character, producing one document per character.
docs = text_spliter.create_documents([loaded_data[0].page_content])

## Data Split (Using RecursiveCharacterTextSplitter)

In [52]:
from langchain_text_splitters import RecursiveCharacterTextSplitter 

In [53]:
# Recursive splitter with 26-unit chunks and 4 units of overlap.
# NOTE(review): chunks this small yield very short fragments (see the retrieval
# output at the end of the notebook) — confirm this toy size is intended.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_overlap=4, chunk_size=26)

In [54]:
# Split the raw text of the first loaded document into chunks; the result is
# what the embedding and vector-store cells below consume.
re_docs = recursive_splitter.split_text(text=loaded_data[0].page_content)

## Embedding

In [56]:
from langchain_openai import OpenAIEmbeddings

In [57]:
# OpenAI embeddings client built with the library's default settings.
embedding_model = OpenAIEmbeddings()

In [58]:
# Example Embedding
embedding_Data = await embedding_model.aembed_documents(re_docs)

# Store Embedding Vectors in a Vector Database

### Chroma DB Example

In [65]:
from langchain_chroma import Chroma

In [67]:
# Build a Chroma vector store from the text chunks. Reuse the embedding client
# created in the Embedding section instead of constructing a second one.
vector_db = Chroma.from_texts(texts=re_docs, embedding=embedding_model)

In [68]:
# Display the store object to confirm it was created.
vector_db

<langchain_chroma.vectorstores.Chroma at 0x23475ccc310>

### FAISS vector DB Example

In [69]:
from langchain_community.vectorstores import FAISS

In [71]:
# Build a FAISS index from the same text chunks. Reuse the embedding client
# created in the Embedding section instead of constructing a second one.
faiss_vector = FAISS.from_texts(texts=re_docs, embedding=embedding_model)

In [None]:
# Persist the FAISS index to disk: folder "data", index name "data".
faiss_vector.save_local(index_name="data", folder_path="data")

# Retrieval Using Semantic Search

In [74]:
# Wrap the Chroma store as a retriever, passing k=3 as its search option.
retriever = vector_db.as_retriever(search_kwargs={"k": 3})

In [75]:
# Run a sample query against the retriever and display the matches.
# NOTE(review): the query spells the name "kentanji", while the stored text
# reads "Ketanji" — left unchanged so the recorded output stays valid; confirm.
retriever.invoke("What did he say about kentanji brown jackson?")

[Document(metadata={}, page_content='Judge Ketanji Brown'),
 Document(metadata={}, page_content='said in his speech to the'),
 Document(metadata={}, page_content='As Ohio Senator Sherrod')]