# Prep Documents for Generative AI 📃

Document prep for generative AI generally requires three steps:

1. Chunking: Splitting the documents into smaller chunks

2. Vectorizing: Sending each chunk to the embeddings model to convert it into a vector

3. Upserting: Placing the vectors in Azure Search for retrieval

## Import Libraries

In [None]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
# Load settings from a local .env file (if present) into the process environment,
# so the os.getenv() lookups for the Azure endpoints and keys below can find them.
load_dotenv()

## Create Azure OpenAI 🤖 & Azure Search Instances 🔎

In [None]:
# --- Azure OpenAI embeddings client -----------------------------------------
# Endpoint and key come from the environment; the deployment is named "embeddings".
openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment="embeddings",
    azure_endpoint=openai_endpoint,
    api_key=openai_api_key,
)

# --- Azure Search vector store ----------------------------------------------
# Targets the "products" index and uses the client's embed_query as its
# embedding function.
index_name: str = "products"
search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
search_key = os.getenv("AZURE_SEARCH_KEY")
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

## Loop through each file, chunk, and upsert into Azure Search

In [None]:
# Build the folder path with os.path.join instead of the literal '..\sample-docs':
# "\s" is an invalid escape sequence (current Python versions warn about it), and
# a hard-coded backslash only acts as a path separator on Windows.
docs_dir = os.path.join(os.pardir, "sample-docs")

# The splitter settings are the same for every file, so create it once.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=10)

for filename in os.listdir(docs_dir):
    if not filename.endswith(".txt"):  # Adjust the file extension as needed
        continue
    file_path = os.path.join(docs_dir, filename)

    # Load the document
    loader = TextLoader(file_path, encoding="utf-8")
    document = loader.load()

    # Split the document into chunks
    docs = text_splitter.split_documents(document)

    # Upsert the chunks; skip files that produced no chunks so that no empty
    # batch is sent to the index.
    if docs:
        vector_store.add_documents(documents=docs)

## Question #1

In [None]:
# Retrieve the three closest chunks for the question and print the first one.
question = "what smart phones do you sell?"
docs = vector_store.similarity_search(query=question, k=3, search_type="similarity")
top_match = docs[0]
print(top_match.page_content)

## Question #2

In [None]:
# Retrieve the three closest chunks for the question and print the first one.
question = "how much is the NexTech phone?"
docs = vector_store.similarity_search(query=question, k=3, search_type="similarity")
top_match = docs[0]
print(top_match.page_content)

## Question #3

In [None]:
# Retrieve the three closest chunks for the question and print the first one.
question = "what laptops do you have?"
docs = vector_store.similarity_search(query=question, k=3, search_type="similarity")
top_match = docs[0]
print(top_match.page_content)