In [None]:
# Notebook outline:
# 1. LangSmith tracing (observability) for LLM calls
# 2. Embedding models, and how vector similarity works
# 3. A RAG pipeline over a large HTML document using LangChain

In [None]:
!pip install langchain langchain-openai

In [None]:
# Observability
import os
from langchain_openai import AzureChatOpenAI
# Azure OpenAI connection settings; the values below are placeholders to replace.
os.environ.update(
    {
        "OPENAI_API_VERSION": "api-version",
        "AZURE_OPENAI_ENDPOINT": "endpoint",
        "AZURE_OPENAI_API_KEY": "api-key",
    }
)

# LangSmith tracing (observability) settings; values are placeholders as well.
os.environ.update(
    {
        "LANGSMITH_TRACING": "true",
        "LANGSMITH_ENDPOINT": "https://api.smith.langchain.com",
        "LANGSMITH_API_KEY": "langsmith-key",
        "LANGSMITH_PROJECT": "project-name",
    }
)

In [None]:
# Chat model client bound to the "gpt-4.1" deployment.
llm = AzureChatOpenAI(deployment_name="gpt-4.1")

# Smoke test: send a trivial prompt and display the reply.
llm.invoke("hello")

In [None]:
from langchain_openai import AzureOpenAIEmbeddings

# Embedding client used for both the similarity demo and the vector store.
embeddings = AzureOpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Embed a single sentence to see what the embedding client returns.
vector = embeddings.embed_query('ai agents are llms with superpower')

# Number of elements in the returned embedding.
len(vector)

In [None]:
# Sample sentences for the similarity demo: items 0 and 2 both describe a
# central bank raising rates to fight inflation; items 1 and 3 are about cars.
texts = [
    "The central bank increased interest rates to combat inflation.",
    "A sports car with a V8 engine accelerates very quickly.",
    "The Reserve Bank announced a hike in repo rate to curb inflation.",
    "Electric cars are growing in popularity due to lower emissions.",
]

In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """Compute the cosine of the angle between two 1-D numpy vectors.

    The product of the two L2 norms is padded with 1e-12, so an all-zero
    vector yields 0.0 instead of a division by zero.
    """
    dot_product = np.dot(vec_a, vec_b)
    magnitude_product = norm(vec_a) * norm(vec_b)
    return float(dot_product / (magnitude_product + 1e-12))

def euclidean_distance(vec_a, vec_b) -> float:
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vec_a: First vector; a numpy array or any array-like (list, tuple).
        vec_b: Second vector, with the same shape as ``vec_a``.

    Returns:
        The distance as a built-in ``float``, matching ``cosine_similarity``.
    """
    # Coerce first: subtracting two plain Python lists raises TypeError.
    difference = np.asarray(vec_a) - np.asarray(vec_b)
    return float(np.linalg.norm(difference))



In [None]:
# Embed every sample sentence (one embed_query call each) as a numpy array.
vectors = [
    np.array(embeddings.embed_query(sentence))
    for sentence in texts
]

In [None]:
# Length of the first sentence's embedding vector.
len(vectors[0])

In [None]:
# Cosine similarity of each sentence to sentence 0 (row 0 is the self-comparison).
for index, candidate in enumerate(vectors):
    print(index, cosine_similarity(vectors[0], candidate))

In [None]:
# Euclidean distance of each sentence from sentence 0 (row 0 is the self-comparison).
for index, candidate in enumerate(vectors):
    print(index, euclidean_distance(vectors[0], candidate))

In [None]:
# Data Loading Phase
# Document Loaders
# Chunking / Splitting
# Embeddings
# Vector DB

# User Querying Phase
# User Query
# Embedding
# Similarity Search
# User Query + Similar Chunks
# LLM

In [None]:
! pip install -Uq langchain_community unstructured

In [None]:
import requests
import tempfile
# from langchain.document_loaders import UnstructuredHTMLLoader
# URL of a DEF 14A document on SEC EDGAR (presumably Amazon's, going by the
# variable name — TODO confirm). This is the HTML the RAG pipeline indexes.
amzn_def14a_doc = "https://www.sec.gov/Archives/edgar/data/1018724/000110465925033442/tm252295-1_def14a.htm"

In [None]:
from langchain_community.document_loaders import UnstructuredHTMLLoader

In [None]:
# Download the document to a local temp file so the HTML loader can read it
# from disk. An identifying User-Agent header is sent with the request.
headers = {
    "User-Agent": "RushikeshAnalytics/2.0 (rushikesh@gmail.com)"
}

# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(amzn_def14a_doc, headers=headers, timeout=30)

# Stop here on failure: merely printing would leave `local_html_path`
# undefined and surface later as an unrelated NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch: {response.status_code}")

# delete=False keeps the file on disk after the `with` block closes it,
# so later cells can open it by path.
with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
    tmp_file.write(response.content)
    local_html_path = tmp_file.name

In [None]:
# Show the path of the temp file the HTML was saved to.
local_html_path

In [None]:
# Load the downloaded HTML file from disk into `documents`.
loader = UnstructuredHTMLLoader(local_html_path)
documents = loader.load()

In [None]:
# Preview only the beginning of the parsed text; printing the whole of a large
# document floods the cell output and bloats the saved notebook.
print(documents[0].page_content[:2000])

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded document into chunks (chunk_size=1000, chunk_overlap=100).
# NOTE: `texts` is rebound here, from the sample sentences used in the
# similarity demo to the list of document chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Show a small sample rather than dumping every chunk into the cell output.
texts[:3]

In [None]:
# Inspect the first chunk produced by the splitter.
texts[0]

In [None]:
# Inspect the second chunk produced by the splitter.
texts[1]

In [None]:
# Inspect the third chunk produced by the splitter.
texts[2]

In [None]:
# Total number of chunks produced by the splitter.
len(texts)

In [None]:
! pip install -qU langchain-chroma

In [None]:
from langchain_chroma import Chroma
# Build the vector index once and reuse it afterwards. Calling
# Chroma.from_documents again on the same persist directory would add every
# chunk a second time (duplicate search hits) and re-embed the whole document.
CHROMA_DIR = "./chroma_finance3"

if os.path.isdir(CHROMA_DIR):
    # NOTE(review): assumes an existing directory holds a complete index built
    # with the same embedding model; delete the directory to force a rebuild.
    vector_store = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
else:
    vector_store = Chroma.from_documents(
        texts,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )

# Data Loading Part : END

In [None]:
# List the contents of the directory passed as Chroma's persist_directory.
! ls chroma_finance3

In [None]:
# Retriever over the vector store, configured with k=5 results per query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
)

In [None]:
# Example question used for the retrieval and LLM cells below.
user_query = 'what is the ceo pay ratio'



def get_rag_answers(user_query):
  """Answer ``user_query`` from chunks retrieved out of the vector store.

  The retriever is asked for the chunks matching the query, their text is
  joined into a single context string, and the LLM is invoked with a prompt
  containing both the query and that context.

  Returns whatever ``llm.invoke`` returns for the assembled prompt.
  """
  # Retrieval: the retriever embeds the query, matches it against the chunks
  # stored in the vector DB, and hands back the top-k matches.
  matched_chunks = retriever.invoke(user_query)

  # Flatten the matched chunks into one space-separated context string.
  context = ' '.join(doc.page_content for doc in matched_chunks)
  prompt = f"""
  You are a AI assitant, below is the user query and context, answer the user query based on the context only
  user query : {user_query}
  context: {context}
  """
  return llm.invoke(prompt)

In [None]:
# Step-by-step version of get_rag_answers. `chunks` only exists as a local
# variable inside that function, so retrieve the matches here first;
# otherwise this cell raises NameError on a fresh kernel.
chunks = retriever.invoke(user_query)

# Merge the retrieved chunks into a single space-separated context string.
context = ' '.join([chunk.page_content for chunk in chunks])

In [None]:
# Assemble the prompt: the instruction, then the user's question, then the
# retrieved context.
prompt = f"""
You are a AI assitant, below is the user query and context, answer the user query based on the context only
user query : {user_query}
context: {context}
"""

In [None]:
# Send the assembled prompt to the chat model and display its reply.
response = llm.invoke(prompt)
response