In [1]:
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader , PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate , PromptTemplate
from langchain_classic.chains import create_history_aware_retriever , create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain.messages import HumanMessage , AIMessage 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


load_dotenv()


os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [2]:
###DATA INGESTION AND PROCESSING
sample_documents = [
    Document(
        page_content="""
        Artificial Intelligence (AI) is the simulation of human intelligence in machines.
        These systems are designed to think like humans and mimic their actions.
        AI can be categorized into narrow AI and general AI.
        """,
        metadata={"source": "AI Introduction", "page": 1, "topic": "AI"}
    ),
    Document(
        page_content="""
        Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised, unsupervised, and reinforcement learning.
        """,
        metadata={"source": "ML Basics", "page": 1, "topic": "ML"}
    ),
    Document(
        page_content="""
        Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning has revolutionized computer vision, NLP, and speech recognition.
        """,
        metadata={"source": "Deep Learning", "page": 1, "topic": "DL"}
    ),
    Document(
        page_content="""
        Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
        Applications include chatbots, translation, sentiment analysis, and text summarization.
        """,
        metadata={"source": "NLP Overview", "page": 1, "topic": "NLP"}
    )
]

print(sample_documents)

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='\n        Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.\n        '), Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='\n        Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.\n        '), Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='\n        Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolu

In [3]:
#text spliiting
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500 , chunk_overlap = 50 , separators=[" "] , length_function = len)
chunks = text_splitter.split_documents(sample_documents)

In [4]:
chunks

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.'),
 Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'),
 Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recogn

In [5]:
#load the embedding models 
embeddings = OpenAIEmbeddings(
    model = "text-embedding-3-small",
    dimensions=1536
)

In [6]:
import numpy as np

In [7]:
#Compare cosine similarity 
def compare_embeddings(text1:str , text2:str):
    """Compare semantic similarity of 2 text using embeddings"""

    emb1 = np.array(embeddings.embed_query(text1))
    emb2 = np.array(embeddings.embed_query(text2))


    #Calculate the similarity score
    similarity = np.dot(emb1 , emb2)/(np.linalg.norm(emb1) * np.linalg.norm(emb2))

    return similarity

In [8]:
print(compare_embeddings("AI" , "Artificial Intelligence"))

0.5634442191715333


In [9]:
#Create FAISS VECTOR DB 
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings

)
print(vectorstore.index.ntotal)

4


In [10]:
#Save faiss vector for later
vectorstore.save_local("faiss_index")

In [11]:
query = "What is Deep Learning"
results = vectorstore.similarity_search(query , k=3)
results

[Document(id='19e2b901-526e-4b43-9278-ba6b68bb3806', metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recognition.'),
 Document(id='5cd90e5f-4958-45ba-8200-7c9b90ee4555', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'),
 Document(id='4a59b502-3930-46bb-830d-49fbf8e01a55', metadata={'source': 'NLP Overview', 'page': 1, 'topic': 'NLP'}, page_content='Natural Language Processing (NLP) is a branch of AI that helps computers understand human la

In [12]:
 #Building LCEL 
llm = init_chat_model("groq:qwen/qwen3-32b")

In [13]:
llm

ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 16384, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': True, 'tool_calling': True}, client=<groq.resources.chat.completions.Completions object at 0x000001B18E3B9790>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001B18E2D3A10>, model_name='qwen/qwen3-32b', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [14]:
simple_prompt = ChatPromptTemplate.from_template(
    """Answer the question based on the following context:
    
    Context : {context}

    Question : {question}

    Answer:
    
    """
)

In [15]:
retriever = vectorstore.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k":3}
)

In [17]:
from typing import List
# Format documents for the prompt
def format_docs(docs: List[Document]) -> str:
    """Format documents for insertion into prompt"""
    formatted = []
    for i, doc in enumerate(docs):
        source = doc.metadata.get('source', 'Unknown')
        formatted.append(f"Document {i+1} (Source: {source}):\n{doc.page_content}")
    return "\n\n".join(formatted)

In [18]:
simple_rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    |simple_prompt|llm|StrOutputParser()
)

In [19]:
### Conversational RAg Chain

conversational_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant. Use the provided context to answer questions."),
    ("placeholder", "{chat_history}"),
    ("human", "Context: {context}\n\nQuestion: {input}"),
])

In [20]:
def create_conversational_rag():
    """Create a conversational RAG chain with memory"""
    return (
        RunnablePassthrough.assign(
            context=lambda x: format_docs(retriever.invoke(x["input"]))
        )
        | conversational_prompt
        | llm
        | StrOutputParser()
    )

conversational_rag = create_conversational_rag()

In [21]:
conversational_rag

RunnableAssign(mapper={
  context: RunnableLambda(lambda x: format_docs(retriever.invoke(x['input'])))
})
| ChatPromptTemplate(input_variables=['context', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag=

In [22]:
### streaming RAG chain
streaming_rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | simple_prompt
    | llm
)

print("Modern RAG chains created successfully!")
print("Available chains:")
print("- simple_rag_chain: Basic Q&A")
print("- conversational_rag: Maintains conversation history")
print("- streaming_rag_chain: Supports token streaming")

Modern RAG chains created successfully!
Available chains:
- simple_rag_chain: Basic Q&A
- conversational_rag: Maintains conversation history
- streaming_rag_chain: Supports token streaming


In [23]:
# Test function for different chain types
def test_rag_chains(question: str):
    """Test all RAG chain variants"""
    print(f"Question: {question}")
    print("=" * 80)
    
    # 1. Simple RAG
    print("\n1. Simple RAG Chain:")
    answer = simple_rag_chain.invoke(question)
    print(f"Answer: {answer}")

    print("\n2. Streaming RAG:")
    print("Answer: ", end="", flush=True)
    for chunk in streaming_rag_chain.stream(question):
        print(chunk.content, end="", flush=True)
    print()

In [24]:
test_rag_chains("What is the difference between AI and machine learning")

Question: What is the difference between AI and machine learning

1. Simple RAG Chain:
Answer: <think>
Okay, I need to answer the question: "What is the difference between AI and machine learning?" Let me start by recalling the context provided.

First, looking at Document 2 (AI Introduction), it says AI is the simulation of human intelligence in machines, designed to think like humans and mimic their actions. AI is a broader concept. Then, Document 1 (ML Basics) mentions that Machine Learning is a subset of AI. So ML is part of AI. 

The user is asking the difference between the two. So AI is the big picture, and ML is a specific approach within AI. But I need to elaborate. From the documents, AI includes things like narrow AI and general AI. ML is a method used to enable systems to learn from data without explicit programming. 

Document 3 (Deep Learning) adds that Deep Learning is a subset of ML, using neural networks. So the hierarchy is AI > ML > Deep Learning. 

So the main diffe

In [25]:
#Converstaional Example
chat_history = []

q1 = "What is Machine Learning"
a1 = conversational_rag.invoke({
    "input":q1,
    "chat_history": chat_history
})

In [27]:
a1

'<think>\nOkay, let\'s see. The user is asking "What is Machine Learning?" and they provided three documents to reference.\n\nFirst, I need to look at Document 1, which is from ML Basics. It says Machine Learning is a subset of AI that allows systems to learn from data. It mentions that instead of being explicitly programmed, ML algorithms find patterns in data. The common types are supervised, unsupervised, and reinforcement learning. That\'s the key info here.\n\nDocument 2 talks about Deep Learning, which is a subset of ML, but the question is about ML in general. So maybe I don\'t need to include Deep Learning details unless it\'s relevant. The user is asking for a definition, so I should stick to the basics from Document 1.\n\nDocument 3 is about AI in general. It defines AI as simulating human intelligence in machines, but since the question is about Machine Learning, I should focus on Document 1 first. However, maybe a sentence about AI being the parent category would help clari

In [28]:
chat_history.extend([
    HumanMessage(content=q1),
    AIMessage(content= a1)
])