## Library Installation

In [12]:
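# Uncomment to install the dependencies (sentence-transformers is needed by the embedding cell below):
# %pip install -q langchain_community langchain langchainhub chromadb langchain-openai sentence-transformers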

## API Key

In [13]:
import os

from google.colab import userdata

# Copy the Colab secret named 'OpenAI_API_Key' into the OPENAI_API_KEY
# environment variable for the rest of the notebook.
os.environ['OPENAI_API_KEY'] = userdata.get('OpenAI_API_Key')

## Scraping a site to load data

In [14]:
from langchain_community.document_loaders import WebBaseLoader

# Page that serves as the knowledge source for the rest of the notebook.
course_url = "https://www.educosys.com/course/genai"

# Fetch the page and load it as a list of LangChain Document objects.
loader = WebBaseLoader(web_paths=[course_url])
docs = loader.load()

print(docs)

[Document(metadata={'source': 'https://www.educosys.com/course/genai', 'title': 'Hands-on Generative AI Course', 'description': 'Hands-on Generative AI Course', 'language': 'en'}, page_content="Hands-on Generative AI CourseCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Hands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 weeks · 3 classes/week · 2 hrs/class + Post-class Doubt SupportAccess all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project - Build a Simple Neural Network Using TensorFlow Mini Project - Train an Autoencoder on the MNIST Dataset2Week 2Deep Generative Models Discriminative and Generative models Generative Adv

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded page into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [16]:
# Display the first chunk produced by the splitter as a sanity check.
splits[0]

Document(metadata={'source': 'https://www.educosys.com/course/genai', 'title': 'Hands-on Generative AI Course', 'description': 'Hands-on Generative AI Course', 'language': 'en'}, page_content='Hands-on Generative AI CourseCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Hands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 weeks · 3 classes/week · 2 hrs/class + Post-class Doubt SupportAccess all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project - Build a Simple Neural Network Using TensorFlow Mini Project - Train an Autoencoder on the MNIST Dataset2Week 2Deep Generative Models Discriminative and Generative models Generative Adve

In [17]:
import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np  # Not used in this cell; kept so code relying on `np` being imported here keeps working

# 1. Pull the raw text and the metadata out of the Document chunks
text_chunks = [doc.page_content for doc in splits]
metadatas = [doc.metadata for doc in splits]
if not text_chunks:
    # The shape/index lookups below assume at least one embedding, so fail
    # early with a clear message instead of an obscure IndexError.
    raise ValueError("No text chunks to embed: `splits` is empty.")

# 2. Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Generate embeddings for the text chunks
print("Generating embeddings...")
phrase_embeddings = model.encode(text_chunks, convert_to_numpy=True)
print(f"Generated {len(phrase_embeddings)} embeddings, each with dimension: {phrase_embeddings.shape[1]}")
print(f"{phrase_embeddings[0][:10]}")

# 4. Initialize the ChromaDB client
# In-memory client (good for testing/development):
client = chromadb.Client()

# 5. Create or get the collection
collection_name = "my_document_chunks_collection"
try:
    collection = client.get_or_create_collection(name=collection_name)
    print(f"Collection '{collection_name}' ready.")
except Exception as e:
    print(f"Error getting/creating collection: {e}")
    # Re-raise: every later cell needs `collection`, so continuing would only
    # postpone the failure to a confusing NameError.
    raise

# 6. Build one stable identifier per chunk
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# 7. Store embeddings and data in the collection
# upsert (rather than add) keeps this cell re-runnable: the ids are fixed
# ("doc_0", "doc_1", ...), so a second run overwrites the existing entries
# instead of colliding with them.
print("Adding embeddings and documents to ChromaDB...")
try:
    collection.upsert(
        embeddings=phrase_embeddings.tolist(),  # Convert numpy array to list of lists
        documents=text_chunks,                  # The original text content
        metadatas=metadatas,                    # The extracted metadata
        ids=ids                                 # Unique identifiers
    )
    print(f"Successfully added {len(text_chunks)} chunks to the collection '{collection_name}'.")
except Exception as e:
    print(f"Error adding documents to collection: {e}")
    # Re-raise so the notebook does not go on to query a collection that was
    # never populated.
    raise


Generating embeddings...
Generated 11 embeddings, each with dimension: 384
[-0.09630623 -0.08975942  0.04699757 -0.01128613 -0.02908106  0.02911405
 -0.02468756 -0.03765856 -0.10992973 -0.05269187]
Collection 'my_document_chunks_collection' ready.
Adding embeddings and documents to ChromaDB...
Successfully added 11 chunks to the collection 'my_document_chunks_collection'.


In [18]:
# Display the embedding matrix returned by model.encode (one row per text chunk).
phrase_embeddings

array([[-0.09630623, -0.08975942,  0.04699757, ...,  0.06487723,
        -0.12483238, -0.01229357],
       [-0.10834444, -0.06192407,  0.05941018, ...,  0.07756441,
        -0.03659645,  0.0271165 ],
       [-0.02639303, -0.04659655,  0.06478772, ...,  0.04558603,
        -0.0018365 ,  0.01896166],
       ...,
       [-0.05776486, -0.07663081, -0.03873942, ..., -0.01104692,
        -0.08679643,  0.00611247],
       [-0.05165751, -0.01618834, -0.02688786, ...,  0.01069521,
        -0.10619239,  0.01832873],
       [-0.04477612, -0.00829121, -0.00358509, ..., -0.03145244,
        -0.02841158, -0.03470898]], dtype=float32)

In [9]:
# print(collection.get(ids=['doc_0'], include=['embeddings','documents']))

In [10]:
# Embed the question with the same SentenceTransformer `model` that produced
# the stored chunk embeddings, so query and index vectors always come from one
# model. Passing `query_texts` instead would make the collection embed the
# question with its own embedding function.
query_text = "What exactly is in the Gen AI course?"
query_results = collection.query(
    query_embeddings=model.encode([query_text], convert_to_numpy=True).tolist(),
    n_results=2, # Number of top similar results
    # You can also use where clauses for metadata filtering:
    # where={"source": "https://www.educosys.com/course/genai"}
)

print("\nQuery Results:")
# Results are nested one list per query. A single query was issued, so the hits
# live at index 0. Check that inner list: the outer list is non-empty even when
# nothing matched.
documents = query_results['documents'][0] if query_results and query_results['documents'] else []
if documents:
    # 'distances' might not always be present depending on the query type
    distances = query_results['distances'][0] if query_results['distances'] else None
    for i, doc in enumerate(documents):
        print(f"  Result {i+1}:")
        print(f"    Document: {doc}")
        if distances:
            print(f"    Distance: {distances[i]}")
        print(f"    Metadata: {query_results['metadatas'][0][i]}")
        print(f"    ID: {query_results['ids'][0][i]}")
else:
    print("No results found or an error occurred during query.")



/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 63.5MiB/s]



Query Results:
  Result 1:
    Document: in your mind regarding the topic. I haven't found such intuitions and easy to understand explanations elsewhere. Taking this course has increased my confidence towards GenAI and it has given me a solid platform from where I can read more AI related blogs, whitepapers, etc. without feeling overwhelmed. It has even helped me demonstrate a POC within my team that was well received. I recommend this course to anyone starting out with GenAIRead moreSudarshan Suresh SrikantSoftware EngineerCienaKudos to the Educosys team and thanks to Keerti for making learning AI/ML easy. Starting from the basics upto advanced level with most up to date AI developments, the course instilled me with great confidence in approaching problems that could be solved with AI. Every minute detail with code was covered in-depth. I would definitely recommend taking up this course.Read moreSyed IInformation Security OfficerForm3A real game changer of a course especially when it

In [19]:
from langchain_openai import ChatOpenAI
# Chat model client created with no arguments, i.e. ChatOpenAI's default settings.
llm = ChatOpenAI()

In [21]:
def format_docs(docs):
    """Join the page_content of every document into one newline-separated string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in input order, separated by a single newline
        (an empty string when ``docs`` is empty).
    """
    contents = [doc.page_content for doc in docs]
    return "\n".join(contents)

In [22]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Hand-written RAG prompt template with {question} and {context} placeholders.
prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:"
)

In [31]:
from langchain import hub
# Pull the "rlm/rag-prompt" template from the hub; this rebinds `prompt`,
# replacing whatever value it held before.
prompt = hub.pull("rlm/rag-prompt")



In [27]:
# Print the prompt object to inspect its input variables and message template.
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document # Important: if your format_docs expects this

# 1. Helper that flattens a list of LangChain Document objects into the single
#    string substituted for {context} in the prompt.
def format_docs(docs: list[Document]) -> str:
    """
    Merge LangChain Document objects into one context string.

    The page_content of each document is emitted in input order, with a
    double newline between consecutive documents.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# 2. Wire up the components used by the chain.
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

#    a. Retriever: a LangChain retriever that, given a query, returns a list of
#       relevant Document objects. It wraps the existing `client` and the
#       "my_document_chunks_collection" collection, so nothing is re-indexed here.
#       (A chromadb PersistentClient, e.g. path="./chroma_db_data", could be
#       passed as `client` instead.)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(
    client=client,
    collection_name="my_document_chunks_collection",
    embedding_function=embedding_function,
)
retriever = vectorstore.as_retriever()

#    b. Prompt template: the system message carries the {context} placeholder,
#       the human message carries {question}.
system_message = (
    "You are an AI assistant. Answer the user's question based only on the provided context. "
    "If you don't know the answer, say 'I don't know.'"
    "\n\nContext:\n{context}"
)
prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", "Question: {question}"),
])

#    c. LLM: the chat model that answers from the filled-in prompt.
llm = ChatOpenAI()

# 3. Construct the RAG chain using LCEL.
#    The chain input is fanned out to both keys of the dict below:
#    - "context": the input goes to the retriever, and the returned Documents
#      are piped into format_docs (wrapped in RunnableLambda so the plain
#      Python function can be used as a Runnable);
#    - "question": the input is passed through unchanged.
context_chain = retriever | RunnableLambda(format_docs)
rag_chain = (
    {"context": context_chain, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()  # Parses the LLM's output to a string
)

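# 4. Invoke the chain
# The chain takes the question as a plain string, not a {"question": ...} dict:
# the same input is sent to the retriever and passed through as {question}.
# response = rag_chain.invoke("How long is the Generative AI course?")
# print("\nFinal RAG Chain Response:")
# print(response)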


In [None]:
# Ask a question: the string is used both as the retriever query and as the
# {question} value in the prompt.
rag_chain.invoke("Will I get course completion certificates?")