In [19]:
from langchain_community.llms import Ollama

llm = Ollama(
    model="llama3"
) 

msg= llm.invoke("Tell me a joke")
print(msg)
print(type(msg))

Why did the computer go to therapy?

Because it had a virus and needed to get to the root of the problem!

Hope that made you laugh!
<class 'str'>


In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import chromadb

In [36]:
import os
from langchain.document_loaders import PyPDFLoader  # Correct import for PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter  # Correct import for CharacterTextSplitter

# Define the folder path containing the PDFs
folder_path = "D:\Codes\RagAitutor\.venv\Study"

# Initialize a dictionary to store chunks from all PDFs
pdf_chunks = {}

# Initialize the text splitter with desired chunk size and overlap
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):  # Check if the file is a PDF
        file_path = os.path.join(folder_path, filename)
        
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        
        # Split each PDF into chunks
        chunks = text_splitter.split_documents(documents)
        
        # Store chunks under the file name
        pdf_chunks[filename] = chunks
        print(f"Number of chunks in {filename}: {len(chunks)}")

# Now you have a dictionary where each file has its corresponding chunks
print(f"Total number of PDFs: {len(pdf_chunks)}")

# Example: Print the first chunk of the first document
if len(pdf_chunks) > 0:
    first_pdf = list(pdf_chunks.keys())[0]  # Get the first PDF filename
    print(pdf_chunks[first_pdf][0].page_content)


Number of chunks in 21BCE7186_Zeta Internship Report.pdf: 10
Number of chunks in Test doc.pdf: 23
Total number of PDFs: 2
Date:
03-11-2023
TO
WHOMSOEVER
IT
MAY
CONCERN
This
is
to
certify
that
Mr.
Rahul
Srivatsa
Manikandan
(11050)
was
working
at
Better
World
Technology
Private
Limited
as
an
Intern
from
04-09-2023
till
03-11-2023
.
He
has
successfully
completed
the
internship
project
under
the
guidance
of
Rohan
Bajla
-
Sr.
Product
Manager
.
We
found
him
sincere,
hardworking,
technically
sound
and
result
oriented.
He
worked
well
as
part
of
a
team
during
his
tenure.
We
take
this
opportunity
to
thank
him
and
wish
him
all
the
best
in
his
future
endeavours.
Regards,
1


In [44]:
# Step 3: Define the function to get embeddings using the BGE model
def get_embeddings(text):
    tokenizer = AutoTokenizer.from_pretrained("BAAI/llm-embedder")
    model = AutoModel.from_pretrained("BAAI/llm-embedder")
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [45]:
# Custom Embeddings Class
class CustomEmbeddings:
    def embed_documents(self, documents):
        return [get_embeddings(doc.page_content) for doc in documents]
    
    def embed_query(self, text):
        return get_embeddings(text)

# Instantiate the embedding function
embedding_function = CustomEmbeddings()

# Step 3: Embed Documents for Each PDF
pdf_embeddings = {}

for filename, chunks in pdf_chunks.items():
    document_embeddings = embedding_function.embed_documents(chunks)
    pdf_embeddings[filename] = document_embeddings
    print(f"Embeddings for {filename} generated with {len(document_embeddings)} embeddings.")

# Now, pdf_embeddings contains the embeddings for each PDF
print(f"Total PDFs embedded: {len(pdf_embeddings)}")

# Example: Print the embeddings for the first chunk of the first document
if len(pdf_embeddings) > 0:
    print(f"First embedding of the first PDF: {pdf_embeddings[first_pdf][0]}")

Embeddings for 21BCE7186_Zeta Internship Report.pdf generated with 10 embeddings.
Embeddings for Test doc.pdf generated with 23 embeddings.
Total PDFs embedded: 2
First embedding of the first PDF: [[ 5.75978696e-01  3.43872815e-01  6.14163876e-02  4.01624352e-01
   1.03580773e+00  2.53904343e-01  4.66790110e-01  4.36652571e-01
  -4.38386172e-01 -5.65180123e-01 -2.09384009e-01 -1.36050746e-01
  -9.38952386e-01  6.28634989e-01  5.52831650e-01  9.18171763e-01
   8.66205931e-01  3.96356136e-02  1.68827459e-01  4.08209860e-01
  -1.20009556e-01 -2.23207921e-01  6.28527522e-01  2.79836744e-01
   2.30484024e-01  3.99007171e-01  3.11976641e-01  5.33952832e-01
  -9.24516559e-01 -5.84754586e-01  8.82649362e-01 -4.65777874e-01
   1.34607598e-01 -7.75310934e-01  6.53631091e-02 -1.05014481e-01
  -6.36790216e-01  1.36403576e-01 -2.50001937e-01 -3.69401813e-01
  -6.73623681e-01 -4.79331583e-01 -3.49456556e-02  9.59707946e-02
  -5.39943337e-01  3.23978782e-01 -1.40471131e-01  7.33657420e-01
  -6.889473

In [46]:
import chromadb

# Initialize ChromaDB
client = chromadb.Client()

In [42]:
try:
    collection = client.get_collection("pdf_embeddings")
except Exception as e:
    print(f"Collection not found: {e}")
    # If the collection doesn't exist, create a new one
    collection = client.create_collection("pdf_embeddings")

# Iterate over each file and its chunks
for filename, chunks in pdf_chunks.items():
    for i, chunk in enumerate(chunks):
        # Embed the chunk (assuming embed_documents works with lists of one document)
        embedding = embedding_function.embed_documents([chunk])[0].flatten().tolist()
        
        # Generate a unique ID for each chunk (using the filename and chunk index)
        doc_id = f"{filename}_chunk_{i}"

        # Add the chunk and its embedding to ChromaDB
        collection.add(
            documents=[chunk.page_content],  # Text data (the chunk content)
            embeddings=[embedding],          # Corresponding flat embeddings
            ids=[doc_id]                     # Unique IDs for each chunk (includes filename and index)
        )

print(f"Successfully added chunks from all PDFs to ChromaDB!")

Successfully added chunks from all PDFs to ChromaDB!


In [57]:
# Step 1: Perform similarity search in ChromaDB to retrieve relevant chunks based on the query
query = "what is a SDK"

# Embed the query
query_embedding = embedding_function.embed_query(query).flatten().tolist()

# Perform similarity search in ChromaDB to retrieve the most relevant chunks
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # Return top 5 most similar results
)

# Step 2: Extract and combine the most relevant chunks (context)
retrieved_context = "\n".join([doc for sublist in results['documents'] for doc in sublist])

# Print the retrieved context for reference
print("Retrieved Context: ", retrieved_context)


Retrieved Context:  INDEX
1.
PRODUCT
1-
UPI
SDK
Integration
Project
2.
PRODUCT
2-
Bill
Splitting
Feature
Integration
in
PayZapp
3.
PRODUCT
3-
UPI
for
NRIs
Using
International
Numbers
in
PayZapp
4.
PRODUCT
4-
Global
UPI
Integration
in
PayZapp
3
INDEX
1.
PRODUCT
1-
UPI
SDK
Integration
Project
2.
PRODUCT
2-
Bill
Splitting
Feature
Integration
in
PayZapp
3.
PRODUCT
3-
UPI
for
NRIs
Using
International
Numbers
in
PayZapp
4.
PRODUCT
4-
Global
UPI
Integration
in
PayZapp
3
The
report
delves
into
the
implementation
of
the
UPI
SDK
integration
project,
covering
technical
challenges,
regulatory
compliance,
and
seamless
integration
into
existing
workflows.
The
focus
is
on
ensuring
a
secure
and
user-friendly
experience
for
both
merchants
and
customers.
Market
Analysis
A
thorough
analysis
of
the
market
and
competitors
provides
insights
into
the
potential
impact
of
UPI
SDK
integration
on
our
company's
position.
This
section
highlights
the
growing
significance
of
UPI
in
various
sectors
and
positions
our


In [58]:
# Step 3: Load environment variables and configure the Gemini API
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from .env file
load_dotenv()

# Configure the API key for Gemini
api_key = os.getenv("API_KEY")
genai.configure(api_key=api_key)

# Create the model with configuration
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    system_instruction="You are an AI tutor. You will be provided with study material in the form of text. Your task is to help the student understand the material by explaining concepts, answering questions, and providing summaries.\nFollow the instructions carefully to assist the student. Based on the study material provided, perform the following tasks:\n\n1. **Explain Concepts**: Explain any concept or term the student asks about in simple and clear language.\n2. **Answer Questions**: Answer any questions the student asks based on the study material.\n3. **Provide Summaries**: Summarize specific sections or the entire study material if requested by the student.\n4. **Generate Quizzes**: Create a few quiz questions to test the student’s understanding of the material.\n\nRemember to provide detailed and accurate responses. If a concept is not clear from the study material, explain it to the best of your knowledge. Always be polite and helpful.",
)

# Start a chat session
chat_session = model.start_chat(history=[])

# Step 4: Use the retrieved context and query the Gemini model
user_input = f"Context:\n{retrieved_context}\n\nQuestion:\n{query}\n\nAnswer: "

# Send the message to the chat session
response = chat_session.send_message(user_input)

# Print the response from the model
print(f"Response:\n{response.text}\n")


Response:
An SDK stands for **Software Development Kit**. It's a collection of tools, libraries, and documentation that developers use to create applications for a specific platform or operating system. Think of it like a toolbox for programmers.

Here's a simple analogy:

* Imagine you're building a house. The SDK would be like a set of tools (hammers, saws, nails, etc.) that you need to construct the house. 
* The platform would be the land you're building on (e.g., Windows, Android, iOS).
* The app you're building would be the house itself.

The SDK provides developers with ready-made components and instructions that help them streamline the app development process. This makes it faster and easier to build apps without having to start from scratch.

In the context of the study material, the UPI SDK is a set of tools specifically for integrating UPI (Unified Payments Interface) payment functionalities into PayZapp.




In [22]:
history = chat_session.history
print(history)

[parts {
  text: "Context:\nDate:\n03-11-2023\nTO\nWHOMSOEVER\nIT\nMAY\nCONCERN\nThis\nis\nto\ncertify\nthat\nMr.\nRahul\nSrivatsa\nManikandan\n(11050)\nwas\nworking\nat\nBetter\nWorld\nTechnology\nPrivate\nLimited\nas\nan\nIntern\nfrom\n04-09-2023\ntill\n03-11-2023\n.\nHe\nhas\nsuccessfully\ncompleted\nthe\ninternship\nproject\nunder\nthe\nguidance\nof\nRohan\nBajla\n-\nSr.\nProduct\nManager\n.\nWe\nfound\nhim\nsincere,\nhardworking,\ntechnically\nsound\nand\nresult\noriented.\nHe\nworked\nwell\nas\npart\nof\na\nteam\nduring\nhis\ntenure.\nWe\ntake\nthis\nopportunity\nto\nthank\nhim\nand\nwish\nhim\nall\nthe\nbest\nin\nhis\nfuture\nendeavours.\nRegards,\n1\nSUMMER\nINTERNSHIP\nREPORT\nAS\nPRODUCT\nMANAGEMENT\nINTERN\nAT\nDiamond\nDistrict,\nGround\nFloor ,\nTower\nC,\nHAL\nOld\nAirport\nRd,\nDomlur ,\nBengaluru,\nKarnataka\n560008\nInternship\nTenure:\nfrom\n04-09-2023\ntill\n03-11-2023\nBY\nRAHUL\nSRIV ATSA\nM\n21BCE7186\nSCOPE\nBTech\nComputer\nScience\nEngineering\nSUBMITTED\nTO\nV

In [11]:
from langchain.prompts import PromptTemplate

template= '''You are an AI tutor. You will be provided with study material in the form of text. Your task is to help the student understand the material by explaining concepts, answering questions, and providing summaries.
Follow the instructions carefully to assist the student. Based on the study material provided, perform the following tasks:

1. **Explain Concepts**: Explain any concept or term the student asks about in simple and clear language.
2. **Answer Questions**: Answer any questions the student asks based on the study material.
3. **Provide Summaries**: Summarize specific sections or the entire study material if requested by the student.
4. **Generate Quizzes**: Create a few quiz questions to test the student’s understanding of the material.

Remember to provide detailed and accurate responses. If a concept is not clear from the study material, explain it to the best of your knowledge. Always be polite and helpful.

context: {context}

Question: {question}

'''

prompt= PromptTemplate.from_template(template)
print(prompt.format(context="context", question="question"))

You are an AI tutor. You will be provided with study material in the form of text. Your task is to help the student understand the material by explaining concepts, answering questions, and providing summaries.
Follow the instructions carefully to assist the student. Based on the study material provided, perform the following tasks:

1. **Explain Concepts**: Explain any concept or term the student asks about in simple and clear language.
2. **Answer Questions**: Answer any questions the student asks based on the study material.
3. **Provide Summaries**: Summarize specific sections or the entire study material if requested by the student.
4. **Generate Quizzes**: Create a few quiz questions to test the student’s understanding of the material.

Remember to provide detailed and accurate responses. If a concept is not clear from the study material, explain it to the best of your knowledge. Always be polite and helpful.

context: context

Question: question




In [None]:
from langchain_community.llms import Ollama
llm=Ollama(model="llama3")
response = llm.invoke(llama_input)
print(response)