In [1]:
# remove comment before running
"""
!pip install langchain
!pip install langchain-community
!pip install tiktoken
!pip install chromadb
!pip install python-dotenv
!pip install langchain-google-genai
!pip install pdfminer.six"""

'\n!pip install langchain\n!pip install langchain-community\n!pip install tiktoken\n!pip install chromadb\n!pip install python-dotenv\n!pip install langchain-google-genai\n!pip install pdfminer.six'

In [2]:
# if you get api key error please create file named .env and add your api keys with following names
# GOOGLE_API_KEY
# ACCESS_TOKEN


from dotenv import load_dotenv
load_dotenv()
import os
google_api_key = os.getenv("GOOGLE_API_KEY")
access_token = os.getenv("ACCESS_TOKEN")
# Set the environment variable
os.environ["GOOGLE_API_KEY"] = google_api_key
os.environ["ACCESS_TOKEN"] = access_token

## Load PDF

In [3]:
from langchain_community.document_loaders import PDFMinerLoader

In [4]:
def load_pdf(path):
  pdf_loader = PDFMinerLoader(path)
  docs = pdf_loader.load()
  return docs

## Split PDF into Chunks

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
def split_text(docs, chunk_size=2000, chunk_overlap=100):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  chunks = text_splitter.split_documents(docs)
  return chunks

## Generate Embedings and Store into ChormaDB

In [7]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma

In [8]:
def embd_and_store(chunks):
  embd = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
  vector_store = Chroma.from_documents(chunks,embd,persist_directory="./chroma")
  return vector_store

In [9]:
docs = load_pdf("./tesi.pdf") # Add path to your pdf
chunks = split_text(docs)
vector_store = embd_and_store(chunks)

## Get Embeddings


In [10]:
def get_embd(query):
  embd = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
  return embd.embed_query(query)

In [11]:
''' query = "What does the document say about AI?"
retrieved_docs = vector_store.similarity_search_with_score(query=query, k=5)

# Display results with distances
for i, (doc, score) in enumerate(retrieved_docs, 1):
    print(f"Result {i}:")
    print(f"Text: {doc.page_content}")
    print(f"Distance Score: {score}\n")
'''

' query = "What does the document say about AI?"\nretrieved_docs = vector_store.similarity_search_with_score(query=query, k=5)\n\n# Display results with distances\nfor i, (doc, score) in enumerate(retrieved_docs, 1):\n    print(f"Result {i}:")\n    print(f"Text: {doc.page_content}")\n    print(f"Distance Score: {score}\n")\n'

## LLM Model

In [12]:
from transformers import AutoModelForCausalLM,AutoTokenizer

In [13]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [14]:
model = AutoModelForCausalLM.from_pretrained(model_name,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [24]:
def get_response(query):
    retrived_doc = vector_store.similarity_search(query=query,k=1)
    context = ""
    for doc in retrived_doc:
        context += doc.page_content + "\n"
    prompt = f"""
[System]: You are a helpful AI assistant. Provide concise and accurate answers to user questions.based on the context you recoved form the document provided.
[Context]: {context}
[User]: {query}
"""
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

    output = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,  # Pass the attention mask
    pad_token_id=tokenizer.pad_token_id,
    # max_length=100,  # Limit the response length
    temperature=0.7,  # Balances creativity and determinism
    top_p=0.9,       # Nucleus sampling: keeps the top 90% of probability mass
    top_k=50,        # Limits sampling to the top 50 tokens
    num_beams=1,     # Beam search for better coherence
    do_sample=True,  # Enables sampling for more diverse outputs
    repetition_penalty=1.2,  # Reduces repetition in the output
    no_repeat_ngram_size=2,  # Prevents repeating 2-grams
    early_stopping=True,     # Stops generation when the model is confident
)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [25]:
query = "what is similarity metrics"
response = get_response(query)

In [26]:
print(response)


[System]: You are a helpful AI assistant. Provide concise and accurate answers to user questions.based on the context you recoved form the document provided.
[Context]: using the formula:

Cosine Similarity =

A · B
∥A∥∥B∥

where A and B are the vectors being compared, A · B is the dot product, and
∥A∥ and ∥B∥ are their norms. This metric is useful for assessing how aligned
two vectors are in the same direction, regardless of their magnitude, and is
often used to measure semantic similarity between words or documents.

9
Related Work

• Euclidean distance measures the linear distance between two vectors in

multidimensional space and is calculated using the formula:

Euclidean Distance =

ö
õ
õ
ô

n
Ø

i=1

(ai − bi)2

where ai and bi are the component values of vectors A and B, and n is
the number of dimensions in the vector. This metric provides an absolute
measure of the distance between the two vectors, useful for identifying overall
differences in their representations.

• maxim