<a href="https://colab.research.google.com/github/ShasmaAfs/RAG-School-Certificate/blob/main/RAG_School_Certificate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install pytesseract sentence-transformers chromadb pdfplumber python-dotenv transformers diffusers accelerate safetensors




In [56]:
from google.colab import files

uploaded = files.upload()
file_name = list(uploaded.keys())[0]

print("Uploaded:", file_name)


Saving school.png to school.png
Uploaded: school.png


In [57]:
import pdfplumber
from PIL import Image
import pytesseract

extracted_text = ""

if file_name.endswith(".pdf"):
    with pdfplumber.open(file_name) as pdf:
        for page in pdf.pages:
            extracted_text += page.extract_text() + "\n"
else:
    img = Image.open(file_name)
    extracted_text = pytesseract.image_to_string(img)

print("Extracted Text:\n", extracted_text)


Extracted Text:
 IJIULTIVWVUVL

CERTIFICATE

Student Name:
Siyasath Shasma
School:

Mahmud Ladies College
Year:

2019
Subjects: Chemistry, Physics, Maths

Issued by:
Ministry of Education



In [69]:
import chromadb
from chromadb.utils import embedding_functions

embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name="School_Certificate_Rag",
    embedding_function=embedding_function
)

collection.add(
    documents=[extracted_text],
    ids=["school_certificate"]
)

print("Vector DB ready!")


Vector DB ready!


In [70]:
from transformers import pipeline

def rag_query(question):
    results = collection.query(
        query_texts=[question],
        n_results=1
    )
    context = results["documents"][0][0]

    rag_prompt = f"""
    You are an assistant. Use ONLY this context to answer:

    Context:
    {context}

    Question:
    {question}

    Answer:
    """

    generator = pipeline("text-generation", model="distilgpt2")

    response = generator(
        rag_prompt,
        max_new_tokens=150,   # generate output
        num_return_sequences=1,
        do_sample=True
    )[0]["generated_text"]

    return response


In [71]:
print(rag_query("Summarize this school certificate"))


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



    You are an assistant. Use ONLY this context to answer:

    Context:
    IJIULTIVWVUVL

CERTIFICATE

Student Name:
Siyasath Shasma
School:

Mahmud Ladies College
Year:

2019
Subjects: Chemistry, Physics, Maths

Issued by:
Ministry of Education


    Question:
    Summarize this school certificate

    Answer:
        
The title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry".
The title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is "Associative Chemistry". However, the title of this class is


In [73]:
from transformers import pipeline

qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")


def rag_query(question):
    results = collection.query(
        query_texts=[question],
        n_results=1
    )

    context = results["documents"][0][0]

    answer = qa_model({
        'question': question,
        'context': context
    })

    return answer['answer']


Device set to use cpu


In [74]:
print(rag_query("What is the student's name?"))

Siyasath Shasma


In [75]:
print(rag_query("What is the school name?"))

Mahmud Ladies College


In [76]:
print(rag_query("What is the year?"))

2019


In [77]:
print(rag_query("What are the subjects?"))

Chemistry, Physics, Maths


In [78]:
print(rag_query("Who issued it?"))


Ministry of Education

