In [1]:
import getpass, os, requests
from dotenv import load_dotenv

# Load environment variables (python-dotenv; reads a .env file when one is found)
load_dotenv()

# Ask for any missing Astra DB credential *before* the values are read below.
# Reading first would leave the module-level variables as None whenever the
# user had to be prompted, even though os.environ was filled in afterwards.
if "ASTRA_DB_API_ENDPOINT" not in os.environ:
    os.environ["ASTRA_DB_API_ENDPOINT"] = getpass.getpass("Provide your Astra DB Endpoint")

if "ASTRA_DB_APPLICATION_TOKEN" not in os.environ:
    os.environ["ASTRA_DB_APPLICATION_TOKEN"] = getpass.getpass("Provide your Astra DB Token")

# Retrieve API keys (each is None when the corresponding variable is unset)
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
astra_db_api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT")
astra_db_application_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")

In [2]:
from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAIEmbeddings
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen with some native libraries — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# Embedding model plus the Astra DB collection that stores the vectors.
# Credentials come from the environment populated in the first cell.
embedding = OpenAIEmbeddings()
astra_connection = {
    "token": os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    "api_endpoint": os.getenv("ASTRA_DB_API_ENDPOINT"),
}
vstore = AstraDBVectorStore(
    collection_name="test",
    embedding=embedding,
    **astra_connection,
)
print("Astra vector store configured")

Astra vector store configured


In [3]:
import uuid
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from unstructured.partition.pdf import partition_pdf
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [4]:
# Partition every file in uploaded_docs/ and pool the resulting chunks.
# The accumulator must be created once, *outside* the loop: re-creating it per
# iteration discarded the chunks of every file except the last one, and left
# `all_chunks` undefined for the following cells when the folder was empty.
all_chunks = []

for uploaded_file in os.listdir("uploaded_docs"):
    file_path = os.path.join("uploaded_docs", uploaded_file)

    chunks = partition_pdf(
        filename=file_path,
        infer_table_structure=True,            # extract tables
        strategy="hi_res",                     # mandatory to infer tables

        extract_image_block_types=["Image"],   # Add 'Table' to list to extract image of tables
        # image_output_dir_path=output_path,   # if None, images and tables will saved in base64

        extract_image_block_to_payload=True,   # if true, will extract base64 for API usage

        chunking_strategy="by_title",          # or 'basic'
        max_characters=10000,                  # defaults to 500
        combine_text_under_n_chars=2000,       # defaults to 0
        new_after_n_chars=6000,

        # extract_images_in_pdf=True,          # deprecated
    )
    all_chunks.extend(chunks)

# Split the pooled chunks into table chunks and text chunks.
# Classification looks for the class name inside str(type(chunk)); the two
# checks are independent, so a chunk matching both names lands in both lists.
tables = [chunk for chunk in all_chunks if "Table" in str(type(chunk))]
texts = [chunk for chunk in all_chunks if "CompositeElement" in str(type(chunk))]

# Get the images from the CompositeElement objects
def get_images_base64(chunks):
    """Collect the base64 payloads of all images nested inside text chunks.

    Only chunks whose type name contains "CompositeElement" are inspected; for
    each of them, every original element whose type name contains "Image"
    contributes its ``metadata.image_base64`` value.

    Args:
        chunks: iterable of chunk objects exposing ``metadata.orig_elements``.

    Returns:
        list of base64 strings, in encounter order. Chunks without original
        elements and images without a payload are skipped, so the result never
        contains ``None``.
    """
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" not in str(type(chunk)):
            continue
        # orig_elements can be missing/None (assumption: e.g. when the chunker
        # did not retain them) — treat that as "no nested elements".
        chunk_els = getattr(chunk.metadata, "orig_elements", None) or []
        for el in chunk_els:
            if "Image" not in str(type(el)):
                continue
            payload = getattr(el.metadata, "image_base64", None)
            # A missing payload would otherwise be forwarded downstream as None.
            if payload:
                images_b64.append(payload)
    return images_b64

# Base64 payloads of the images embedded in the text chunks
images = get_images_base64(all_chunks)

# Prompt shared by the text and table summaries; {element} receives the chunk
prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}

"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: input -> {"element": input} -> prompt -> Groq chat model -> str
model = ChatGroq(temperature=0.5, model="llama-3.1-8b-instant")
as_element = {"element": lambda x: x}
summarize_chain = as_element | prompt | model | StrOutputParser()

# Text chunks are passed as-is, at most 3 requests in flight
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 3})

# Tables are summarised from their HTML rendering, same concurrency cap
tables_html = [tbl.metadata.text_as_html for tbl in tables]
table_summaries = summarize_chain.batch(tables_html, config={"max_concurrency": 3})


# Summarise images with a vision-capable OpenAI model.
# NOTE(review): the instruction hard-codes "research paper explaining the
# transformers architecture" as context — confirm this matches the documents
# actually placed in uploaded_docs/.
prompt_template = """Describe the image in detail. For context,
                the image is part of a research paper explaining the transformers
                architecture. Be specific about graphs, such as bar plots."""

# One user message made of two parts: the instruction and the image itself,
# passed as a data URL whose {image} placeholder is filled per input.
instruction_part = {"type": "text", "text": prompt_template}
image_part = {
    "type": "image_url",
    "image_url": {"url": "data:image/jpeg;base64,{image}"},
}
messages = [("user", [instruction_part, image_part])]

prompt = ChatPromptTemplate.from_messages(messages)
vision_model = ChatOpenAI(model="gpt-4o-mini")
chain = prompt | vision_model | StrOutputParser()

# One summary per base64 string in `images`
image_summaries = chain.batch(images)


# id_key = "undefined"


In [6]:
# Store the text summaries in the vector store, tagged doctype="text".
# NOTE(review): doc_ids is generated but not referenced again in this cell.
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=text_summary, metadata={"doctype": "text"})
    for text_summary in text_summaries
]
# Only write when there is at least one summary
if len(summary_texts) > 0:
    inserted_ids1 = vstore.add_documents(summary_texts)

In [7]:
def _to_summary_docs(summaries, doctype):
    """Wrap each summary string in a Document tagged with the given doctype."""
    return [
        Document(page_content=summary, metadata={"doctype": doctype})
        for summary in summaries
    ]


# Table summaries -> vector store (skipped when there are none)
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = _to_summary_docs(table_summaries, "table")
if summary_tables:
    inserted_ids2 = vstore.add_documents(summary_tables)

# Image summaries -> vector store (skipped when there are none)
img_ids = [str(uuid.uuid4()) for _ in images]
summary_img = _to_summary_docs(image_summaries, "image")
if summary_img:
    inserted_ids3 = vstore.add_documents(summary_img)

In [11]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Retrieve the 3 most similar summaries for each question
retriever = vstore.as_retriever(search_kwargs={"k": 3})

prompt_template = """
Answer the question based only on the supplied context. If you don't know the answer, say you don't know the answer.
Context: {context}
Question: {question}
Your answer:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)
# NOTE(review): this cell's imports rebind ChatOpenAI to the legacy
# `langchain.chat_models` path; the warning recorded under this cell suggests a
# deprecation — consider relying on the `langchain_openai` import made earlier.
model = ChatOpenAI()


def format_docs(docs):
    """Join the page_content of the retrieved documents into one text block.

    Without this step the prompt's {context} slot receives the Python repr of
    the whole Document list (metadata and escaping included) instead of text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# question -> {context: retrieved text, question: unchanged} -> prompt -> LLM -> str
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Invoke the chain
chain.invoke("give 5 skills of pratik bhangale?")

  model = ChatOpenAI()


"Pratik Bhangale's skills include proficiency in various programming languages and tools, with expertise in areas such as NLP, deep learning, and machine learning."

In [12]:
# Same question as in the previous cell; the recorded answer differs between
# the two runs, so the output is not deterministic across invocations.
chain.invoke("give 5 skills of pratik bhangale?")

'1. Proficiency in various programming languages\n2. Expertise in areas such as NLP, deep learning, and machine learning\n3. Experience as a data science intern\n4. Worked on projects involving generative AI-driven medical imaging\n5. Worked on projects involving SAP installation search automation using agentic AI and multi-modal RAG'

In [14]:
# Query the retriever directly (bypassing the LLM) to inspect which stored
# summaries are returned for the question, printing one document per line.
question = "give 5 skills of pratik bhangale?"
retrieved_docs = retriever.invoke(question)
for hit in retrieved_docs:
    print(hit)

page_content='Pratik Bhangale is a master's student in Artificial Intelligence at Rochester Institute of Technology, with a strong background in computer science and engineering. His technical skills include proficiency in various programming languages and tools, with expertise in areas such as NLP, deep learning, and machine learning. He has experience as a data science intern and has worked on several projects, including generative AI-driven medical imaging and SAP installation search automation using agentic AI and multi-modal RAG.' metadata={'doctype': 'text'}
page_content='Engineered model optimization techniques like knowledge distillation and pruning achieved an 87.5% reduction in model size while maintaining performance. A data preprocessing pipeline handled 200,000 conversation records improving data quality. A neural network model achieved 97% accuracy, and its size was reduced from 150MB to 40MB for edge devices. An Android app was developed for the model's deployment.' meta