In [None]:
%%capture
! pip install langchain  pydantic lxml langchainhub fastapi kaleido uvicorn

In [None]:
 pip install "unstructured[all-docs]==0.10.26"

In [None]:
%%capture
!sudo apt-get install poppler-utils tesseract-ocr

In [2]:
data = "./pdf/td1.pdf"

In [4]:
from lxml import html
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf

# Partition the PDF at `data` into elements, chunked with the "by_title" strategy.
raw_pdf_elements = partition_pdf(
    filename=data,
    # NOTE(review): "image" is lowercase while "Table" is capitalized — confirm
    # the installed unstructured version accepts both spellings.
    extract_image_block_types=["Table", "image"],
    infer_table_structure=True,
    chunking_strategy="by_title",
    # NOTE(review): 40000 is roughly 10x new_after_n_chars — confirm this is
    # intentional and not a typo for 4000.
    max_characters=40000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Tally the parsed elements by the string form of their class.
category_counts = {}
for pdf_element in raw_pdf_elements:
    class_label = str(type(pdf_element))
    category_counts[class_label] = category_counts.get(class_label, 0) + 1

# Distinct class labels that were seen.
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 48,
 "<class 'unstructured.documents.elements.Table'>": 38}

In [14]:
class Element(BaseModel):
    """Record pairing a category label with the content of one parsed PDF element."""
    # Category label; this notebook assigns either "table" or "text".
    type: str
    # Element content; this notebook stores str(element) here.
    text: Any

# Wrap every table / composite-text element in an `Element` record.
# The class is identified by a substring of str(type(...)); the table check
# runs first, and elements matching neither marker are skipped.
categorized_elements = []
for raw_element in raw_pdf_elements:
    class_path = str(type(raw_element))
    if "unstructured.documents.elements.Table" in class_path:
        kind = "table"
    elif "unstructured.documents.elements.CompositeElement" in class_path:
        kind = "text"
    else:
        continue
    categorized_elements.append(Element(type=kind, text=str(raw_element)))

# Split the records by label and report how many of each were found.
table_elements = [item for item in categorized_elements if item.type == "table"]
print(len(table_elements))

text_elements = [item for item in categorized_elements if item.type == "text"]
print(len(text_elements))

38
48


In [15]:
pip install -q -U google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [16]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Return `text` wrapped in a `Markdown` object as a block quote.

  Bullet characters ('•') are rewritten to '  *', then every line —
  including empty ones — is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # The predicate always returns True so that blank lines get the prefix too.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
from google.colab import userdata

In [17]:

import os

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the Gemini API key from the environment; this is None when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

In [18]:
model = genai.GenerativeModel('gemini-pro')

In [19]:
response = model.generate_content("What is the meaning of life?")

In [20]:
response

<google.generativeai.types.generation_types.GenerateContentResponse at 0x7f483c305150>

In [None]:
pip install langchain_google_genai

In [21]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [24]:
# LangSmith tracing configuration.
# SECURITY: the API key is read from the environment (for example from the
# .env file loaded earlier with load_dotenv) instead of being hardcoded in the
# notebook. The key that was previously committed in this cell must be treated
# as leaked and revoked.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "langchain_semi_structured_RAG"

if os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # Without a key tracing cannot authenticate, so switch it off rather than
    # embedding a secret in the notebook.
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

In [47]:
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Map each of the four listed harm categories to the BLOCK_NONE threshold.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}




In [49]:
# Prompt asking for a short summary of a single table or text chunk.
prompt_text = """Please provide a brief, appropriate summary of the information contained in the following table or text: {element}"""
prompt = ChatPromptTemplate.from_template(prompt_text)

model = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=GOOGLE_API_KEY,
    safety_settings=safety_settings,
)

# Pipeline: input -> {"element": input} -> prompt -> model -> output parsed to str.
summarize_chain = (
    {"element": lambda x: x}
    | prompt
    | model
    | StrOutputParser()
)

In [27]:
# Contents of the text chunks and of the tables collected earlier.
texts = [text_element.text for text_element in text_elements]
tables = [table_element.text for table_element in table_elements]

# Summarize both lists in batches, passing max_concurrency=5 as the batch config.
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})



In [52]:
texts

['ARULMIGU ARTHANAREESWARAR TEMPLE\n\nTIRUCHENGODE TOWN AND TALUK, NAMAKKAL DISTRICT\n\nTender Schedule issued to ............................................ vide.\n\nM.R. No. ...................... dated ...................... containing the following.',
 'S. No.\n\n1.\n\nSchedule – A\n\nrates and approximate\n\nSchedule of quantities.\n\n2.\n\nSchedule – B\n\nRate of Progress\n\n3.\n\nDrawings\n\n4.\n\nEligibility criteria, EMD, Security deposit, Retention amount,Pre-visit to site, Special conditions and safety provisions\n\nSchedule – C\n\n5.\n\nAffidavit\n\nSchedule – D\n\nUndertaking\n\n6.\n\nSchedule – E\n\n7.\n\nSchedule – F\n\nDeclaration\n\nAssistant Commissioner/Executive Officer\n\nName of the Tenderer\n\n:\n\nTNGST Registration Number\n\nArea Code\n\nName of the Assessment Circle :\n\n:\n\n:\n\nSignature of the Contractor\n\nARULMIGU ARTHANAREESWARAR TEMPLE\n\nTIRUCHENGODE TOWN AND TALUK, NAMAKKAL DISTRICT.\n\nNAME OF WORK :\n\nConstruction of Pasumadam',
 'forArulmiguArth

In [53]:
tables

['S. No. 1. Schedule – A Schedule of quantities. 2. Schedule – B Rate of Progress 3. Drawings 4. Schedule – C 5. Schedule – D Affidavit 6. Schedule – E Undertaking 7. Schedule – F Declaration',
 'NAME OF WORK : Construction of Pasumadam forArulmiguArthanareeswarar temple, Tiruchengode Town and Taluk,Namakkal District. Last Date and Time 14.03.2023 up to 2:30 PM for submission of Tender Date and Time of opening of Tender 14.03.2023 from 3:00 PM onwards Earnest Money Rs.30000/- Deposit (E.M.D.) Value Tender document cost Rs.10620/-(for direct purchase)',
 'Rs.38,56,982/- Rs.30000/- Up to 13.03.2023until 5:45PM Period of Issue of Tender Documents Last date for receipt of Tender 14.03.2023upto 2:30 PM documents Period of Completion 6 Months (Six Months )',
 'T.N.D. S.S. No. Probable Quantity Description of works Unit Amount In figures In words 1 65.00 m3 (Cubicmetr e) 1m3 (One cubic metre)',
 'T.N.D. S.S. No. Probable Quantity Description of works Unit Amount In figures In words 2 78.00 m3

In [28]:
text_summaries

['The provided text mentions the ARULMIGU ARTHANAREESWARAR TEMPLE in Tiruchengode Town and Taluk, Namakkal District, but it does not contain any tender information or schedule. Therefore, I cannot provide a summary of the information requested.',
 'The table outlines the components of a tender document for the construction of Pasumadam at Arulmigu Arthanareeswarar Temple in Tiruchengode, Namakkal District. It includes schedules for rates and quantities, rate of progress, drawings, eligibility criteria, an affidavit, undertaking, declaration, and the names and signatures of the tenderer and Assistant Commissioner/Executive Officer.',
 'Arulmigu Arthanareeswarar Temple in Tiruchengode, Namakkal District, is inviting tenders for the construction of a "Pasumadam" (greenhouse). The tender submission deadline is 2:30 PM on March 14, 2023, with the tender opening scheduled for 3:00 PM on the same day. Eligible contractors must be registered with the T.N.P.W.D. Class V or above and have an Ear

In [29]:
table_summaries

['The table lists seven items, each with a letter designation followed by a description. For example, item 1 is designated as "A" and described as "- of."',
 'Tenders are invited for the construction of Pasumadam for Arulmigu Arthanareeswarar temple in Tiruchengode. The last date for submission is March 14, 2023, up to 2:30 PM, with the opening of tenders scheduled for the same day from 3:00 PM onwards. Earnest Money Deposit (EMD) of Rs. 30,000/- is required, and the tender document cost is Rs. 10,620/- for direct purchase.',
 'Tender documents for a project worth Rs. 38,56,982/- can be purchased until 13.03.2023, 5:45 PM. The deadline for submitting tender documents is 14.03.2023, 2:30 PM. The project has a completion period of six months.',
 'The table indicates that 65.00 cubic meters of work is required, with each cubic meter costing 1 unit.',
 'The table lists three entries for "T.N.D. S.S. No.", each with a corresponding "Probable Quantity" and "Description of works". The first e

In [None]:
%%capture
!pip install faiss-cpu tiktoken faiss-gpu chromadb


In [46]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Vector store holding the embedded summaries.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=GoogleGenerativeAIEmbeddings(
        model="models/embedding-001", google_api_key=GOOGLE_API_KEY
    ),
)

# Document store holding the original (un-summarized) content.
store = InMemoryStore()
# Metadata key that links a summary document to its original content.
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(summaries, originals):
    """Add summaries to the vector store and their originals to the docstore.

    A fresh UUID is generated for every original; the UUID is stored under
    `id_key` in the metadata of the matching summary document and is used as
    the docstore key for the original.

    Args:
        summaries: Summary strings, one per entry of `originals`, same order.
        originals: Original contents the summaries were produced from.

    Returns:
        A `(ids, summary_docs)` tuple: the generated ids and the `Document`
        objects that were built for the summaries.

    Raises:
        ValueError: If `summaries` and `originals` differ in length, because
            pairing them by position would then link the wrong content.
    """
    if len(summaries) != len(originals):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} originals; "
            "they must be the same length."
        )
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(ids, summaries)
    ]
    # Skip the store calls when there is nothing to add (e.g. a PDF without
    # tables). NOTE(review): the vector store may reject an empty batch —
    # confirm against the installed chromadb version.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


doc_ids, summary_texts = _index_summaries(text_summaries, texts)
table_ids, summary_tables = _index_summaries(table_summaries, tables)

## RAG from LangChain Expression Language.

Run the [RAG pipeline](https://python.langchain.com/docs/expression_language/cookbook/retrieval).

In [39]:
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough

# Prompt that restricts the answer to the retrieved context.
template = (
    "Answer the question based only on the following context, which can include text and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

model = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=GOOGLE_API_KEY,
)

# The question goes to the retriever to fill {context} and is also passed
# through unchanged to fill {question}.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [40]:
# Query spelling corrected ("doucment" -> "document"); the question text is what gets sent to the retriever.
chain.invoke("What is the total document cost?")

'Rs.30000'

In [41]:
chain.invoke("What is the Earnest Money Deposit value?")

'This context does not mention anything about Earnest Money Deposit, so I cannot answer this question from the provided context.'

In [37]:
chain.invoke("Can you summarize the second table")

'The provided context does not include a second table, so I cannot summarize it.'

In [54]:
chain.invoke("What is the average tokens in responses for Meta?")

'The provided context does not mention anything about the average tokens in responses for Meta, so I cannot answer this question from the provided context.'

In [55]:
chain.invoke("What is the Pretraining data ?")

'The provided context does not mention anything about Pretraining data, so I cannot answer this question from the provided context.'

We can check the [trace](https://smith.langchain.com/public/7ef3c73c-7b04-4621-864d-273606657566/r) to see what chunks were retrieved.

In [57]:
import os
from crewai import Agent, Task, Crew, Process
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model passed as `llm` to the agents defined in this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    verbose=True,
    temperature=0.1,
    google_api_key=GOOGLE_API_KEY,
)

In [None]:
pip install crewai

In [59]:
# Research agent: no tools, and delegation is switched off.
researcher = Agent(
    role='Senior Research Analyst',
    goal='Uncover cutting-edge developments in AI and data science',
    backstory=(
        "You work at a leading tech think tank.\n"
        "Your expertise lies in identifying emerging trends.\n"
        "You have a knack for dissecting complex data and presenting\n"
        "actionable insights."
    ),
    verbose=True,
    llm=llm,
    allow_delegation=False,
    tools=[],
)

# Writing agent: no tools, delegation is switched on.
writer = Agent(
    role='Tech Content Strategist',
    goal='Craft compelling content on tech advancements',
    # The backstory previously read "renowned psychopath", which contradicted
    # the role above; it now names the same profession as `role`.
    backstory=(
        "You are a renowned Content Strategist, known for\n"
        "your insightful and engaging articles.\n"
        "You transform complex concepts into compelling narratives."
    ),
    verbose=True,
    allow_delegation=True,
    llm=llm,
    tools=[],
)

# Research task, assigned to the `researcher` agent.
task1 = Task(
    description=(
        "Conduct a comprehensive analysis of the latest advancements in AI in 2024.\n"
        "Identify key trends, breakthrough technologies, and potential industry impacts.\n"
        "Your final answer MUST be a full analysis report"
    ),
    agent=researcher,
)

# Blog-writing task, assigned to the `writer` agent.
task2 = Task(
    description=(
        "Using the insights provided, develop an engaging blog\n"
        "post that highlights the most significant AI advancements.\n"
        "Your post should be informative yet accessible, catering to a tech-savvy audience.\n"
        "Make it sound cool, avoid complex words so it doesn't sound like AI.\n"
        "Your final answer MUST be the full blog post of at least 4 paragraphs."
    ),
    agent=writer,
)

In [60]:
# Crew made of both agents; tasks are listed research first, then writing.
crew = Crew(
    agents=[researcher, writer],
    tasks=[task1, task2],
    verbose=2,
)

In [61]:
result = crew.kickoff()
print("######################")

[DEBUG]: Working Agent: Senior Research Analyst
[INFO]: Starting Task: Conduct a comprehensive analysis of the latest advancements in AI in 2024.
Identify key trends, breakthrough technologies, and potential industry impacts.
Your final answer MUST be a full analysis report


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Google Trends
Action Input: Search for "AI" and "Data Science"[0mGoogle Trends is not a valid tool, try one of [].[32;1m[1;3mDo I need to use a tool? No
Final Answer: **Comprehensive Analysis of the Latest Advancements in AI in 2024**

**Introduction**

Artificial intelligence (AI) has emerged as a transformative technology with the potential to revolutionize various industries and aspects of human life. In 2024, we witnessed significant advancements in AI, driven by breakthroughs in machine learning, natural language processing, and computer vision. This report provides a comprehensive analysis of the l

In [62]:
print(result)

**AI Advancements: Reshaping the Future**

In the realm of technology, artificial intelligence (AI) has emerged as a game-changer, propelling us towards a future where machines can think, learn, and adapt like never before. 2024 has been a banner year for AI, with groundbreaking advancements that have the potential to reshape industries and transform our lives.

**Generative AI: Unleashing Creativity**

Generative AI models like GPT-4 and DALL-E 2 have taken the world by storm with their ability to generate stunning text, images, and even music. These models are not just replicating existing content; they're creating entirely new and original works. Imagine a world where writers can collaborate with AI to craft captivating stories, or artists can bring their visions to life with AI-generated masterpieces.

**Edge AI: Intelligence at the Edge**

Edge AI brings the power of AI to devices like smartphones and IoT gadgets. These devices can now make real-time decisions and operate autonomo