# smolagents with AzureOpenAI model


In [None]:
# Python dependencies for the whole notebook. Extras are quoted so the shell never
# treats the square brackets as a glob; python-dotenv and nltk are listed explicitly
# because later cells import them directly.
%pip install --upgrade --quiet opendatasets openai 'unstructured[pdf]' gradio langchain-openai aperturedb pandas langchain-community 'smolagents[litellm]' arxiv nltk python-dotenv

In [None]:
# System package for PDF parsing. Both commands need root, and -y keeps the
# install from blocking on a confirmation prompt that a notebook cannot answer.
!sudo apt-get update
!sudo apt-get install -y poppler-utils

In [1]:
import nltk

# Fetch the "punkt_tab" tokenizer data into the local nltk_data directory.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/sg/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
import nltk

# Fetch the "averaged_perceptron_tagger_eng" tagger data into the local nltk_data directory.
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/sg/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# Python bindings for the Tesseract OCR engine. %pip installs into the kernel's
# own environment; the PyPI package for the bindings is `pytesseract`.
%pip install --quiet pytesseract

In [None]:
# The Tesseract OCR binary itself. Needs root, and -y avoids the interactive prompt.
!sudo apt-get install -y tesseract-ocr

In [None]:
# Register an ApertureDB connection from a JSON connection string and make it the active one.
!adb config create --active --from-json

In [None]:
# Shell command: needs the leading "!" to run from a notebook cell.
!adb utils execute summary

In [1]:
import os
import json
import arxiv
import requests
import pandas as pd
import opendatasets as od
from langchain_core.documents import Document
from unstructured.partition.auto import partition
#from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.chains import (
    StuffDocumentsChain, LLMChain
)
from langchain.schema import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.callbacks.manager import (
    trace_as_chain_group,
)
import gradio as gr

In [4]:
# Kaggle dataset that contains the arXiv metadata snapshot read later in the notebook.
dataset = "https://www.kaggle.com/datasets/Cornell-University/arxiv"

# Download it with opendatasets.
od.download(dataset)

Dataset URL: https://www.kaggle.com/datasets/Cornell-University/arxiv


In [2]:
def fetch_paper_details(arxiv_id):
    """Download the PDF of one arXiv paper and return its parsed elements.

    Args:
        arxiv_id: arXiv identifier of the paper, e.g. ``"0704.0001"``.

    Returns:
        Whatever ``unstructured``'s ``partition`` returns for the downloaded PDF.

    Raises:
        ValueError: If the arXiv search returns no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    # A default for next() turns "no result" into a clear error instead of a bare StopIteration.
    paper = next(arxiv.Client().results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {arxiv_id!r}")

    # Ids containing "/" (e.g. "hep-ph/0001001") would otherwise point into a
    # directory that does not exist; ids without a slash keep the same file name.
    safe_id = arxiv_id.replace("/", "_")
    pdf_name = f"{safe_id}.pdf"
    paper.download_pdf(filename=pdf_name)
    return partition(pdf_name)

In [None]:
# Shell commands: each needs the leading "!" to run from a notebook cell.
!sudo apt-get update
!sudo apt-get install -y libgl1

In [None]:
papers = []
text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of up to 5000 characters (measured with len) with a 200-character overlap.
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

sample = 5  # arXiv has over 1.7M articles; only the first `sample` metadata records are processed


def _element_text(element):
    """Return the text of one parsed PDF element as a plain string."""
    text = element.text
    if isinstance(text, str):
        return text
    if isinstance(text, (list, tuple)):
        return "".join(str(part) for part in text)
    return str(text)


processed = 0

# The snapshot is read one JSON record per line. The encoding is explicit so that
# decoding does not depend on the platform default.
with open("arxiv/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as file:
    for _ in range(sample):
        line = file.readline()
        if not line:
            # End of file reached before `sample` records were read.
            break
        data = json.loads(line)

        # Extract relevant fields
        arxiv_id = data.get("id", "")

        # Download and parse the paper, then flatten all elements into one string.
        paper_details = "".join(
            _element_text(element) for element in fetch_paper_details(arxiv_id)
        )

        # Use LangChain's splitter to divide the paper text into chunks.
        chunks = text_splitter.create_documents([paper_details])

        # Create one Document per chunk, carrying the paper-level metadata.
        for idx, chunk in enumerate(chunks, start=1):
            papers.append(
                Document(
                    page_content=chunk.page_content,
                    id=f"{arxiv_id}_{idx}",  # Unique ID for each chunk
                    metadata={
                        'title': data.get("title", ""),
                        'authors': data.get("authors", ""),
                        'submitter': data.get("submitter", ""),
                        'abstract': data.get("abstract", ""),
                        'paper_content': chunk.page_content,
                    },
                )
            )

        processed += 1
        print(f"{arxiv_id}: {len(chunks)} chunks")

# Nothing is written to disk here; the documents only live in the `papers` list.
print(f"Processing complete. {len(papers)} chunks from {processed} papers are held in `papers`.")

In [6]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# Azure OpenAI connection settings.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")
deployment = os.getenv("AZURE_DEPLOYMENT")

# Kaggle credential.
KAGGLE_TOKEN = os.getenv("KAGGLE_TOKEN")

In [None]:
# Sanity check: show the non-secret Azure settings that were loaded (the API key is not printed).
print(AZURE_OPENAI_ENDPOINT, AZURE_API_VERSION, deployment)

In [None]:
# Report the openai version installed in this kernel's environment.
import openai

print(openai.__version__)

In [None]:
# Create an ApertureDB instance, then put its connection string (APERTUREDB_JSON) and the settings below in a .env file:
# KAGGLE_TOKEN=""
# KAGGLE_USERNAME=

# AZURE_DEPLOYMENT="gpt-4"
# AZURE_API_VERSION="2024-08-01-preview"
# AZURE_OPENAI_ENDPOINT="https://<>.openai.azure.com/"
# AZURE_OPENAI_API_KEY=""

# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_ENDPOINT=https://api.smith.langchain.com
# LANGCHAIN_API_KEY=""
# LANGCHAIN_PROJECT=""

# APERTUREDB_JSON={"host":"<>.farm0003.cloud.aperturedata.io","port":55555,"username":"","password":"","name":"","use_ssl":true,"use_rest":false,"use_keepalive":true,"retry_interval_seconds":1,"retry_max_attempts":3}


In [11]:
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_community.vectorstores import ApertureDB

# Embedding model used to vectorise the paper chunks.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Build the ApertureDB vector store from the chunk documents.
vector_db = ApertureDB.from_documents(
    papers,
    embeddings,
)

In [34]:
import os
from langchain.embeddings import AzureOpenAIEmbeddings
from openai import AzureOpenAI

# Read the Azure OpenAI settings from the environment.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")

# Raw AzureOpenAI client; it is handed to RetrieverTool later in the notebook.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_API_VERSION,
)

# Name of the Azure deployment that serves the embedding model.
EMBEDDING_DEPLOYMENT_NAME = "text-embedding-3-large"  # Replace with your Azure deployment name

from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI

# LangChain embeddings wrapper bound explicitly to that deployment.
embeddings = AzureOpenAIEmbeddings(
    model=EMBEDDING_DEPLOYMENT_NAME,
    azure_deployment=EMBEDDING_DEPLOYMENT_NAME,
)

from langchain_community.vectorstores import ApertureDB

# Build the ApertureDB vector store from the chunk documents.
vector_db = ApertureDB.from_documents(papers, embeddings)

# Define Models
# Derived from the deployment name so the two constants cannot drift apart.
EMBEDDING_MODEL = EMBEDDING_DEPLOYMENT_NAME
# Use the deployment configured in .env (AZURE_DEPLOYMENT); fall back to the former hard-coded value.
GENERATION_MODEL = os.getenv("AZURE_DEPLOYMENT", "gpt-4")




Progress:   0%|                                | 0.00/39.0 [00:08<?, ?batches/s]


In [35]:
from smolagents import Tool

class RetrieverTool(Tool):
    """smolagents tool that answers a query with documents from the vector store.

    ``forward`` runs a search against the module-level ``vector_db`` and returns
    the matching chunks as a single formatted string. The client passed to the
    constructor is only used by ``get_embedding``, which ``forward`` does not call.
    """

    name = "retriever"
    description = "Uses semantic search to retrieve documents that could be relevant to answer the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to the target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, client, **kwargs):
        """Store ``client`` (an OpenAI-style client exposing ``embeddings.create``) for later use."""
        super().__init__(**kwargs)
        self.embedder = client

    def simple_retriever(self, query: str, n=5):
        """Return the ``n`` documents retrieved for ``query`` as one formatted string.

        Uses the ``"mmr"`` search type of the ``vector_db`` retriever.
        """
        retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": n})
        results = retriever.invoke(query)

        return "\nRetrieved documents:\n" + "".join(
            [
                f"\n\n===== Document {str(i)} =====\n" + doc.page_content
                for i, doc in enumerate(results)
            ]
        )

    def get_embedding(self, text: str):
        """Return the embedding vector for ``text`` from the stored client.

        Uses the openai>=1.0 call shape: the deployment is passed as ``model``
        and the response is read through attributes rather than dict keys.
        """
        response = self.embedder.embeddings.create(
            input=[text],  # Embeddings expect a list of strings
            model=EMBEDDING_DEPLOYMENT_NAME,  # Azure deployment name
        )
        return response.data[0].embedding

    def forward(self, query: str) -> str:
        """Tool entry point: retrieve documents for ``query``."""
        assert isinstance(query, str), "Your search query must be a string"
        return self.simple_retriever(query)

retriever_tool = RetrieverTool(client)


In [48]:
# Initialize LLM and Agent
from smolagents import ToolCallingAgent, LiteLLMModel
import os
from smolagents import AzureOpenAIServerModel

# Chat model served by Azure OpenAI; it drives the agent's tool calls.
model = AzureOpenAIServerModel(
    model_id=GENERATION_MODEL,
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    # Honour the API version configured in .env; fall back to the former hard-coded value.
    api_version=os.environ.get("AZURE_API_VERSION", "2024-08-01-preview"),
)

# Agent whose only tool is the document retriever.
agent = ToolCallingAgent(tools=[retriever_tool], model=model)

question = "Why is calculating Higgs Boson decay important?"
agent_output = agent.run(question)
print(agent_output)

Calculating the decay of the Higgs Boson is important due to several reasons not explicitly detailed in the retrieved documents. However, generally, the importance can be inferred from the discussions around the precision calculations and experimental measurements related to the Higgs Boson decay. These calculations are crucial for testing the predictions of the Standard Model of particle physics, refining our understanding of fundamental particles and forces, and potentially uncovering new physics beyond the Standard Model. The decay patterns and rates of the Higgs Boson can provide insights into the Higgs mechanism, which is responsible for particles acquiring mass. Moreover, discrepancies between theoretical predictions and experimental results could indicate the existence of unknown particles or forces, opening avenues for new research in particle physics.
