In [38]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os
import ast


### Load environment files

In [17]:
# Load environment variables (via python-dotenv) so that the os.getenv(...)
# lookups in the OpenAI configuration cell below can resolve the API settings.
load_dotenv()

True

In [18]:
# OpenAI API configuration (Azure OpenAI).
# All credentials and endpoints are read from the environment; never print or
# hard-code them in the notebook, because cell outputs are saved with the file.
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
# `api_base` is the pre-1.0 setting name and is kept only for backward
# compatibility. The 1.x module-level client used by
# `openai.chat.completions.create` reads `azure_endpoint` instead, so both are
# set from the same environment variable.
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Azure deployment name; passed as `model=` on every chat completion call.
openai_deployment = "sdgi-gpt-35-turbo-16k"


[REDACTED: API key removed from saved cell output; rotate this credential and clear outputs before committing]
https://undp-ngd-openai-datafutures-dev-2.openai.azure.com
2023-09-01-preview


<h3>globals</h3>

In [25]:
# Default query shared by the "test usage" cells further down.
test_query = "What are the sustainable energy priorities for UNDP?"



<h3> helper functions </h3>

In [21]:
def callOpenAI(prompt, temperature=0, model=None):
    """Send a single-turn chat prompt and return the text of the reply.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        temperature: Sampling temperature forwarded to the API. Defaults to 0,
            the value this helper previously hard-coded.
        model: Deployment/model name to call. When omitted, the notebook-level
            ``openai_deployment`` is used; it is looked up at call time, so a
            later change to that global is honoured.

    Returns:
        The ``content`` attribute of the first choice's message, exactly as
        returned by the API client.
    """
    deployment = openai_deployment if model is None else model
    completion = openai.chat.completions.create(
        model=deployment,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content


<h3> processing modules </h3>

In [26]:
def promptEngineeringModule(user_query):
    """Return the LLM instruction template for a user query.

    Stub implementation: ``user_query`` is currently ignored and a fixed
    placeholder string is returned.
    """
    # TODO: generate the real prompt engineering text and template
    return "llm instruction template here, with placeholders for insertion of user query, excerpts, indicator data, and entity and relation info"

# Smoke test: build the instruction template for the default query and show it
llm_instructions = promptEngineeringModule(test_query)
print(llm_instructions)

llm instruction template here, with placeholders for insertion of user query, excerpts, indicator data, and entity and relation info


In [32]:
def extractEntitiesFromQuery(user_query):
    """Ask the LLM which entities are mentioned in ``user_query``.

    The prompt requests an array literal such as ``['entity1','entity2']``.
    The reply from ``callOpenAI`` is returned as-is; it is not parsed here.
    """
    extraction_prompt = f"""
    Extract entities from the following user query: \"{user_query}\" and return output in array format e.g ['entity1','entity2']
    
    -Entities should be directly related to the domain or topic of interest. They should represent important concepts that contribute to the understanding of the subject matter.
    -Each entity in the knowledge graph should be distinct and have a unique identifier. This ensures clarity and avoids ambiguity when establishing relationships between entities.
    """
    return callOpenAI(extraction_prompt)

# Test usage
# A cell-local query name is used so that the shared `test_query` global is
# not overwritten for the cells that run after this one.
entity_test_query = "What are the sustainable energy for UNDP?"
entity_list = extractEntitiesFromQuery(entity_test_query)
print(entity_list)

['sustainable energy', 'UNDP']


In [44]:
def _parseEntityList(raw_reply):
    """Best-effort conversion of the LLM's entity reply into a list of unique names."""
    if not isinstance(raw_reply, str):
        return []
    # The model may wrap the array in prose or code fences; keep only the
    # outermost [...] span before evaluating it.
    start, end = raw_reply.find("["), raw_reply.rfind("]")
    if start == -1 or end < start:
        return []
    try:
        parsed = ast.literal_eval(raw_reply[start:end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    # Keep non-empty strings only; dict.fromkeys de-duplicates while preserving
    # first-seen order, so each entity is described (and billed) only once.
    names = (item.strip() for item in parsed if isinstance(item, str))
    return list(dict.fromkeys(name for name in names if name))


def knowledgeGraphModule(user_query):
    """Collect LLM-generated information about the entities in a user query.

    Entities are extracted with ``extractEntitiesFromQuery``; the reply is
    parsed tolerantly, so surrounding prose, code fences or an unparsable
    reply no longer raise. One ``callOpenAI`` call summarises the relations
    between the entities, followed by one call per entity for a short
    description.

    Args:
        user_query: The user's question.

    Returns:
        A dict with two keys: ``"relations"`` (the relation summary text) and
        ``"entities"`` (a dict mapping each entity name to its description).
        When no entity could be parsed, no further LLM call is made and
        ``{"relations": "", "entities": {}}`` is returned.
    """
    entity_names = _parseEntityList(extractEntitiesFromQuery(user_query))
    if not entity_names:
        # Nothing usable was extracted: skip the follow-up LLM calls.
        return {"relations": "", "entities": {}}

    prompt_summarise_entites = f"""
    Summarize all relations between all the entities : {entity_names}
    """
    relations_summary = callOpenAI(prompt_summarise_entites)

    # One description request per entity, issued in extraction order.
    descriptions = {
        entity: callOpenAI(f"Give me a short description 50 words of {entity}")
        for entity in entity_names
    }
    return {"relations": relations_summary, "entities": descriptions}


# Test usage
# A cell-local query name is used so that the shared `test_query` global is
# not overwritten for the cells that run after this one.
kg_test_query = "Tell me about sustainable energy and UNDP"
entities_dict = knowledgeGraphModule(kg_test_query)
print(entities_dict)

{'relations': 'The UNDP is involved in promoting sustainable energy.', 'entities': {'sustainable energy': 'Sustainable energy refers to the production and utilization of renewable resources that have minimal impact on the environment. It involves harnessing energy from sources like sunlight, wind, water, and geothermal heat, while minimizing greenhouse gas emissions and reducing dependence on finite fossil fuels.', 'UNDP': 'The United Nations Development Programme (UNDP) is a global organization that works to eradicate poverty, reduce inequalities, and promote sustainable development. It provides support to countries in areas such as governance, climate change, and crisis response, aiming to improve the lives of people and protect the planet.'}}


In [None]:
def semanticSearchModule(user_query):
    """Return document excerpts for ``user_query``, keyed by document code.

    Stub implementation: the query is ignored and two hard-coded example
    results are returned. In each result the "extract" value is the main
    part; the remaining keys are metadata.
    """
    # TODO: run semantic search over embeddings to extract best matches based on user query
    wind_energy_doc = dict(
        title="Wind energy and economic recovery",
        extract="Low Environmental Impact: Sustainable energy has a lower environmental impact compared to fossil fuels. It emits little to no greenhouse gases and pollutants, thus helping to combat climate change and reduce air pollution.",
        category="Research Article",
        link="https://windeurope.org/intelligence-platform/product/wind-energy-and-economic-recovery-in-europe/",
        thumbnail="https://windeurope.org/wp-content/uploads/flagship-report-2020-cover-thumbnail.jpg",
    )
    solar_solutions_doc = dict(
        title="Green Energy Solar Solutions by UNDP",
        extract="The shift towards sustainable energy is crucial for reducing our carbon footprint, mitigating the effects of climate change, and ensuring a stable and secure energy future",
        category="UNDP Publication",
        link="https://unsdg.un.org/resources/green-energy-solar-solutions-practice-note-business-operations-strategy",
        thumbnail="https://unsdg.un.org/sites/default/files/2021-12/Cover%20Solar%20Solutions%20Green%20Energy.jpg",
    )
    return {"test-doc-1": wind_energy_doc, "test-doc-2": solar_solutions_doc}

# Smoke test: fetch the example excerpts for the default query and show them
excerpts_dict = semanticSearchModule(test_query)
print(excerpts_dict)

In [45]:
def indicatorsModule(user_query):  # lower priority
    """Return values for the indicators identified as relevant to ``user_query``.

    Stub implementation: the query is ignored and two placeholder
    indicator-id -> value entries are returned.
    """
    # TODO: find relevant indicators based on user query and extract values
    placeholder_ids = ("indicator-id-1", "indicator-id-2")
    return {indicator_id: f"value from {indicator_id}" for indicator_id in placeholder_ids}

# Smoke test: fetch the placeholder indicator values for the default query
indicators_dict = indicatorsModule(test_query)
print(indicators_dict)

{'indicator-id-1': 'value from indicator-id-1', 'indicator-id-2': 'value from indicator-id-2'}


In [41]:
def queryIdeationModule(user_query):  # lower priority
    """Return a list of follow-up query ideas for ``user_query``.

    Stub implementation: the query is ignored and three placeholder ideas are
    returned.
    """
    # TODO: generate list of prompt ideas based on user query
    return [f"prompt idea {number}" for number in range(1, 4)]

# Smoke test: generate the placeholder query ideas for the default query
query_idea_list = queryIdeationModule(test_query)
print(query_idea_list)

['prompt idea 1', 'prompt idea 2', 'prompt idea 3']


<h3> synthesis module </h3>

In [42]:
def synthesisModule(user_query, llm_instructions, entities_dict, excerpts_dict, indicators_dict):
    """Synthesize an answer using a retrieval augmented generation approach.

    Stub implementation: all arguments are currently ignored and a fixed
    placeholder answer is returned.
    """
    # TODO: synthesize data into structure within llm prompt engineering instructions
    return "test answer"

## to test this, run the full pipeline with the handleApiCall function

<h3> run pipeline </h3>

In [43]:
def handleApiCall(user_query):
    """Run the full pipeline for ``user_query`` and build the API response.

    Calls the processing modules one after another (they are independent of
    each other, but this implementation runs them sequentially), passes their
    results to ``synthesisModule``, and packages everything into a dict.

    Returns:
        A dict with the keys ``user_query``, ``answer``, ``sources`` (the
        excerpts), ``query_ideas`` and ``entities`` (the entity names only).
    """
    # Retrieval / processing stage
    instructions = promptEngineeringModule(user_query)
    kg_info = knowledgeGraphModule(user_query)
    sources = semanticSearchModule(user_query)
    indicators = indicatorsModule(user_query)  # lower priority
    ideas = queryIdeationModule(user_query)  # lower priority

    # Synthesis stage
    synthesized_answer = synthesisModule(user_query, instructions, kg_info, sources, indicators)

    # Only the entity names are exposed; descriptions and relations stay internal
    entity_names = list(kg_info["entities"].keys())

    return {
        "user_query": user_query,
        "answer": synthesized_answer,
        "sources": sources,
        "query_ideas": ideas,
        "entities": entity_names,
    }

# Smoke test: run the whole pipeline for the default query and show the response
response = handleApiCall(test_query)
print(response)

{'user_query': 'What are the sustainable energy priorities for UNDP?', 'answer': 'test answer', 'sources': {'test-doc-1': {'title': 'Wind energy and economic recovery', 'extract': 'Low Environmental Impact: Sustainable energy has a lower environmental impact compared to fossil fuels. It emits little to no greenhouse gases and pollutants, thus helping to combat climate change and reduce air pollution.', 'category': 'Research Article', 'link': 'https://windeurope.org/intelligence-platform/product/wind-energy-and-economic-recovery-in-europe/', 'thumbnail': 'https://windeurope.org/wp-content/uploads/flagship-report-2020-cover-thumbnail.jpg'}, 'test-doc-2': {'title': 'Green Energy Solar Solutions by UNDP', 'extract': 'The shift towards sustainable energy is crucial for reducing our carbon footprint, mitigating the effects of climate change, and ensuring a stable and secure energy future', 'category': 'UNDP Publication', 'link': 'https://unsdg.un.org/resources/green-energy-solar-solutions-pr

<h3>testing</h3>

In [44]:
## Next step: develop automated testing for all modules.
## Iterate through the test queries and build automated tests to score results.

# Load the testing dataset of queries and their expected results
test_queries_df = pd.read_csv("../testing/energy_ai_test_dataset_v0.csv")

