In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os
import ast
from openai import AzureOpenAI
import faiss
import numpy as np
import json

### Load environment files

In [2]:
# Load environment variables
# Must run before the configuration cell below, which reads its settings
# (API key, endpoint, API version, embedding engine) through os.getenv().
load_dotenv()

True

In [3]:
# OpenAI API configuration
# Two access paths are configured from the same environment variables:
#   1. module-level `openai.*` settings, used by the `openai.chat.completions.create`
#      calls in callOpenAI / get_answer;
#   2. an explicit `AzureOpenAI` client, used by search_embeddings for embeddings.
# NOTE(review): `AzureOpenAI` and `openai.chat.completions` suggest the openai>=1.0 SDK,
# where the module-level Azure endpoint setting is presumably `openai.azure_endpoint`
# rather than `openai.api_base` — confirm against the installed SDK version.
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Name of the chat deployment passed as `model=` in the chat-completion calls.
openai_deployment = "sdgi-gpt-35-turbo-16k"


# Explicit Azure client (same key / version / endpoint as above).
client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)

# Embedding model/deployment name used to embed user queries in search_embeddings.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


<h3>globals</h3>

In [4]:
# Sample query for manual testing; later test cells reassign `test_query`
# before calling the modules, so this initial value is only a default.
test_query="What are the sustainable energy priorities for UNDP?"


<h3> helper functions </h3>

In [5]:
# Helper for simple, single-turn OpenAI chat calls
def callOpenAI(prompt):
    """Send `prompt` as a single user message and return the reply text.

    The request is made through the module-level `openai` settings, against the
    `openai_deployment` model, with temperature 0.

    Args:
        prompt: Text used as the content of the only (user) message.

    Returns:
        The `message.content` of the first choice of the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=[user_message],
    )
    return completion.choices[0].message.content


<h3> processing modules </h3>

In [6]:
def extractEntitiesFromQuery(user_query):
    """Ask the LLM to list the entities mentioned in `user_query`.

    Args:
        user_query: The user's question, embedded verbatim in the prompt.

    Returns:
        The raw text returned by `callOpenAI`. The prompt asks for a single-line
        array such as ['entity1','entity2'], but the reply is returned unparsed.
    """
    extraction_prompt = f"""
    Extract entities from the following user query: \"{user_query}\" and return output in array format.
    
    -Entities should be directly related to the domain or topic of interest. They should represent important concepts that contribute to the understanding of the subject matter.
    -Each entity in the knowledge graph should be distinct and have a unique identifier. This ensures clarity and avoids ambiguity when establishing relationships between entities.
    -You Must return output in array format e.g  ['entity1','entity2'] !!!
    -Avoid adding new lines or breaking spaces to your output. Array should be single dimension and single line !!!
 
    """
    return callOpenAI(extraction_prompt)

# Test usage
# Smoke test: reassigns `test_query`, calls the model through
# extractEntitiesFromQuery and prints the raw (string) reply.
test_query = "What are the sustainable energy for UNDP?"
entity_list = extractEntitiesFromQuery(test_query)
print(entity_list)

['sustainable energy', 'UNDP']


In [7]:
## module to get information on the entities from user query using the KG
def _parse_entity_list(raw_entities):
    """Best-effort conversion of an LLM reply into a flat list of entity strings.

    The model is asked for a one-line Python-style array, but replies are free text:
    they may carry leading/trailing prose, use JSON double quotes, or contain an
    unescaped apostrophe (e.g. 'UNDP's work') that makes the literal unparsable.
    Instead of letting `ast.literal_eval` raise, this tries progressively looser
    strategies and always returns a list (possibly empty).
    """
    if raw_entities is None or isinstance(raw_entities, str):
        text = (raw_entities or "").strip()
        # Keep only the outermost [...] so surrounding prose does not break parsing.
        start, end = text.find("["), text.rfind("]")
        if start != -1 and end > start:
            text = text[start:end + 1]

        parsed = None
        for parser in (ast.literal_eval, json.loads):
            try:
                parsed = parser(text)
                break
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue

        if parsed is None:
            # Last resort: treat the reply as a comma-separated list of quoted names.
            parsed = [chunk.strip().strip("'\"") for chunk in text.strip("[]").split(",")]
    else:
        parsed = raw_entities

    # A bare scalar (single string, number, ...) is treated as a one-element list.
    if isinstance(parsed, str) or not hasattr(parsed, "__iter__"):
        parsed = [parsed]

    # Entities become dict keys and prompt fragments below, so normalise to
    # non-empty strings.
    return [str(item).strip() for item in parsed if item is not None and str(item).strip()]


def knowledgeGraphModule(user_query):
    """Build a small entity/relation summary for `user_query` using LLM calls.

    Steps: extract the entities from the query, ask the model to summarise the
    relations between them, then ask for a short description of each entity.

    Args:
        user_query: The user's question.

    Returns:
        dict with two keys:
          - "relations": model text summarising relations between all entities;
          - "entities": dict mapping each entity name to its model-written description.
    """
    # generate list of entities based on user query
    entity_list = extractEntitiesFromQuery(user_query)
    # Tolerant parsing: a malformed model reply no longer aborts the whole pipeline.
    my_list = _parse_entity_list(entity_list)
    prompt_summarise_entites = f"""
    Summarize all relations between all the entities : {my_list}
    """
    summarise_entities = callOpenAI(prompt_summarise_entites)
    # Initialize the result with the relation summary and an empty entity map
    entities_dict = {
        "relations": summarise_entities,
        "entities": {}
    }
    # One model call per entity to fetch its short description
    for entity in my_list:
        prompt = f"Give me a short description 50 words of {entity}"
        entity_info = callOpenAI(prompt)
        entities_dict["entities"][entity] = entity_info

    return entities_dict


# Test usage
# Smoke test (live model calls). Note: this reassigns `test_query`, and the
# later test cells that pass `test_query` to other modules pick up this value.
test_query = "what is the major work on UNDP in Afganistan?"
entities_dict = knowledgeGraphModule(test_query)
print(entities_dict)

{'relations': 'The relation between UNDP and Afghanistan is that UNDP works in Afghanistan to support development projects and initiatives.', 'entities': {'UNDP': 'The United Nations Development Programme (UNDP) is a global organization that works to eradicate poverty, reduce inequalities, and promote sustainable development. It provides support to countries in areas such as governance, climate change, and crisis response, aiming to improve the lives of people and protect the planet.', 'Afghanistan': 'Afghanistan is a landlocked country located in Central Asia, bordered by Iran, Pakistan, Turkmenistan, Uzbekistan, Tajikistan, and China. It is known for its rugged mountainous terrain, rich cultural heritage, and complex history. Despite facing numerous challenges, Afghanistan is home to resilient people, diverse ethnic groups, and a vibrant blend of traditions.'}}


In [8]:
def search_embeddings(user_query, embeddings_path='../models/df_embed_EN.pkl', top_k=5):
    """Embed `user_query` and find its nearest document embeddings with FAISS.

    Args:
        user_query: Text to embed with `embedding_model` via `client`.
        embeddings_path: Pickled DataFrame holding one vector per row in an
            'Embedding' column. Defaults to the path previously hard-coded.
        top_k: Maximum number of neighbours to return (previously hard-coded to 5).

    Returns:
        (df, distances, indices): the loaded DataFrame plus the two arrays returned
        by `index.search`. `indices` holds row *positions* in `df` (the index is
        built from the rows in order), best match first. For an empty DataFrame the
        two arrays are empty with shape (1, 0) and no embedding call is made.

    Notes:
        The frame is re-read and the index rebuilt on every call; cache them if
        this becomes a hot path.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code — only load trusted files.
    df_filtered = pd.read_pickle(embeddings_path)

    # Use the real row count (the original measured `df.head()`, i.e. at most 5 rows).
    row_count = len(df_filtered)
    if row_count == 0:
        # Nothing to search; avoid IndexError on `.shape[1]` of an empty array.
        return (
            df_filtered,
            np.empty((1, 0), dtype=np.float32),
            np.empty((1, 0), dtype=np.int64),
        )

    # FAISS works on contiguous float32 matrices; embeddings stored as Python
    # lists would otherwise become float64.
    filtered_embeddings_arrays = np.ascontiguousarray(
        np.array(list(df_filtered['Embedding'])), dtype=np.float32
    )
    index = faiss.IndexFlatIP(filtered_embeddings_arrays.shape[1])
    index.add(filtered_embeddings_arrays)

    user_query_embedding = client.embeddings.create(
        input=user_query, model=embedding_model
    ).data[0].embedding
    query_vector = np.asarray([user_query_embedding], dtype=np.float32)

    k = min(top_k, row_count)
    distances, indices = index.search(query_vector, k)
    return df_filtered, distances, indices

In [9]:
def get_answer(user_question, content):
    """Answer `user_question` from the supplied `content` via a chat completion.

    A system message restricts the model to the provided context; the question
    and the context are then sent as two separate user messages. The request
    uses the `openai_deployment` model with temperature 0.2.

    Args:
        user_question: The question to answer.
        content: Context text (document excerpts) the answer must come from.

    Returns:
        The `message.content` of the first choice of the completion.
    """
    system_prompt = "You are a system that answers user questions based on excerpts from PDF documents provided for context. Only answer if the answer can be found in the provided context. Do not make up the answer; if you cannot find the answer, say so."
    conversation = [{'role': 'system', 'content': system_prompt}]
    for user_text in (user_question, content):
        conversation.append({'role': 'user', 'content': user_text})

    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0.2,
        messages=conversation,
    )
    return completion.choices[0].message.content
  

In [10]:
def map_to_structure(qs, max_results=10, max_words=160):
    """Turn a search result tuple into the `{doc-id: document info}` response dict.

    Args:
        qs: Tuple whose first item is the documents DataFrame (columns used:
            "Document Title", "Content", "Category", "Link"). When a third item
            is present it is taken to be the search hit positions as returned by
            `search_embeddings` (row positions in the DataFrame, best match first).
        max_results: Maximum number of documents to include (previously fixed at 10).
        max_words: Maximum number of whitespace-separated words kept in each
            extract (previously fixed at 160).

    Returns:
        dict keyed by "doc-<index label + 1>", in ranked order, each value being
        a dict with "title", "extract", "category", "link" and an empty "thumbnail".

    Fixes over the previous version:
        - The search hits are now honoured. Before, the hit positions were ignored
          and the first rows of the whole DataFrame were returned for every query.
        - Missing/non-text content (e.g. NaN) yields an empty extract instead of
          raising AttributeError on `.split()`.
    """
    # Extract the DataFrame (and, when available, the hit positions) from the tuple
    dataframe = qs[0]
    hit_positions = qs[2] if len(qs) > 2 else None

    if hit_positions is not None:
        # Flatten (n_queries, k) to 1-D and drop out-of-range slots (FAISS fills
        # missing neighbours with -1).
        positions = [
            int(pos)
            for pos in np.asarray(hit_positions).ravel()
            if 0 <= pos < len(dataframe)
        ]
        ranked_rows = dataframe.iloc[positions]
    else:
        # No ranking supplied: fall back to the frame's own row order.
        ranked_rows = dataframe

    result_dict = {}
    for index, row in ranked_rows.head(max_results).iterrows():
        # Unique identifier for each document, derived from the row's index label
        document_id = f"doc-{index + 1}"

        # Keep only the first `max_words` words; tolerate NaN / non-string content
        raw_content = row["Content"]
        if isinstance(raw_content, str):
            content = ' '.join(raw_content.split()[:max_words])
        else:
            content = ""

        result_dict[document_id] = {
            "title": row["Document Title"],
            "extract": content,
            "category": row["Category"],
            "link": row["Link"],
            "thumbnail": ''
        }

    return result_dict


In [11]:
## Semantic-search module: returns document excerpts keyed by document id

def semanticSearchModule(user_query):
    """Run the embedding search for `user_query` and shape the hits for the API.

    Args:
        user_query: The user's question.

    Returns:
        Whatever `map_to_structure` builds from the `(df, distances, indices)`
        tuple returned by `search_embeddings`.
    """
    return map_to_structure(search_embeddings(user_query))

# Test usage: run the semantic search on the current `test_query`
excerpts_dict=semanticSearchModule(test_query)
# print(excerpts_dict)


#Return top 10-20 most related 
# Define the filename to save the JSON data -  can remove later
json_filename = "outputs/excerpts.json"

# Create the output folder if needed so a fresh checkout does not fail with
# FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(json_filename), exist_ok=True)

# Save excerpts_dict to the JSON file just for a better preview
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(excerpts_dict, json_file, ensure_ascii=False, indent=4)

print(f"Excerpts saved to {json_filename}")

Excerpts saved to outputs/excerpts.json


In [12]:
## Indicators module: data for indicators relevant to the user query (placeholder)

def indicatorsModule(user_query):  # lower priority
    """Return indicator values relevant to `user_query`.

    Currently a stub: `user_query` is not used and a fixed two-entry
    placeholder dict is returned on every call.

    Args:
        user_query: The user's question (ignored for now).

    Returns:
        dict mapping indicator ids to their (placeholder) values.
    """
    # Temporary hard-coded payload until real indicator lookup is implemented
    return {
        "indicator-id-1": "value from indicator-id-1",
        "indicator-id-2": "value from indicator-id-2",
    }

#test usage
# Prints the placeholder dict; the value of `test_query` does not affect it.
indicators_dict=indicatorsModule(test_query)
print(indicators_dict)

{'indicator-id-1': 'value from indicator-id-1', 'indicator-id-2': 'value from indicator-id-2'}


In [13]:
## Query-ideation module: suggests follow-up queries

def queryIdeationModule(user_query):  # lower priority
    """Ask the LLM for related query ideas based on `user_query`.

    Args:
        user_query: The user's question, embedded verbatim in the prompt.

    Returns:
        The raw text returned by `callOpenAI`. The prompt requests a single-line
        array such as ['idea 1', 'idea2'], but the reply is returned unparsed
        (a string, not a list).
    """
    ideation_prompt = f"""Generate query ideas based on the user query: {user_query}
    
    -You Must return output in array format e.g ['idea 1', 'idea2'] !!!
    -Avoid adding new lines or breaking spaces to your output. Array should be single dimension and single line !!!
    
    """
    return callOpenAI(ideation_prompt)


#test usage
# Live model call; despite the variable name, the value printed is the raw
# string returned by queryIdeationModule, not a parsed list.
query_idea_list=queryIdeationModule(test_query)
print(query_idea_list)

['What are the main projects of UNDP in Afghanistan?', 'What is the role of UNDP in Afghanistan?', 'What are the achievements of UNDP in Afghanistan?', 'How does UNDP contribute to development in Afghanistan?', 'What is the impact of UNDP's work in Afghanistan?', 'What are the focus areas of UNDP in Afghanistan?', 'What is the budget allocation for UNDP projects in Afghanistan?', 'What are the challenges faced by UNDP in Afghanistan?', 'How does UNDP collaborate with the Afghan government and other stakeholders?', 'What is the future plan of UNDP in Afghanistan?']


<h3> synthesis module </h3>

    llm_instructions="llm instruction template here, with placeholders for insertion of user query, excerpts, indicator data, and entity and relation info" 


In [30]:
# module to synthesize an answer using a retrieval-augmented generation approach

def synthesisModule(user_query, entities_dict, excerpts_dict, indicators_dict):
    """Compose the final answer from the retrieved material with one LLM call.

    The user query, the excerpts (as "Sources") and the entity/relation info are
    interpolated into an instruction prompt that asks for an HTML-formatted
    answer with numbered citation anchors.

    Args:
        user_query: The user's question.
        entities_dict: Entity and relation info; inserted into the prompt as-is.
        excerpts_dict: Source excerpts; inserted into the prompt as-is.
        indicators_dict: Accepted for interface compatibility; not used in the prompt.

    Returns:
        The text returned by `callOpenAI` for the assembled prompt.
    """
    # Prompt template with the query, sources and entity info filled in
    synthesis_prompt = f"""
    Ignore previous commands!!!
    Given a user query, use the provided excerpts, Sources, and entity and relation info to
    provide the correct answer to the user's query
    
    User Query: {user_query}
    
    Sources: {excerpts_dict}
    
    Entity and Relation info: {entities_dict}

    - Answer output must be properly formatted using HTML. 
    - Don't include <html>, <script>, <link> or <body> tags. Only text formating tags should be allowed. e.g h1..h3, p, anchor, etc.
    - Make sure to Include citations based on the Sources. e.g Text excerpt here<a data-id='test-doc-1'>[1]</a> when referencing a document in the sources. using 1 ...nth
    - The citations anchor should be near the excerpt not following each other.
    - Use the anchor tag for the citation links and should link to the document link. for example Undp operates in afganistan <a data-id='test-doc-1'>[1]</a>. UNDP offers health relationships <a data-id='test-doc-2'>[2]</a>.
    - The text in the anchor tag should be citation number not document title.
    - You can reference multitple citations based sources
    """
    # Single model call produces the synthesized answer
    return callOpenAI(synthesis_prompt)

## to test this, run the full pipeline with the handleApiCall function

<h3> run pipeline </h3>

In [31]:
# full pipeline: retrieval, synthesis of the answer to the user query, and structuring of the API response

def handleApiCall(user_query):
    """Run every module for `user_query` and assemble the API response.

    The retrieval modules are called one after another (knowledge graph,
    semantic search, indicators, query ideation), then the synthesis module
    produces the answer from their results.

    Args:
        user_query: The user's question.

    Returns:
        dict with keys "user_query", "answer", "sources" (excerpts dict),
        "query_ideas" (as returned by queryIdeationModule) and "entities"
        (list of the entity names found by knowledgeGraphModule).
    """
    # Processing modules
    kg_result = knowledgeGraphModule(user_query)
    sources = semanticSearchModule(user_query)
    indicator_values = indicatorsModule(user_query)  # lower priority
    ideas = queryIdeationModule(user_query)  # lower priority

    # Synthesis module
    synthesized_answer = synthesisModule(user_query, kg_result, sources, indicator_values)

    # Only the entity names are exposed, not their descriptions
    entity_names = [name for name in kg_result["entities"].keys()]

    return {
        "user_query": user_query,
        "answer": synthesized_answer,
        "sources": sources,
        "query_ideas": ideas,
        "entities": entity_names,
    }

# test usage: run the full pipeline on the current `test_query`
response=handleApiCall(test_query) 
# Define the filename to save the JSON data -  can remove later
json_filename = "outputs/synthesis_output.json"

# Create the output folder if needed so a fresh checkout does not fail with
# FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(json_filename), exist_ok=True)

# Save the pipeline response to the JSON file just for a better preview
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(response, json_file, ensure_ascii=False, indent=4)

print(f"Synthesis saved to {json_filename}")

Synthesis saved to outputs/synthesis_output.json


<h3>testing</h3>

In [16]:
## next step, develop automated testing for all modules
## iterate through test_queries and build automated tests to score results

# open testing dataset with queries and expected results
# (only loaded here; nothing in this notebook consumes `test_queries_df` yet)
test_queries_df=pd.read_csv("../testing/energy_ai_test_dataset_v0.csv")



  # TODO::: 
  ##### Add citation prompt to the synthesis module. -done 
  ##### Convert notebook to flask API script. main.py - done
  ##### Refactor PDF -> txt pipeline 

  