In [74]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os
import ast
from openai import AzureOpenAI
import faiss
import numpy as np
import json
import spacy
nlp = spacy.load("en_core_web_sm")
import pycountry
import re
from bert_score import score as bert_score
import csv
import transformers
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import re
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS
from datetime import datetime
import tiktoken
from scipy import spatial  # for calculating vector similarities for search
from country_named_entity_recognition import find_countries
import awoc
from bs4 import BeautifulSoup





### Load Model files

In [75]:
df = pd.read_pickle('../models/df_embed_EN_All_V4.pkl')

# # Filter DataFrame for rows where Country Code is 'THA'
# df_tha = df[df['Country Code'] == 'China']

# # Display the first 10 rows of the filtered DataFrame
# print(df_tha.head(10))


### Load Enviroment files

In [76]:
# Load environment variables
load_dotenv()

True

In [77]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"
encoding = tiktoken.get_encoding('cl100k_base')


client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)

embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

# print(openai.api_key)
# print(openai.api_base)
# print(openai.api_version)


<h3>globals</h3>

In [78]:
# test_query="What are the sustainable energy priorities for UNDP?"
test_query = 'What is the Human Development Index (HDI) value for Albania as mentioned in the document?'


<h3> helper functions </h3>

In [79]:
# use this function to make simple openAI Calls
def callOpenAI(prompt):  
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
    response = response_entities.choices[0].message.content
    return response


<h3> processing modules </h3>

In [80]:
def extractEntitiesFromQuery(user_query):
    prompt = f"""
    Extract entities from the following user query: \"{user_query}\" and return output in array format.
    
    -Entities should be directly related to the domain or topic of interest. They should represent important concepts that contribute to the understanding of the subject matter.
    -Each entity in the knowledge graph should be distinct and have a unique identifier. This ensures clarity and avoids ambiguity when establishing relationships between entities.
    -You Must return output in array format e.g  ['entity1','entity2'] !!!
    -Avoid adding new lines or breaking spaces to your output. Array should be single dimension and single line !!!
 
    """
    entity_list = callOpenAI(prompt)   
    return entity_list

# Test usage
# test_query = "What are the sustainable energy for UNDP?"
# entity_list = extractEntitiesFromQuery(test_query)
# print(entity_list)

In [81]:
## module to get information on the entities from user query using the KG
def knowledgeGraphModule(user_query):
    
    # generate list of entities based on user query
    entity_list = extractEntitiesFromQuery(user_query)
    my_list = ast.literal_eval(entity_list)
    prompt_summarise_entites = f"""
    Summarize all relations between all the entities : {my_list}
    """
    summarise_entities = callOpenAI(prompt_summarise_entites)
    # Initialize an empty dictionary to store information
    entities_dict = {
        "relations": summarise_entities,
        "entities": {}
    }
    # Loop through each entity in the list
    for entity in my_list:
        # Fetch information about the entity from your knowledge graph
        prompt = f"Give me a short description 50 words of {entity}"
        entity_info = callOpenAI(prompt)
        # Add the entity information to the dictionary
        entities_dict["entities"][entity] = entity_info
    
    return entities_dict


# Test usage
# test_query = "What is the role of Paris Agreement in sustainable energy development?"
# entities_dict = knowledgeGraphModule(test_query)
# print(entities_dict)

In [83]:
from awoc import AWOC

 
 
def find_mentioned_countries(text):
    countries = set()
    
    # Tokenize the text using regular expressions to preserve punctuation marks
    words = re.findall(r'\w+|[^\w\s]', text)
    text = ' '.join(words)  # Join the tokens back into a string
    
    # Get a list of all country names
    all_countries = {country.name: country for country in pycountry.countries}
    
    # Check for multi-word country names first to avoid partial matches
    for name in sorted(all_countries.keys(), key=lambda x: len(x), reverse=True):
        if name in text:
            countries.add(all_countries[name].name)
            text = text.replace(name, '')  # Remove the found country name from the text to avoid duplicates

    return list(countries)


# Extract mentioned countries' ISO3 code
def find_mentioned_country_code(user_query):
    countries = set()
    extracted_countries = find_mentioned_countries(user_query)
    
    for country in extracted_countries:
        try:
            country_info = pycountry.countries.get(name=country)
            if country_info:
                countries.add(country_info.alpha_3)
        except LookupError:
            pass
    
    # If no countries are found, check for continent mentions
    if not countries:
        words = re.findall(r'\w+|[^\w\s]', user_query)
        text = ' '.join(words)  # Join the tokens back into a string
    
        world_info = AWOC()
        all_continents = set([continent.lower() for continent in world_info.get_continents_list()])
        for word in text.split():
            word = word.lower()
            if word in all_continents:
                target_countries = world_info.get_countries_list_of(word)
                
                for country in target_countries:
                    countries.add(world_info.get_country_data(country)['ISO3'])
    
    return countries


# # Example 
# user_query = 'Could you clarify how UNDP ensures financial transparency and accountability in its large-scale solar energy projects in Africa ?'
# mentioned_countries = find_mentioned_country_code(user_query)
# mentioned_countries

In [84]:

# Load the English language model
# Function to calculate the average word embedding for a sentence
# def average_word_embedding(sentence):
#     # Parse the sentence using SpaCy
#     doc = nlp(sentence)
#     # Get word vectors and average them
#     word_vectors = [token.vector for token in doc if token.has_vector]
#     if not word_vectors:
#         return None
#     return np.mean(word_vectors, axis=0)


def average_word_embedding(sentence):
    if sentence is None:
        sentence = ""
    
    # Parse the sentence using SpaCy
    doc = nlp(sentence)
    
    # Get word vectors and average them
    vectors = [token.vector for token in doc if token.has_vector]
    if not vectors:
        return None
    
    avg_vector = sum(vectors) / len(vectors)
    return avg_vector

# Function to calculate context similarity between two sentences using word embedding averaging
def calculate_context_similarity(sentence1, sentence2):
    # Get average word embeddings for each sentence
    avg_embedding1 = average_word_embedding(sentence1)
    avg_embedding2 = average_word_embedding(sentence2)
    if avg_embedding1 is None or avg_embedding2 is None:
        return None
    # Calculate cosine similarity between the embeddings
    similarity = cosine_similarity([avg_embedding1], [avg_embedding2])[0][0]
    return similarity

# # Example sentences
# sentence1 = 'The companys quarterly earnings report exceeded expectations, leading to a surge in stock prices.'
# sentence2 = 'The firms financial results for the last quarter surpassed predictions, resulting in a sharp rise in the value of shares'



# Calculate context similarity
# similarity = calculate_context_similarity(sentence1, sentence2)
# print("Context similarity:", similarity)




In [85]:
# Function to calculate Jaccard similarity between two texts
def jaccard_similarity(text1, text2):
    # Tokenize texts
    tokens1 = set(text1.lower().split())
    tokens2 = set(text2.lower().split())
    
    # Calculate Jaccard similarity
    intersection = len(tokens1.intersection(tokens2))
    union = len(tokens1.union(tokens2))
    
    return intersection / union if union > 0 else 0


In [86]:
def title_contains_entity(entity, title):
    # Convert both entity and title to lowercase for case-insensitive comparison
    entity_lower = entity.lower()
    title_lower = title.lower()

    # Check if the lowercase entity is contained within the lowercase title
    if entity_lower in title_lower:
        return 1
    else:
        return 0

In [91]:
# def filter_semantics_spacy(user_query):
#     doc = nlp(user_query)
#     entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ != ""]  # Filter out empty entities
#     entities.extend((token.text, "NOUN") for token in doc if token.pos_ in ["NOUN", "PROPN", "PRON", "NUM", "SYM", "X", "ABBR"] or token.is_alpha)

#     # Remove stop words
#     entities = [(entity, label) for entity, label in entities if entity.lower() not in STOP_WORDS]

#     # Initialize empty DataFrames
#     filtered_df_country = pd.DataFrame()
#     filtered_df_others = pd.DataFrame()
#     filtered_df_others_title = pd.DataFrame()

#     filtered_df_backup_reference = pd.DataFrame()
#     allow_low = True

#     for entity, label in entities:
       
#         filtered_df_others = pd.concat([filtered_df_others, df[df['Country Name'].str.lower().str.contains(entity.lower(), na=False)]])
#         filtered_df_others_title = pd.concat([filtered_df_others_title, df[df['Document Title'].str.lower().str.contains(entity.lower(), na=False)]])

#         # Calculate similarity scores for each document title and country name
#         similarity_scores_country = []
#         similarity_scores_title = []
#         document_titles = []

#         for index, row in filtered_df_others.iterrows():
#             country_name = row['Country Name']
#             document_title = row['Document Title']

#             if country_name is not None:
                
#                 similarity_score_country = calculate_context_similarity(user_query, country_name)
#                 similarity_scores_country.append(similarity_score_country)
#             else:
#                 similarity_scores_country.append(0)

#             if document_title is not None:
#                 similarity_score_title = calculate_context_similarity(user_query, document_title)
#                 similarity_scores_title.append(similarity_score_title)
#             else:
#                 similarity_scores_title.append(0)

#             document_titles.append(document_title)
        
#         similarity_df = pd.DataFrame({
#             'Country Name': filtered_df_others['Country Name'],
#             'Document Title': document_titles,
#             'Similarity Score Country': similarity_scores_country,
#             'Similarity Score Title': similarity_scores_title
#         })

#         # Define thresholds
#         threshold_country = 0.5
#         threshold_title = 0.5

#         # Filter df based on similarity scores greater than threshold
#         filtered_df_others = df[
#             df['Country Name'].isin(similarity_df[similarity_df['Similarity Score Country'] > threshold_country]['Country Name']) &
#             df['Document Title'].isin(similarity_df[similarity_df['Similarity Score Title'] > threshold_title]['Document Title'])
#         ]

#         filtered_df_backup_reference = pd.concat([filtered_df_backup_reference, df[
#             df['Country Name'].isin(similarity_df[(similarity_df['Similarity Score Country'] >= 0.1) & (similarity_df['Similarity Score Country'] < threshold_country)]['Country Name']) |
#             df['Document Title'].isin(similarity_df[(similarity_df['Similarity Score Title'] >= 0.1) & (similarity_df['Similarity Score Title'] < threshold_title)]['Document Title'])
#         ]])

#         # Check for location related e.g by country, language, locals
#         # if label in ['GPE', 'NORP', 'LANGUAGE', 'FAC']:
#         filtered_df_country_code = find_mentioned_country_code(user_query) #pd.concat([filtered_df_country, df[df['Country Name'] == entity]])
            
#         # filtered_df_country = df[df['Country Code'] == filtered_df_country_code]
#         filtered_df_country = filter_dataframe_by_country_names(df, filtered_df_country_code)

#         # print(f"""find_mentioned_country_code=== "{filtered_df_country_code}""")

#     merged_df = pd.DataFrame()
#     # if filtered_df_others.empty and filtered_df_country.empty:
#     #     print(f'on the reference df {filtered_df_backup_reference.empty}')
#     #     merged_df = pd.concat([filtered_df_backup_reference])
#     # else:
#     # merged_df = pd.concat([filtered_df_country, filtered_df_others, filtered_df_backup_reference,filtered_df_others_title])
#     merged_df = pd.concat([filtered_df_country])

#     return merged_df



In [108]:


def average_word_context_embed(sentence):
    # Ensure the input is a string
    if not isinstance(sentence, str):
        return None
    
    # If the sentence is empty, return None
    if not sentence:
        return None
    
    # Parse the sentence using SpaCy
    doc = nlp(sentence)
    
    # Get word vectors and average them
    vectors = [token.vector for token in doc if token.has_vector]
    if vectors:
        avg_vector = np.mean(vectors, axis=0)
        return avg_vector
    else:
        return None

def calculate_context_bool(uq, doc_):
    avg_emb1 = average_word_context_embed(uq)
    avg_emb2 = average_word_context_embed(doc_)
    if avg_emb1 is None or avg_emb2 is None:
        return False
    
    similarity = np.dot(avg_emb1, avg_emb2) / (np.linalg.norm(avg_emb1) * np.linalg.norm(avg_emb2))
    return similarity > 0.75  # Assuming 0.75 is the threshold for context similarity


# Function to convert country codes to country names
def convert_codes_to_names(codes):
    code_to_name = {country.alpha_3: country.name for country in pycountry.countries}
    return {code_to_name.get(code, code) for code in codes}



# Function to filter DataFrame based on country names
def filter_dataframe_by_country_names(df, filtered_country_cde):
    filtered_dfs = []
    country_names = convert_codes_to_names(filtered_country_cde)
    code_to_name = {country.alpha_3: country.name for country in pycountry.countries}
    
    for code in filtered_country_cde:
        country_name = code_to_name.get(code, None)
        if country_name:
            filtered_df = df[df['Country Code'] == code]
            filtered_df['Country Name'] = country_name
            filtered_dfs.append(filtered_df)
    
    if filtered_dfs:
        result_df = pd.concat(filtered_dfs, ignore_index=True)
    else:
        result_df = pd.DataFrame()  # Return empty DataFrame if no matches
    
    return result_df

#no spacy option - spacy entites gives alot of errors
def filter_semantics(user_query): 
    filtered_df_country = pd.DataFrame()
    filtered_df_title_context = pd.DataFrame()
    merged_df = pd.DataFrame()

    filtered_df_country_code = find_mentioned_country_code(user_query)
    filtered_df_country = filter_dataframe_by_country_names(df, filtered_df_country_code)

    
    filtered_df_title_context = df[df['Document Title'].notnull() & df['Document Title'].apply(lambda title: calculate_context_bool(user_query, title))]
    filtered_df_summary_context = df[df['Summary'].notnull() & df['Summary'].apply(lambda summary: calculate_context_bool(user_query, summary))]
    
    # Ensure both DataFrames have the same columns before concatenating
    if 'Country Name' not in filtered_df_title_context.columns:
        filtered_df_title_context['Country Name'] = np.nan
    if 'Country Name' not in filtered_df_summary_context.columns:
        filtered_df_summary_context['Country Name'] = np.nan
    
    # Merge the two filtered DataFrames
    merged_df = pd.concat([filtered_df_country, filtered_df_summary_context, filtered_df_title_context])
    return merged_df


# # Example 
# test_query="How do the challenges of implementing renewable energy projects in Asia compare to those in Latin America?"
# filtered_country = filter_semantics(test_query)
# filtered_country

In [114]:


def search_embeddings(user_query):
    # df_filtered = filter_semantics(user_query) if filter_semantics(user_query) is not None else None
    filtered_result = filter_semantics(user_query)
    # Check if the result is not None before assigning it to df_filtered
    df_filtered = filtered_result if filtered_result is not None else None

    if df_filtered is not None and not df_filtered.empty:  # Check if DataFrame is not None and not empty
        length = len(df_filtered.head())
        filtered_embeddings_arrays = np.array(list(df_filtered['Embedding']))
        index = faiss.IndexFlatIP(filtered_embeddings_arrays.shape[1]) 
        index.add(filtered_embeddings_arrays)
        
        user_query_embedding = client.embeddings.create( 
                input=user_query ,model= embedding_model
            ).data[0].embedding

        k = min(5, length)
        distances, indices = index.search(np.array([user_query_embedding]), k)

        # Extract the text excerpts
        text_excerpts = df_filtered.iloc[indices[0]]['Content'].tolist()
        print(f""" indices=== {indices}""")
        return df_filtered, distances, indices, text_excerpts
    else:
        return None, None, None, None


In [15]:
def map_to_structure(qs):
    result_dict = {}

    # Extract the DataFrame from the tuple
    dataframe = qs[0]
    # print(qs[1])
    # print(qs[2])
    # print(f""" text_excerpts== {qs[3]} """)

    # Counter to limit the loop to 10 iterations
    count = 0
    for index, row in dataframe.iterrows():
        # Define a unique identifier for each document, you can customize this based on your data
        document_id = f"doc-{index + 1}"
        # Handle NaN in content by using fillna
        content = row["Content"]
        content = ' '.join(row["Content"].split()[:160])
        # Create a dictionary for each document
        document_info = {
            "title": row["Document Title"],
            "extract": content or "",  # You may need to adjust this based on your column names
            "category": row["Category"],
            "link": row["Link"].replace("https-//","https://"),
            "summary": row["Summary"],
            "thumbnail": ''
        }
        # print(document_info)
        # Add the document to the result dictionary
        result_dict[document_id] = document_info

        # Increment the counter
        count += 1

        # # Break out of the loop if the counter reaches top 25
        if count == 10:
            break

    return result_dict

In [16]:
# Function to relabel keys and add citations
def relabel_and_add_citations(data):
    new_data = {}
    citation_counter = 1

    for doc_id, doc_info in data.items():
        new_data[doc_id] = {
            "document_title": doc_info.get("title", ""),
            "summary": doc_info.get("extract", ""),
            "document_category": doc_info.get("category", ""),
            "document_link": doc_info.get("link", ""),
            "document_thumbnail": doc_info.get("thumbnail", ""),

            "citation": citation_counter
        }
        citation_counter += 1

    return new_data

In [17]:
def extract_valid_json_objects(json_string):
    # Regex pattern to extract valid JSON objects
    pattern = re.compile(r'{.*?}', re.DOTALL)

    # Find all matches in the string
    matches = pattern.findall(json_string)

    # Parse each match to a dictionary
    parsed_data = []
    for match in matches:
        try:
            parsed_data.append(json.loads(match))
        except json.JSONDecodeError:
            continue
    
    return parsed_data

In [18]:
def check_links_and_process_html(html, content_dict):
    soup = BeautifulSoup(html, 'html.parser')
    
    for a in soup.find_all('a'):
        ref_text = a.get_text()
        if ref_text.startswith('[') and ref_text.endswith(']'):
            href = a.get('href')
            if not any(d['link'] == href for d in content_dict.values()):
                a.decompose()
    
    result = str(soup)
    return result

In [116]:

def process_queries(queries):
    merged_result_structure = {}

    for query in queries:
        qs = search_embeddings(query)  # Assuming search_embeddings returns a tuple (df, distances, indices)

        if qs[0] is not None:
            result_structure = map_to_structure(qs)
            for doc_id, doc_info in result_structure.items():
                merged_result_structure[doc_id] = doc_info
    
    return merged_result_structure


## module to extract text from documents and return the text and document codes
def semanticSearchModule(user_query):
    query_transformation = callOpenAI(f"""
    Given a question, your job is to break them into 3 main sub-question and return as array. 
    
    - You Must return output seperated by |
    - Avoid adding new lines or breaking spaces to your output and must seperate each idea with |

    QUESTION: {user_query}
    """)
    print(f""" query_transformation: {query_transformation} """)
    
    # Split the string by the delimiter '|'
    questions_array = [question.strip() for question in query_transformation.split('|')]


    merged_results = process_queries(questions_array)
    return merged_results



#test usage
excerpts_dict= semanticSearchModule("What exactly does UNDP mean by 'gender-inclusive' when discussing renewable energy solutions in Bangladesh?")
# # print(f"""excerpts_dict {excerpts_dict}""")

# #Return top 10-20 most related 
# # Define the filename to save the JSON data -  can remove later
json_filename = "outputs/excerpts.json"

# # Save excerpts_dict to the JSON file just for a better preview
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(excerpts_dict, json_file, ensure_ascii=False, indent=4)

print(f"Excerpts saved to {json_filename}")
         

 query_transformation: What does UNDP mean by 'gender-inclusive' when discussing renewable energy solutions in Bangladesh? | 
Why is UNDP emphasizing on gender inclusivity in renewable energy solutions in Bangladesh? | 
How does UNDP plan to implement gender-inclusive renewable energy solutions in Bangladesh? 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Country Name'] = country_name


 indices=== [[ 0  4  3 10  8]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Country Name'] = country_name


 indices=== [[ 0  4  3  8 10]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Country Name'] = country_name


 indices=== [[ 0 10  4  3 11]]
Excerpts saved to outputs/excerpts.json


In [117]:
## module to get data for specific indicators which are identified is relevant to the user query

def indicatorsModule(user_query): #lower priority
    
    # find relevant indicators based on uesr query and extract values
    indicators_dict={
        "indicator-id-1":"value from indicator-id-1",
        "indicator-id-2":"value from indicator-id-2"
    }#temp
    
    return indicators_dict

#test usage
indicators_dict=indicatorsModule(test_query)
print(indicators_dict)

{'indicator-id-1': 'value from indicator-id-1', 'indicator-id-2': 'value from indicator-id-2'}


In [118]:
## module to generate query ideas

def queryIdeationModule(user_query): # lower priority
    
    # Generate query ideas using OpenAI GPT-3
    prompt = f"""
    Ignore previous commands!!!
    Generate prompt ideas based on the user query: {user_query}

    
    -Prompt shoud not be answer to the user query but give other contextual ways of representing the user query !!!
    -You Must return output seperated by |  e.g idea 1 | idea2 
    - Each generated ideas should be very dinstinct but contextual. Not repeatitive or using same words
    - The query idea should be in a question form and not an answer form.
    -Avoid adding new lines or breaking spaces to your output and must seperate each idea with |
    """
    response = callOpenAI(prompt)
    return response

#test usage
# query_idea_list=queryIdeationModule(test_query)
# print(query_idea_list)

In [119]:
def get_answer(user_question, relevant_docs):
 
    formattings_html = f""" 
        Ignore previous
        Strictly follow the follow steps:
        Your output answer shoud be  in HTML syntax with HTML tags.
        Use HTML tags like < ul>, < ol>, < li>,  < strong>, < p>
        Only consider the inner part of the < body> tag.
        ALWAYS use the following tag for new lines: < br />
        Do not add CSS attributes.
        Include links and citations at all!!!
        Your final answer must be formatted in HTML format !!!

        - Only provide links in citations. Never link outside citations.
    """
    formattings = f""" 
        You can use relevant information in the docs to answer also: 

        DOCS: {relevant_docs}
        
       """
    messages = [
        {"role": "system", "content":f"""You are a helpful assistant and a professional researcher with many years of experience in answering questions. Give answer to the user's inquiry. {formattings_html}"""
        },
        {'role': 'user', 'content': f"""{formattings} 
                                        {user_question}
                                        
                                         {formattings_html}
                                        """},
    ]
       
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0.5,
                    messages=messages,
                    top_p=0.8,
                    frequency_penalty=0.6,
                    presence_penalty=0.8
                )
    response = response_entities.choices[0].message.content

    return response
  

<h3> synthesis module </h3>

    llm_instructions="llm instruction template here, with placeholders for insertion of user query, excerpts, indicator data, and entity and relation info" 


In [120]:

def synthesisModule(user_query, entities_dict, excerpts_dict, indicators_dict,prompt_formattings):
    
    # print(f""" ********************************************************************* {user_query} *********************************************************************  """)

    ###synthesize data into structure within llm prompt engineering instructions
    answer=get_answer(user_query,excerpts_dict) #callOpenAI
    answer_formated_fixed = answer.replace("\n\n","<br>").replace("\n","<br>")
    # answer_citation = add_citation(answer_formated_fixed,excerpts_dict)
    # answer_citation = answer_citation #.replace("\\","").replace("\n","")
    
    # print(f""" {answer} """)
    # print(f""" ********************************************************************* END *********************************************************************  """)

    return answer

## to test this, run the full pipeline with the handleApiCall function

In [121]:
#Cleanup outputs
# Parse the HTML content

# Function to remove [n] if not inside an <a> tag
def remove_unlinked_citations(soup):
    # Regular expression to match [n] pattern
    pattern = re.compile(r'\[\d+\]')
    
    for text in soup.find_all(text=pattern):
        # Find all matches in the text
        matches = pattern.findall(text)
        for match in matches:
            # Check if the match is inside an <a> tag
            if not text.find_parent('a'):
                # Remove the match from the text
                text.replace_with(text.replace(match, ''))
    
    return soup


def cleanCitation(html_content): 

    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove unlinked citations
    clean_soup = remove_unlinked_citations(soup)

    # Get the modified HTML content
    clean_html_content = str(clean_soup)

    return clean_html_content


<h3> run pipeline </h3>

In [122]:
# full pipeline with retreival, synthesis of answer to user query, and structure results into api response

def handleApiCall(user_query):
    
    ##run processing modules (in parallel)
    entities_dict=knowledgeGraphModule(user_query)
    excerpts_dict=semanticSearchModule(user_query)
    indicators_dict=indicatorsModule(user_query) ##lower priority
    query_idea_list=queryIdeationModule(user_query) ##lower priority
    prompt_formattings=""
    ##synthesis module
    answer=synthesisModule(user_query, entities_dict, excerpts_dict, indicators_dict, prompt_formattings)
    # print(f""" Answer==== {answer} """)
    pattern =  re.compile(r'[^.]*\.')  #re.compile(r'<li>(.*?)</li>')
    # Find all matches
    content_array = pattern.findall(answer)
    sources = excerpts_dict
    results = []
    # print(content_array)
    limiter = 0
    for element in content_array:
        for doc_id, doc_info in sources.items():
            title_similarity = calculate_context_similarity(element, doc_info['title']) or 0
            extract_similarity = calculate_context_similarity(element, doc_info['extract']) or  0
            # summary_similarity = calculate_context_similarity(element, doc_info['summary'])
            # print(title_similarity)
            # print(extract_similarity)
            if title_similarity > 0.65 and extract_similarity > 0.65 and limiter < 5:
                result = {
                            'element': element,
                            'title': doc_info['title'],
                            'extract': doc_info['extract'],
                            'extract': doc_info['extract'],
                            'link': doc_info['link'],
                            'doc_id': doc_id,
                            'title_similarity': float(title_similarity),
                            'extract_similarity': float(extract_similarity)
                            # 'summary_similarity': float(summary_similarity)
                        }
                results.append(result)
                limiter += 1

    # for result in results:
    #     citation_fixes = callOpenAI(f"Given the below: {result} Create an output that mixes Element, Document extract and Summary into one output while still maintaining the context of the Element. Your final output answer length should not be more than 200 words. Also avoid using links, sources and references. ")
    #     result['citation_fixes'] = citation_fixes
    #     result

 
    content = answer
    counter = 0
    # Loop through each JSON object and replace the element with citation_fixes in the content
    # for result in results:
    #     counter += 1
    #     print(f"Element: {result['element']}")
    #     print(f"Document ID: {result['doc_id']}")
    #     print(f"Document Title: {result['title']}")
    #     print(f"Title Similarity: {result['title_similarity']:.4f}")
    #     print(f"Extract Similarity: {result['extract_similarity']:.4f}")
    #     # print(f"Summary Similarity: {result['summary_similarity']:.4f}")

    #     print()

        # content = content.replace(result['element'], f""" {result['citation_fixes']} <a href='{result['link']}' data-id='{result['doc_id']}'>[{counter}]</a> <br/>\n\n""")
    
    #final cleanup using openAI
    cleanup_content = callOpenAI(f""" Ignore previous commands !!!
                                      Strictly follow the below:
                                      Give the sentence. I want to to fix the citation formatings only. Don't add any answer to it.
                                       1. make sure  links are all in a citation format [n] where n represent an integer and must link to the document e.g content<a href='url-here'>[1]</a>  !!!!
                                       2.  The citations must be numbered in an ordered manner. Fix and return the output. !!!
                                       3. remove all foot notes or references. !!! 
                                       4. The citations MUST BE LINK to the docs e.g <a href='url-here'>[1]</a>  never use without LINKS !!!
                                       5. Output should retains HTML formattings. Never adjust a ciation without it being an anchor link. !!!
                                       6. Remeber only format the answer citations. Don't add or remove any. !!!
                                       7. Don't generate any link or so. Just use the answer as it is and adjust the citations as instructed above
                                       SENTENCE: {content}  
                                """)
    cleanup_content = cleanup_content.replace("\n","")
    cleanup_content = cleanCitation(cleanup_content)
    print(f""" {cleanup_content}""")
    
    response={
        "user_query":user_query,
        "answer":f"""{cleanup_content}""",
        "sources":excerpts_dict,
        "query_ideas":query_idea_list,
        "entities":list(entities_dict["entities"].keys())       
    }
    return response


# test usage
test_query = "Could you clarify the role of UNDP in promoting solar energy solutions in urban areas of Thailand?"
response=handleApiCall(test_query) 
# Define the filename to save the JSON data -  can remove later
json_filename = "outputs/synthesis_output.json"

# Save excerpts_dict to the JSON file just for a better preview
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(response, json_file, ensure_ascii=False, indent=4)

print(f"Synthesis saved to {json_filename}")

 query_transformation: What is the role of UNDP in promoting solar energy solutions in urban areas of Thailand? | How does UNDP promote solar energy solutions in urban areas of Thailand? | What are the efforts made by UNDP to promote solar energy solutions in urban areas of Thailand? 
 indices=== [[ 7  9  1 12  6]]
 indices=== [[6 4 2 1 0]]
 <p>The United Nations Development Programme (UNDP) plays a crucial role in promoting solar energy solutions in urban areas of Thailand. Through various initiatives and partnerships, UNDP aims to accelerate the adoption of solar energy and contribute to sustainable development goals. Here are some key aspects of UNDP's involvement:</p><ol> <li><strong>Project Implementation:</strong> UNDP implements projects that focus on increasing access to solar energy in urban areas of Thailand. These projects involve the installation of solar panels on rooftops, public buildings, and community facilities. </li> <li><strong>Capacity Building:</strong> UNDP provi

  for text in soup.find_all(text=pattern):


<h1>Testing</h1>

<p>This sections contains all things testings </p>

In [None]:
# !pip install bert_score

In [None]:
def calculate_scores(csv_file):
    # Initialize an empty list to store processed entries
    result = []
    
    # Loop through each entry in the CSV file
    for entry in csv.DictReader(csv_file):
        query = entry['query']
        sample_answer = entry['sample_answer']
        
        # Call OpenAI for chat GPT answer
        chat_gpt_answer = callOpenAI(f""" 
                                    {query} 
                                    {prompt_formattings} 
                                    """)
        
        # Call the moonshot model API
        moonshot_model_answer = handleApiCall(query) 
        
        # Calculate BERT score for moonshot model answer
        P, F, R = bert_score([sample_answer], [moonshot_model_answer['answer']], lang='en', verbose=True)
        entry['moonshot_model_answer'] = moonshot_model_answer['answer']
        entry['bert_score'] = round(float(F), 2)

        # Calculate BERT score for chat GPT answer
        P, F, R = bert_score([sample_answer], [chat_gpt_answer], lang='en', verbose=True)
        entry['chat_gpt_answer'] = chat_gpt_answer
        entry['bert_score_gpt'] = round(float(F), 2)
        
        # Append the processed entry to the result list
        result.append(entry)
    
    # Return the list of processed entries
    return result


# Specify the path to your CSV file
csv_file_path = "../testing/queries.csv"

# Open the CSV file for reading
with open(csv_file_path, mode='r') as file:
    # Pass the file object to the function
    result = calculate_scores(file)

# Print updated data with scores
# print(json.dumps(result, indent=4))

# Save updated data to a JSON file
with open('../testing/test_output.json', 'w') as file:
    json.dump(result, file, indent=4)


<h1>Compare Moonshot BERT score to GPT BERT Score</h1>

In [None]:
# Load the JSON file
with open('../testing/test_output.json', 'r') as file:
    data = json.load(file)

# Initialize variables to store total scores and count of items
total_bert_score = 0
total_bert_score_gpt = 0
count = 0

# Iterate through each item in the JSON data
for item in data:
    # Extract bert_score and bert_score_gpt from the current item
    bert_score = item['bert_score']
    bert_score_gpt = item['bert_score_gpt']
    
    # Add the scores to the total
    total_bert_score += bert_score
    total_bert_score_gpt += bert_score_gpt
    
    # Increment the count
    count += 1

# Calculate the average scores
average_bert_score = total_bert_score / count
average_bert_score_gpt = total_bert_score_gpt / count

# Print the average scores
print("Average bert_score:", average_bert_score)
print("Average bert_score_gpt:", average_bert_score_gpt)

Average bert_score: 0.7995652173913044
Average bert_score_gpt: 0.7778260869565218


<h1>Trello Board AutoPopulate</h1>
<p> Automation for trello</p>

In [41]:
import os
import csv
import asyncio
import aiohttp
import nest_asyncio
import requests
from markdownify import markdownify as md
import textwrap
import re


In [42]:
# Load environment variables
trello_api_list_id = os.getenv("trello_api_list_id")
trello_api_key_id = os.getenv("trello_api_key_id")
trello_api_key_token = os.getenv("trello_api_key_token")
trello_board_id= os.getenv("trello_board_id")
trello_url ="https://api.trello.com/1/"
# print(trello_board_id)

In [43]:
#create list function - this allows for various test versions 

def create_list(list_title):
    url = f"{trello_url}lists?name={list_title}&token={trello_api_key_token}&idBoard={trello_board_id}&key={trello_api_key_id}"
    response = requests.request("POST", url)
    response_json = response.json()  # Parse response JSON
    list_id = response_json["id"]  # Access 'id' from JSON
    return list_id

#example
# list_response = create_list('User Queries')

In [44]:
#create list function - this allows for various test versions 
def create_card_label(card_id, color, name):
    url = f"{trello_url}cards/{card_id}/labels?color={color}&name={name}&token={trello_api_key_token}&idBoard={trello_board_id}&key={trello_api_key_id}"
    # print(url)
    response = requests.request("POST", url)
    response_json = response.json()  # Parse response JSON
    list_id = response_json["id"]  # Access 'id' from JSON
    return list_id

#example
# list_response = create_card_label('','')

In [45]:
def get_current_date(format='%b %d %H:%M'):
    return datetime.now().strftime(format)


In [46]:
#Async function to send the request

async def send_request(query,list_response,card_color,card_label_name):
    name = query["query"]
    response=handleApiCall(name) 
    
    # print(f""" send_request==== {response['answer']} """ )
    #for trello preview only
    cleanResp = response['answer'].replace("</p>","</p> ******************************************************************************").replace("<p>","<br/>").replace("<br>"," *******************************************************************************").replace("</li>"," *******************************************************************************")
    markdown_content_description = md(f"{(cleanResp)}")
    # markdown_content_description = html2text.html2text(cleanResp)

    desc = f"""{markdown_content_description}"""
    url = f"{trello_url}cards?idList={list_response}&key={trello_api_key_id}&token={trello_api_key_token}&name={name}&desc={desc}"
    
    async with aiohttp.ClientSession() as session:
        async with session.post(url, timeout=1200) as response:
            el = ''
            # print(name)
            resp = await response.text()
            print(resp)
            resp_dict = json.loads(resp)
            id = resp_dict['id']

            card_label_resp = create_card_label(id, card_color, card_label_name)
            print(card_label_resp)
            print("---------")


In [None]:
#Run Test process
async def mainTest(color,file):
    card_title = f"""{get_current_date()}"""
    card_color = color
    card_label_name =file
    queries_source = f"""../testing/queries/{file}.csv"""

    tasks = []
    list_response = create_list(card_title)
    with open(queries_source, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            tasks.append(send_request(row,list_response,card_color,card_label_name))
    await asyncio.gather(*tasks)

# Apply nest_asyncio
nest_asyncio.apply()
 
# Run the event loop

await mainTest('purple','comparative') 
# await mainTest('blue','opinion') - fair 
# await mainTest('green','descriptive') - fair
# await mainTest('red','clarification') - fair
# await mainTest('orange','procedural') - fair
# await mainTest('pink','yesno') - fair