## Import Dependencies

In [39]:
import numpy as np
import pandas as pd
import openai
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
import re
import tiktoken
import time
import faiss
import awoc
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.metrics.pairwise import cosine_similarity

## Load Documents Data

In [40]:
# main data
wdi_csv = pd.read_csv('../data/WDI_CSV/WDICSV.csv')
# country meta data
wdi_country = pd.read_csv('../data/WDI_CSV/WDICountry.csv')
# Series meta data
wdi_series = pd.read_csv('../data/WDI_CSV/WDISeries.csv')
# country + series
wdi_country_series = pd.read_csv('../data/WDI_CSV/WDIcountry-series.csv')
# series + time
wdi_series_time = pd.read_csv('../data/WDI_CSV/WDIseries-time.csv')
# With CountryCode + SeriesCode + year, describe more info about this resource
wdi_footnote = pd.read_csv('../data/WDI_CSV/WDIfootnote.csv')

## Load Environments

In [41]:
load_dotenv()

True

## OpenAI API Configuration

In [42]:
# OpenAI API configuration
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
openai_deployment = "sdgi-gpt-35-turbo-16k"


client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)

encoding = tiktoken.get_encoding('cl100k_base')
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

In [43]:
# use this function to make simple openAI Calls
def callOpenAI(prompt):  
    response_entities = openai.chat.completions.create(
                    model=openai_deployment,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
    response = response_entities.choices[0].message.content
    return response

## Test Query Examples

In [44]:
test_query = "How many people in africa lack access to energy/electricity/clean cooking solutions?"
test_query2 = 'What is the Human Development Index (HDI) value for albania as mentioned in the document?'

## Extract Contry Code

In [45]:
'''
David's 'find_mentioned_countries' cannot catch continent -> Modify a bit
'''
# Extract mentioned countries' ISO3 code
def find_mentioned_country_code(user_query):
    countries = set()
    
    # Tokenize the text using regular expressions to preserve punctuation marks
    words = re.findall(r'\w+|[^\w\s]', user_query)
    text = ' '.join(words)  # Join the tokens back into a string
    
    world_info = awoc.AWOC()

    all_continents = set([continent.lower() for continent in world_info.get_continents_list()])
    all_countries = set([country.lower() for country in world_info.get_countries_list()])
    
    for word in text.split():
        # check if this continent
        if word in all_countries:
            countries.add(world_info.get_country_data(word)['ISO3'])
        elif word.lower() in all_continents:
            target_countries = world_info.get_countries_list_of(word)
            for country in target_countries:
                countries.add(world_info.get_country_data(country)['ISO3'])
    return countries
#print(find_mentioned_countries(test_query))
#print(find_mentioned_countries(test_query2))

## Embedding Processing for Indicators

In [None]:
def create_embedding(row):
    time.sleep(3)
    print(row.name)
    input_text = row['Indicator Name'].replace("\n", " ")
    input_text = re.sub(r'\s+', ' ', input_text)
    encodings = encoding.encode(input_text)
    length = len(encodings)
    embedding = client.embeddings.create( 
        input=input_text ,model= embedding_model
    ).data[0].embedding
    
    return length, embedding

wdi_series['token_length'], wdi_series['Embedding'] = zip(*wdi_series.apply(lambda row: create_embedding(row), axis=1))

In [47]:
wdi_series.to_pickle('../data/indicator_meta_embed.pkl')

## testing the model

In [48]:
df = pd.read_pickle('../data/indicator_meta_embed.pkl')

In [49]:
df

Unnamed: 0,Series Code,Topic,Indicator Name,Short definition,Long definition,Unit of measure,Periodicity,Base Period,Other notes,Aggregation method,...,General comments,Source,Statistical concept and methodology,Development relevance,Related source links,Other web links,Related indicators,License Type,token_length,Embedding
0,AG.AGR.TRAC.NO,Environment: Agricultural production,"Agricultural machinery, tractors",,Agricultural machinery refers to the number of...,,Annual,,,Sum,...,,"Food and Agriculture Organization, electronic ...",A tractor provides the power and traction to m...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,7,"[-0.023923911154270172, -0.021890051662921906,..."
1,AG.CON.FERT.PT.ZS,Environment: Agricultural production,Fertilizer consumption (% of fertilizer produc...,,Fertilizer consumption measures the quantity o...,,Annual,,The world and regional aggregate series do not...,Weighted average,...,,"Food and Agriculture Organization, electronic ...",Fertilizer consumption measures the quantity o...,"Factors such as the green revolution, has led ...",,,,CC BY-4.0,9,"[0.022105641663074493, -0.01266169361770153, 0..."
2,AG.CON.FERT.ZS,Environment: Agricultural production,Fertilizer consumption (kilograms per hectare ...,,Fertilizer consumption measures the quantity o...,,Annual,,The world and regional aggregate series do not...,Weighted average,...,,"Food and Agriculture Organization, electronic ...",Fertilizer consumption measures the quantity o...,"Factors such as the green revolution, has led ...",,,,CC BY-4.0,15,"[0.029225729405879974, 0.0019958876073360443, ..."
3,AG.LND.AGRI.K2,Environment: Land use,Agricultural land (sq. km),,Agricultural land refers to the share of land ...,,Annual,,Areas of former states are included in the suc...,Sum,...,,"Food and Agriculture Organization, electronic ...",Agricultural land constitutes only a part of a...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,9,"[0.0006477561546489596, -0.010305909439921379,..."
4,AG.LND.AGRI.ZS,Environment: Land use,Agricultural land (% of land area),,Agricultural land refers to the share of land ...,,Annual,,Areas of former states are included in the suc...,Weighted average,...,,"Food and Agriculture Organization, electronic ...",Agriculture is still a major sector in many ec...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,9,"[-0.005848452914506197, -0.018034594133496284,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487,VC.IDP.NWDS,Public Sector: Conflict & fragility,"Internally displaced persons, new displacement...",,Internally displaced persons are defined accor...,,Annual,,,Sum,...,,The Internal Displacement Monitoring Centre (h...,"Internally displaced persons are ""persons or g...",Although all persons affected by conflict and/...,,,,CC BY-4.0,15,"[-0.017909416928887367, -0.001801028847694397,..."
1488,VC.IDP.TOCV,Public Sector: Conflict & fragility,"Internally displaced persons, total displaced ...",,Internally displaced persons are defined accor...,,Annual,,,Sum,...,,The Internal Displacement Monitoring Centre (h...,"Internally displaced persons are ""persons or g...",Although all persons affected by conflict and/...,,,,CC BY-4.0,16,"[-0.024313515052199364, -0.011989077553153038,..."
1489,VC.IHR.PSRC.FE.P5,Public Sector: Conflict & fragility,"Intentional homicides, female (per 100,000 fem...",,"Intentional homicides, female are estimates of...",,Annual,,,,...,,UN Office on Drugs and Crime's International H...,The definitions used to produce data are in li...,"In some regions, organized crime, drug traffic...",,,,CC BY-4.0,14,"[-0.0004817298904526979, -0.0091606630012393, ..."
1490,VC.IHR.PSRC.MA.P5,Public Sector: Conflict & fragility,"Intentional homicides, male (per 100,000 male)",,"Intentional homicides, male are estimates of u...",,Annual,,,,...,,UN Office on Drugs and Crime's International H...,The definitions used to produce data are in li...,"In some regions, organized crime, drug traffic...",,,,CC BY-4.0,14,"[0.012024604715406895, -0.01930396258831024, -..."


In [50]:
# Function to calculate Jaccard similarity between two texts
def jaccard_similarity(text1, text2):
    # Tokenize texts
    tokens1 = set(text1.lower().split())
    tokens2 = set(text2.lower().split())
    
    # Calculate Jaccard similarity
    intersection = len(tokens1.intersection(tokens2))
    union = len(tokens1.union(tokens2))
    
    return intersection / union if union > 0 else 0


In [51]:
def filter_semantics(user_query):
    # Calculate similarity scores for each document title
    similarity_scores = []
    indicators = []

    # Iterate through each document title and calculate similarity score
    for indicator in wdi_series_emd['Indicator Name']:
        similarity_score = jaccard_similarity(user_query, indicator)
        similarity_scores.append(similarity_score)
        indicators.append(indicator)
        
    # Create DataFrame only with valid similarity scores
    similarity_df = pd.DataFrame({'Indicator Name': indicators, 'Similarity Score': similarity_scores})
    similarity_df = similarity_df.sort_values('Similarity Score', ascending=False)
    similarity_df = similarity_df[:10]
        
    # Filter documents where similarity score is above a threshold (e.g., 0.3)
    threshold = 0.01
    filtered_df = wdi_series_emd[wdi_series_emd['Indicator Name'].isin(similarity_df[similarity_df['Similarity Score'] > threshold]['Indicator Name'])]

    return  filtered_df

In [52]:
test_query = "How many people in africa lack access to energy/electricity/clean cooking solutions?"
filter_semantics(test_query)

Unnamed: 0.1,Unnamed: 0,Series Code,Topic,Indicator Name,Short definition,Long definition,Unit of measure,Periodicity,Base Period,Other notes,...,General comments,Source,Statistical concept and methodology,Development relevance,Related source links,Other web links,Related indicators,License Type,token_length,Embedding
212,212,EG.CFT.ACCS.RU.ZS,Environment: Energy production & use,Access to clean fuels and technologies for coo...,,Access to clean fuels and technologies for coo...,,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",Data for access to clean fuels and technologie...,,,,,CC BY-4.0,15,"[0.022533031180500984, -0.0017073963535949588,..."
213,213,EG.CFT.ACCS.UR.ZS,Environment: Energy production & use,Access to clean fuels and technologies for coo...,,Access to clean fuels and technologies for coo...,,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",Data for access to clean fuels and technologie...,,,,,CC BY-4.0,15,"[0.0315319187939167, -0.0034343975130468607, 0..."
214,214,EG.CFT.ACCS.ZS,Environment: Energy production & use,Access to clean fuels and technologies for coo...,,Access to clean fuels and technologies for coo...,,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",Data for access to clean fuels and technologie...,,,,,CC BY-4.0,12,"[0.029152700677514076, -0.003278686897829175, ..."
216,216,EG.ELC.ACCS.RU.ZS,Environment: Energy production & use,"Access to electricity, rural (% of rural popul...",,"Access to electricity, rural is the percentage...",,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",The World Bank’s Global Electrification Databa...,,,,,CC BY-4.0,10,"[0.009685068391263485, -0.013544631190598011, ..."
217,217,EG.ELC.ACCS.UR.ZS,Environment: Energy production & use,"Access to electricity, urban (% of urban popul...",,"Access to electricity, urban is the percentage...",,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",The World Bank’s Global Electrification Databa...,,,,,CC BY-4.0,10,"[0.01995106227695942, -0.013784131966531277, 0..."
218,218,EG.ELC.ACCS.ZS,Environment: Energy production & use,Access to electricity (% of population),,Access to electricity is the percentage of pop...,,Annual,,,...,,"IEA, IRENA, UNSD, World Bank, WHO. 2023. Track...",The World Bank’s Global Electrification Databa...,Maintaining reliable and secure electricity se...,,,,CC BY-4.0,7,"[0.013930145651102066, -0.013670012354850769, ..."
515,515,IQ.CPA.TRAN.XQ,Public Sector: Policy & institutions,"CPIA transparency, accountability, and corrupt...",,"Transparency, accountability, and corruption i...",,Annual,,,...,,"World Bank Group, CPIA database (http://www.wo...",All criteria within each cluster receive equal...,The International Development Association (IDA...,,,,CC BY-4.0,24,"[0.019003234803676605, -0.006473481189459562, ..."
1459,1459,TX.VAL.MRCH.AL.ZS,Private Sector & Trade: Exports,Merchandise exports to economies in the Arab W...,,Merchandise exports to economies in the Arab W...,,Annual,,,...,,World Bank staff estimates based data from Int...,,,,,,CC BY-4.0,16,"[-0.00021702497906517237, -0.01074172277003526..."
1466,1466,TX.VAL.MRCH.R4.ZS,Private Sector & Trade: Exports,Merchandise exports to low- and middle-income ...,Merchandise exports to low- and middle-income ...,Merchandise exports to low- and middle-income ...,,Annual,,,...,,World Bank staff estimates based data from Int...,,,,,,CC BY-4.0,23,"[-0.003463709494099021, -0.009502499364316463,..."
1468,1468,TX.VAL.MRCH.R6.ZS,Private Sector & Trade: Exports,Merchandise exports to low- and middle-income ...,Merchandise exports to low- and middle-income ...,Merchandise exports to low- and middle-income ...,,Annual,,,...,,World Bank staff estimates based data from Int...,,,,,,CC BY-4.0,21,"[0.0013253657380118966, -0.012262063100934029,..."


In [56]:
test_query=" is the promotion of renewable energy a focal point within the energy sector?"
df_filtered = filter_semantics(test_query)
filtered_embeddings_arrays = np.array(list(df_filtered['Embedding']))
filtered_embeddings_arrays.shape

(10,)

In [54]:
def search_embeddings(user_query):
    df_filtered = filter_semantics(user_query) if filter_semantics(user_query) is not None else None
    
    if df_filtered is not None and not df_filtered.empty:  # Check if DataFrame is not None and not empty
        length = len(df_filtered.head())
        filtered_embeddings_arrays = np.array(list(df_filtered['Embedding']))
        index = faiss.IndexFlatIP(filtered_embeddings_arrays.shape[1]) 
        index.add(filtered_embeddings_arrays)
        
        user_query_embedding = client.embeddings.create( 
                input=user_query ,model= embedding_model
            ).data[0].embedding

        k = min(5, length)
        distances, indices = index.search(np.array([user_query_embedding]), k)
        return df_filtered, distances, indices
    else:
        return None, None, None

In [55]:
search_embeddings(test_query)

IndexError: tuple index out of range