## Import Dependencies

In [83]:
import numpy as np
import pandas as pd
import openai
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
import re
import tiktoken
import time
import faiss
import awoc
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.metrics.pairwise import cosine_similarity
from sutime import SUTime
import json
from datetime import datetime

## Load Documents Data

In [2]:
# Main WDI data table
wdi_csv = pd.read_csv('../data/WDI_CSV/WDICSV.csv')
# Country metadata
wdi_country = pd.read_csv('../data/WDI_CSV/WDICountry.csv')
# Series (indicator) metadata; its 'Indicator Name' column is what gets embedded later on
wdi_series = pd.read_csv('../data/WDI_CSV/WDISeries.csv')
# The remaining WDI files are currently not loaded (kept here for reference).
# Country + series
#wdi_country_series = pd.read_csv('../data/WDI_CSV/WDIcountry-series.csv')
# Series + time
#wdi_series_time = pd.read_csv('../data/WDI_CSV/WDIseries-time.csv')
# Footnotes: extra information about a resource, keyed by CountryCode + SeriesCode + year
#wdi_footnote = pd.read_csv('../data/WDI_CSV/WDIfootnote.csv')

## Load Environments

In [3]:
# Load variables from a .env file into the environment; the os.getenv() lookups
# in the OpenAI configuration read their values from there.
load_dotenv()

True

## OpenAI API Configuration

In [4]:
# OpenAI API configuration
# Module-level settings, used by the `openai.chat.completions.create` calls.
# NOTE(review): `openai.api_base` is the pre-1.0 attribute name; openai>=1.0
# (which provides `AzureOpenAI`) appears to expose `openai.azure_endpoint`
# instead. Confirm the module-level client really picks up the endpoint.
openai.api_type = "azure"
openai.api_key = os.getenv("api_key_azure")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("api_version")
# Name of the chat deployment passed as `model` in the chat completion calls
openai_deployment = "sdgi-gpt-35-turbo-16k"


# Explicit Azure client, used for the embedding requests
client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)

# Tokenizer used to count the tokens of each indicator name
encoding = tiktoken.get_encoding('cl100k_base')
# Embedding model/deployment name, read from the environment
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

In [5]:
# Helper for simple, single-turn OpenAI chat calls
def callOpenAI(prompt):
    """Send `prompt` as a single user message and return the reply text.

    The request goes through the module-level `openai` client, using the
    `openai_deployment` model with temperature 0.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Test Query Examples

In [6]:
# Sample queries used to exercise the lookup functions:
# the first mentions a continent, the second a single country.
test_query = "How many people in africa lack access to energy/electricity/clean cooking solutions?"
test_query2 = 'What is the Human Development Index (HDI) value for albania as mentioned in the document?'

To look up a value in WDICSV.csv (the main WDI data file) we need three things: (1) a country code, (2) an indicator code, and (3) a target period (1960–2023).

## Function for searching country code (First Condition Done ✅)

In [69]:
# Extract the ISO3 codes of the countries mentioned in a query.
# (The previous 'find_mentioned_countries' could not catch continents, so this
# version expands a continent name into all of its countries.)
def find_mentioned_country_code(user_query):
    """Return the set of ISO3 codes for countries referenced in `user_query`.

    The query is lower-cased and split into word tokens. A token that equals
    a country name adds that country's ISO3 code; a token that equals a
    continent name adds the ISO3 codes of every country on that continent.
    Matching is case-insensitive ("Albania" and "albania" both match).

    Limitation: matching is done one word at a time, so names made of
    several words (e.g. "South Africa", "North America") are not detected.

    Args:
        user_query: Free-text user question.

    Returns:
        A set of ISO3 country-code strings; empty if nothing matched.
    """
    world_info = awoc.AWOC()
    continent_names = {name.lower() for name in world_info.get_continents_list()}
    country_names = {name.lower() for name in world_info.get_countries_list()}

    iso3_codes = set()
    # Punctuation can never equal a country/continent name, so only word
    # tokens are inspected. Lower-casing up front keeps the comparison with
    # the lower-cased name sets consistent.
    for token in re.findall(r'\w+', user_query.lower()):
        if token in country_names:
            iso3_codes.add(world_info.get_country_data(token)['ISO3'])
        elif token in continent_names:
            for country in world_info.get_countries_list_of(token):
                iso3_codes.add(world_info.get_country_data(country)['ISO3'])
    return iso3_codes
# Quick check on both sample queries; each call prints the set of ISO3 codes found.
print(find_mentioned_country_code(test_query))
print(find_mentioned_country_code(test_query2))

{'REU', 'ESH', 'ZWE', 'MAR', 'TCD', 'GHA', 'BWA', 'SDN', 'NER', 'COM', 'STP', 'GNQ', 'LBY', 'RWA', 'NAM', 'TZA', 'ZMB', 'TUN', 'UGA', 'GMB', 'GIN', 'CMR', 'CPV', 'MOZ', 'DJI', 'BFA', 'SYC', 'MUS', 'SEN', 'ZAF', 'COD', 'BDI', 'KEN', 'LSO', 'ETH', 'COG', 'SOM', 'ERI', 'GNB', 'GAB', 'MYT', 'NGA', 'AGO', 'CIV', 'SSD', 'SWZ', 'MLI', 'MRT', 'TGO', 'LBR', 'MWI', 'MDG', 'CAF', 'BEN', 'DZA', 'SHN', 'EGY', 'SLE'}
{'ALB'}


# Function for searching indicator code (Second Condition Done✅)

## Embedding Processing for Indicators

In [None]:
def create_embedding(row):
    """Embed the indicator name of one metadata row.

    Args:
        row: A row exposing an 'Indicator Name' string.

    Returns:
        A `(token_count, embedding)` tuple: the number of tokens the cleaned
        name encodes to, and the embedding returned by the embeddings API.
    """
    # Pause before every request (presumably to stay under the API rate limit).
    time.sleep(3)

    # Turn newlines into spaces, then squeeze every whitespace run to one space.
    indicator_text = re.sub(r'\s+', ' ', row['Indicator Name'].replace("\n", " "))
    token_count = len(encoding.encode(indicator_text))

    api_response = client.embeddings.create(
        input=indicator_text,
        model=embedding_model,
    )
    return token_count, api_response.data[0].embedding

# Embed every indicator row, then unzip the (token_length, embedding) pairs
# into two new columns.
wdi_series['token_length'], wdi_series['Embedding'] = zip(
    *wdi_series.apply(create_embedding, axis=1)
)

In [47]:
# Persist the indicator metadata together with the computed token_length and
# Embedding columns; this file is read back as `df` for the indicator search.
wdi_series.to_pickle('../data/indicator_meta_embed.pkl')

## Searching target indicator

In [8]:
# Load the indicator metadata with precomputed embeddings.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# was produced by this notebook or another trusted source.
df = pd.read_pickle('../data/indicator_meta_embed.pkl')

In [9]:
# Jaccard similarity between the word sets of two texts
def jaccard_similarity(text1, text2):
    """Return |A ∩ B| / |A ∪ B| for the lower-cased, whitespace-split words.

    Returns 0 when both texts contain no words at all.
    """
    words_a = set(text1.lower().split())
    words_b = set(text2.lower().split())

    combined = words_a | words_b
    if not combined:
        return 0
    return len(words_a & words_b) / len(combined)


In [63]:
def filter_indicators(user_query, top_n=10, threshold=0.01):
    """Pre-filter the indicator table by word overlap with the query.

    Every 'Indicator Name' in the module-level `df` is scored against
    `user_query` with `jaccard_similarity`. The `top_n` best-scoring names
    are kept, and of those only the ones scoring strictly above `threshold`.

    Args:
        user_query: Free-text user question.
        top_n: Maximum number of best-scoring indicator names to consider.
        threshold: Minimum (exclusive) similarity score a name must reach.

    Returns:
        The rows of `df` whose 'Indicator Name' passed the filter; an empty
        DataFrame when nothing scores above the threshold.
    """
    indicator_names = list(df['Indicator Name'])
    similarity_df = pd.DataFrame({
        'Indicator Name': indicator_names,
        'Similarity Score': [
            jaccard_similarity(user_query, name) for name in indicator_names
        ],
    })

    # Best matches first, then keep only the top_n candidates.
    top_matches = similarity_df.sort_values('Similarity Score', ascending=False)[:top_n]

    # Drop candidates whose score does not exceed the threshold.
    kept_names = top_matches[top_matches['Similarity Score'] > threshold]['Indicator Name']
    return df[df['Indicator Name'].isin(kept_names)]

In [66]:
# Search the target indicator
def search_embeddings(user_query):
    """Rank the pre-filtered indicators by embedding similarity to the query.

    The candidates come from `filter_indicators`; their stored embeddings are
    put into a faiss inner-product index, which is then searched with the
    embedding of `user_query`.

    Args:
        user_query: Free-text user question.

    Returns:
        `(df_filtered, distances, indices)`, where `distances` and `indices`
        are the faiss search results for the (at most 5) best matches and
        `indices` are row positions within `df_filtered`. Returns
        `(None, None, None)` when no indicator passed the pre-filter.
    """
    # Run the pre-filter once (it scores every indicator name).
    df_filtered = filter_indicators(user_query)
    if df_filtered is None or df_filtered.empty:
        return None, None, None

    # faiss works on float32 matrices, so make the dtype explicit.
    embedding_matrix = np.array(list(df_filtered['Embedding']), dtype='float32')
    index = faiss.IndexFlatIP(embedding_matrix.shape[1])
    index.add(embedding_matrix)

    user_query_embedding = client.embeddings.create(
        input=user_query, model=embedding_model
    ).data[0].embedding

    # Never ask for more neighbours than there are candidates.
    k = min(5, len(df_filtered))
    query_matrix = np.array([user_query_embedding], dtype='float32')
    distances, indices = index.search(query_matrix, k)
    return df_filtered, distances, indices

In [None]:
def get_answer(user_query, content):
    """Ask the chat model to answer `user_query` using only `content`.

    Args:
        user_query: The user's question.
        content: Context excerpts the answer must be based on.

    Returns:
        The text of the model's first reply choice.
    """
    # NOTE(review): "word indicator documents" in the prompt may be a typo for
    # "world indicator documents"; left unchanged because it is runtime text.
    system_prompt = "You are a system that answers user questions based on excerpts from word indicator documents provided for context. Only answer if the answer can be found in the provided context. Do not make up the answer; if you cannot find the answer, say so."
    messages = [
        {'role': 'system', 'content': system_prompt},
        # Pass the function's own parameter; the previous code referenced an
        # undefined name here and failed with NameError.
        {'role': 'user', 'content': user_query},
        {'role': 'user', 'content': content},
    ]
    response_entities = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0.2,
        messages=messages,
    )
    return response_entities.choices[0].message.content

## Function for searching target period (1960 - 2023) ✅

In [145]:
# Helpers for extracting the set of years from a given timex3_list
def _parse_year(raw):
    """Return the year encoded by `raw`, or None if it does not hold one.

    Plain integers / digit strings are converted directly; otherwise the
    leading four digits are used (e.g. "2010-01" -> 2010).
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        match = re.match(r'\d{4}', str(raw))
        return int(match.group(0)) if match else None


def timex3_to_year_list(timex3_list):
    """Collect the years referenced by a list of timex3 entries.

    Each entry is a dict with a "type" and a "value". Handling per value:
      * values containing "REF" are ignored;
      * a dict with 'begin'/'end' adds every year of that inclusive range
        (skipped when either bound has no recognisable year);
      * a pure digit string is added as a year;
      * a 'DATE' adds its leading four-digit year, if present;
      * a 'DURATION' of N years ("PnY") adds the N years preceding the
        current year (the current year itself is not included).

    Args:
        timex3_list: Iterable of timex3 dicts.

    Returns:
        A set of integer years (also printed before returning).
    """
    year_list = set()
    current_year = datetime.now().year

    for timex3 in timex3_list:
        sutime_type, value = timex3["type"], timex3["value"]
        if "REF" in value:
            continue

        if isinstance(value, dict):
            begin = _parse_year(value.get('begin'))
            end = _parse_year(value.get('end'))
            # Skip ranges whose bounds do not carry a year instead of crashing.
            if begin is not None and end is not None:
                year_list.update(range(begin, end + 1))
        elif value.isdigit():
            year_list.add(int(value))
        elif sutime_type == 'DATE':
            match = re.match(r'\d{4}', value)
            if match:
                year_list.add(int(match.group(0)))
        elif sutime_type == 'DURATION':
            year_spans = [int(n) for n in re.findall(r'P([0-9]+)Y', value)]
            if year_spans:
                # Use the longest duration mentioned and look back from now.
                year_list.update(range(current_year - max(year_spans), current_year))

    print(year_list)
    return year_list

In [146]:
def find_target_period_timex3(user_query):
    """Return the set of years that SUTime detects in `user_query`.

    The query is parsed with SUTime (time-range marking enabled) and the
    resulting timex3 entries are converted by `timex3_to_year_list`.
    """
    # A new SUTime tagger is created on every call.
    tagger = SUTime(mark_time_ranges=True, include_range=True)
    return timex3_to_year_list(tagger.parse(user_query))

## Final combined function for searching indicator data (finds information in the indicator database)

In [None]:
def map_to_structure(countries, indicators, years):
    """Combine the three lookup results into the final result structure.

    Not implemented yet: the original definition had no body, which is a
    syntax error. Until the output structure is defined, calling this
    function fails loudly instead of breaking the whole file.

    Args:
        countries: ISO3 country codes (as produced by
            `find_mentioned_country_code`).
        indicators: Candidate indicators (presumably the DataFrame returned
            by `filter_indicators`; confirm against the caller).
        years: Target years (as produced by `find_target_period_timex3`).

    Raises:
        NotImplementedError: Always, until the mapping is implemented.
    """
    raise NotImplementedError(
        "map_to_structure is not implemented yet"
    )

In [64]:
## Module that resolves countries, indicators and years for a query and maps
## them to the final result structure
def indicatorsModule(user_query, client, embedding_model):
    """Look up indicator data for `user_query`.

    Args:
        user_query: Free-text user question.
        client: Unused here; kept so existing callers keep working.
        embedding_model: Unused here; kept so existing callers keep working.

    Returns:
        The result of `map_to_structure(countries, indicators, years)` when
        the query yields at least one country, one indicator and one year;
        otherwise an empty list.
    """
    countries = find_mentioned_country_code(user_query)
    indicators = filter_indicators(user_query)
    # The year extractor defined in this file is `find_target_period_timex3`.
    years = find_target_period_timex3(user_query)

    # A DataFrame has no unambiguous truth value (`bool(df)` raises
    # ValueError), so test it for emptiness explicitly.
    has_indicators = indicators is not None and not indicators.empty
    if not (countries and has_indicators and years):
        return []
    return map_to_structure(countries, indicators, years)