### Import dependencies

In [None]:
import pandas as pd
import numpy as np
import openai
import json
from dotenv import load_dotenv
import os
from openai import AzureOpenAI
import re
import tiktoken
import time
import spacy
nlp = spacy.load("en_core_web_sm")
import faiss


### Load documents data 

In [None]:

# Read the document corpus. The encoding is given explicitly because JSON is
# UTF-8 by specification, while open() otherwise falls back to the platform's
# default encoding (not UTF-8 on every system).
with open('../data/documents.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
df = pd.DataFrame(data)
# Preview only the first rows instead of rendering the whole frame.
df.head()


### Load environment variables

In [14]:
# Load variables from a .env file into the process environment so the
# os.getenv(...) calls in the configuration cell can read them.
load_dotenv()

True

### OpenAI API configuration


In [43]:
 
# Module-level settings on the `openai` package. `get_answer` calls
# `openai.chat.completions.create`, so the chat request relies on these
# values rather than on `client` below.
# NOTE(review): openai>=1.0 appears to read `openai.azure_endpoint` rather
# than `openai.api_base` — confirm against the installed SDK version.
openai.api_type = "azure"
openai.api_base = os.getenv('OPENAI_API_BASE')
openai.api_version = "2023-05-15"
openai.api_key = os.getenv('OPENAI_API_KEY')
# Deployment name passed as `model` in the chat completion request.
openai_deployment = "sdgi-gpt-35-turbo-16k"

# Separate Azure client used for the embedding requests in
# `create_embedding` and `search_embeddings`.
# NOTE(review): it reads different environment variables (api_key_azure,
# api_version, AZURE_OPENAI_ENDPOINT) than the module-level settings above —
# confirm that both sets are defined in the .env file.
client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)
# Tokenizer used to count and truncate tokens before embedding.
encoding = tiktoken.get_encoding('cl100k_base')
# Name of the embedding model/deployment, taken from the environment.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

### Embedding processing

In [None]:
def create_embedding(row, max_tokens=8090, delay_seconds=3):
    """Embed one document row with the configured embedding model.

    Whitespace in ``row['Content']`` is collapsed to single spaces, the text
    is tokenized with the module-level ``encoding``, and any tokens beyond
    ``max_tokens`` are dropped before the embedding request is sent through
    the module-level ``client``.

    Args:
        row: Mapping or Series with a string ``'Content'`` entry.
        max_tokens: Maximum number of tokens sent to the embedding endpoint.
            The default of 8090 is presumably chosen to stay under the
            model's input limit — confirm against the deployed model.
        delay_seconds: Pause before each request (simple client-side
            throttling). Pass 0 to disable.

    Returns:
        tuple: ``(length, embedding)`` where ``length`` is the token count of
        the whitespace-normalized text *before* truncation and ``embedding``
        is the vector returned by the API.
    """
    if delay_seconds and delay_seconds > 0:
        time.sleep(delay_seconds)

    # `\s+` already matches newlines, so a single substitution normalizes
    # every run of whitespace to one space.
    input_text = re.sub(r'\s+', ' ', row['Content'])

    tokens = encoding.encode(input_text)
    length = len(tokens)
    if length > max_tokens:
        input_text = encoding.decode(tokens[:max_tokens])

    response = client.embeddings.create(input=input_text, model=embedding_model)
    return length, response.data[0].embedding

# Embed every document row, then split the (length, embedding) pairs into
# two columns.
df['token_length'], df['Embedding'] = zip(
    *df.apply(create_embedding, axis=1)
)

In [None]:
def cut_context(row):
    """Return the document text, limited to the first 15000 tokens.

    Rows whose ``token_length`` is below 15000 are returned unchanged
    (original whitespace included). Longer rows have their whitespace
    collapsed to single spaces and are cut to the first 15000 tokens of the
    module-level ``encoding``.
    """
    fits_in_context = row['token_length'] < 15000
    if fits_in_context:
        return row['Content']

    flattened = re.sub(r'\s+', ' ', row['Content'].replace("\n", " "))
    kept_tokens = encoding.encode(flattened)[0:15000]
    return encoding.decode(kept_tokens)

# Store the (possibly truncated) text that is later sent to the chat model.
df['content_cut'] = df.apply(cut_context, axis=1)

In [11]:
# Save the English-language documents (with their embeddings) to the path
# that the "Testing the model" section reads from, so a fresh
# Restart & Run All loads this run's output instead of a stale or missing file.
os.makedirs('../models', exist_ok=True)
df[df['Language'] == 'EN'].to_pickle('../models/df_embed_EN.pkl')

### Testing the model

In [25]:
# Load the pickled DataFrame of documents together with their stored
# embeddings. This rebinds `df`, which the retrieval functions below read.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files produced by a trusted source.
df = pd.read_pickle('../models/df_embed_EN.pkl')
df

Unnamed: 0,Code,Status,Country Name,Country Code,Category,KeyWord to Search,Document Title,Exists?,Type,Publication Date,Publication Year,Start Year,End Year,Language,Link,Content,token_length,Embedding,content_cut
0,AFG-CPD-2014-EN,Completed,Afghanistan,AFG,CPD,,Country programme document for Afghanistan (20...,Y,Text,2-5 September 2014,2-5 September 2014,2015,2019,EN,https://digitallibrary.un.org/record/781748/fi...,Executive Board of the\r\nUnited Nations Devel...,10947,"[-0.01851908676326275, -0.0209512859582901, -0...",Executive Board of the\r\nUnited Nations Devel...
3,AFG-NEP-2015-EN,Completed,Afghanistan,AFG,NEP,,RENEWABLE ENERGYPOLICY,Y,Text,2015,2015,2015,2023,EN,https://cdn.climatepolicyradar.org/navigator/A...,Islamic Republic of Afghanistan\r\nMinistry Of...,56241,"[-0.010560429655015469, -0.025872383266687393,...",Islamic Republic of Afghanistan Ministry Of En...
4,AFG-NREP-2013-EN,Completed,Afghanistan,AFG,NREP,,Afghanistan Rural Renewable Energy Policy,Y,Text,"April, 2013","April, 2013",2017,2027,EN,https://cdn.climatepolicyradar.org/navigator/A...,ISLAMIC REPUBLIC OF AFGHANISTAN \r\nMINISTRY O...,7768,"[-0.00792530458420515, -0.015904521569609642, ...",ISLAMIC REPUBLIC OF AFGHANISTAN \r\nMINISTRY O...
5,AFG-NREP-2015-EN,Skipped (identical copy),Afghanistan,AFG,NREP,,RENEWABLE ENERGYPOLICY,Y,Text,2015,2015,2015,2032,EN,https://policy.asiapacificenergy.org/sites/def...,Islamic Republic of Afghanistan\r\nMinistry Of...,56241,"[-0.010560429655015469, -0.025872383266687393,...",Islamic Republic of Afghanistan Ministry Of En...
7,AFG-NRER-2017-EN,Completed,Afghanistan,AFG,NRER,,RENEWABLE ENERGY ROADMAP FOR AFGHANISTAN RER2032,Y,Text,2017,2017,2017,2032,EN,https://policy.asiapacificenergy.org/sites/def...,RENEWABLE ENERGY ROADMAP \r\nFOR AFGHANISTAN \...,50195,"[0.0005985109601169825, -0.027293723076581955,...",RENEWABLE ENERGY ROADMAP FOR AFGHANISTAN RER20...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,VCT-NEAP-2010-EN,,St Vincent and the Grenadines,VCT,NEAP,,,Y,Text,2010,2010,2010,2030,EN,https://cdn.climatepolicyradar.org/navigator/V...,St. Vincent and the Grenadines\r\nEnergy Actio...,21142,"[0.0021034132223576307, -0.019926007837057114,...",St. Vincent and the Grenadines Energy Action P...
747,VCT-NEP-2009-EN,,St Vincent and the Grenadines,VCT,NEP,,,Y,Text,2009,2009,2009,2030,EN,https://cdn.climatepolicyradar.org/navigator/V...,St. Vincent and the Grenadines (SVG) \r\nSusta...,5126,"[0.01070143561810255, -0.015502329915761948, 0...",St. Vincent and the Grenadines (SVG) \r\nSusta...
748,VCT-NEPro-2022-EN,,St Vincent and the Grenadines,VCT,NEPro,,,Y,Data,"24th August, 2022","24th August, 2022",2014,2022,EN,https://www.irena.org/-/media/Files/IRENA/Agen...,Total Energy Supply (TES) 2015 2020\r\nNon-ren...,2410,"[0.004014915786683559, -0.03727811947464943, 0...",Total Energy Supply (TES) 2015 2020\r\nNon-ren...
749,GRD-NEP-2011-EN,,Grenada,GRD,NEP,,,Y,Text,2011,2011,2011,2030,EN,https://cdn.climatepolicyradar.org/navigator/G...,GRENADATHE NATIONAL ENERGY POLICY \r\nOF GRENA...,15948,"[0.006723986007273197, 0.0015769308665767312, ...",GRENADATHE NATIONAL ENERGY POLICY OF GRENADA A...


In [34]:
# Sample question used to exercise the retrieval and answering functions below.
user_query = 'Give me a summary of the goals UNDP wants to achieve in 10 years and the energy plans for Philippines'

In [35]:
def find_mentioned_countries(text):
    """Return the distinct geopolitical entities that spaCy finds in ``text``.

    The text is run through the module-level ``nlp`` pipeline and every
    entity labelled "GPE" (geopolitical entity) is collected once. The
    result is a list built from a set, so its order is not guaranteed.
    """
    gpe_names = {
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "GPE"  # GPE stands for "Geopolitical Entity"
    }
    return list(gpe_names)

#Example 
# mentioned_countries = find_mentioned_countries(user_query)
# mentioned_countries

['Philippines']

In [36]:
def filter_country(user_query):
    """Return the rows of ``df`` for the country mentioned in the query.

    Geopolitical entities are extracted with ``find_mentioned_countries``.
    Because that helper returns them in arbitrary order and may include
    entities that are not countries in the data set, a candidate that
    actually occurs in ``df['Country Name']`` is preferred. If none does,
    the first candidate (alphabetically) is used and the result is empty.

    Args:
        user_query: Free-text question from the user.

    Returns:
        pandas.DataFrame: Rows of the module-level ``df`` whose
        ``'Country Name'`` equals the selected entity (may be empty).

    Raises:
        ValueError: If no geopolitical entity is found in the query.
    """
    # Sort so the choice is deterministic across runs.
    candidates = sorted(find_mentioned_countries(user_query))
    if not candidates:
        raise ValueError(
            "No country could be identified in the query: {!r}".format(user_query)
        )

    known_countries = set(df['Country Name'])
    country = next(
        (name for name in candidates if name in known_countries),
        candidates[0],
    )
    print(country)
    return df[df['Country Name'] == country]


#Example 
# filtered_country = filter_country(user_query)
# filtered_country

Philippines


Unnamed: 0,Code,Status,Country Name,Country Code,Category,KeyWord to Search,Document Title,Exists?,Type,Publication Date,Publication Year,Start Year,End Year,Language,Link,Content,token_length,Embedding,content_cut
507,PHL-CPD-2018-EN,,Philippines,PHL,CPD,,,Y,Text,4 - 7 September 2018,4 - 7 September 2018,2019.0,2023.0,EN,https://digitallibrary.un.org/record/1637110/f...,Executive Board of the\r\nUnited Nations Devel...,8491,"[-0.013351034373044968, -0.016692159697413445,...",Executive Board of the\r\nUnited Nations Devel...
511,PHL-NEEP-2017-EN,,Philippines,PHL,NEEP,,,Y,Text,2017,2017,2017.0,2040.0,EN,https://policy.asiapacificenergy.org/sites/def...,Energy \r\nEfficiency and \r\nConservation\r\n...,7522,"[0.0006053519900888205, -0.011441781185567379,...",Energy \r\nEfficiency and \r\nConservation\r\n...
512,PHL-NEEP-2014-EN,,Philippines,PHL,NEEP,,,Y,Text,2014,2014,2014.0,2030.0,EN,https://policy.asiapacificenergy.org/sites/def...,An Energy Efficiency Roadmap \r\nfor the Phili...,15035,"[-0.003340955823659897, -0.02187677100300789, ...",An Energy Efficiency Roadmap for the Philippin...
513,PHL-NEP-2020-EN,,Philippines,PHL,NEP,,,Y,Text,2020,2020,,,EN,https://policy.asiapacificenergy.org/sites/def...,EIGHTEENTH CONGRESS OF THE )\r\nREPUBLIC OF TH...,6526,"[-0.005891846492886543, -0.007154147140681744,...",EIGHTEENTH CONGRESS OF THE )\r\nREPUBLIC OF TH...
514,PHL-NEP-2018-EN,,Philippines,PHL,NEP,,,Y,Text,2018,2018,2018.0,2040.0,EN,https://policy.asiapacificenergy.org/sites/def...,MESSAGE OF THE SECRETARY\r\nThe 2018-2040 Phil...,198664,"[-0.0037676398642361164, -0.011569184251129627...",MESSAGE OF THE SECRETARY The 2018-2040 Philipp...
515,PHL-NEP-2007-EN,,Philippines,PHL,NEP,,,Y,Text,2007,2007,2007.0,2014.0,EN,https://policy.asiapacificenergy.org/sites/def...,PHILIPPINE ENERGY PLAN\r\n2007-2014ABOUT THE C...,114685,"[-0.00039375413325615227, -0.01191779039800167...",PHILIPPINE ENERGY PLAN 2007-2014ABOUT THE COVE...
516,PHL-NEP-2012-EN,,Philippines,PHL,NEP,,,Y,Text,2012,2012,2012.0,2030.0,EN,https://policy.asiapacificenergy.org/sites/def...,"For inquiries, please contact:\r\nLoreta G. Ay...",164929,"[-0.009951895102858543, -0.013942074030637741,...","For inquiries, please contact: Loreta G. Ayson..."
517,PHL-NEP-2016-EN,,Philippines,PHL,NEP,,,Y,Text,2016,2016,2016.0,2030.0,EN,https://policy.asiapacificenergy.org/sites/def...,TABLE OF CONTENTS\r\nMESSAGE FROM THE SECRETAR...,50268,"[0.00829307921230793, -0.024419622495770454, -...",TABLE OF CONTENTS MESSAGE FROM THE SECRETARY ....
518,PHL-NEPro-2022-EN,,Philippines,PHL,NEPro,,,Y,Data,"24th August, 2022","24th August, 2022",2014.0,2022.0,EN,https://www.irena.org/-/media/Files/IRENA/Agen...,Total Energy Supply (TES) 2015 2020\r\nNon-ren...,2558,"[0.0051450119353830814, -0.03588515892624855, ...",Total Energy Supply (TES) 2015 2020\r\nNon-ren...
519,PHL-NREP-2019-EN,,Philippines,PHL,NREP,,,Y,Data,2019,2019,2017.0,2030.0,EN,https://www.irena.org/-/media/Files/IRENA/Agen...,"© IRENA 2017\r\nUnless otherwise stated, mater...",29852,"[0.006035402417182922, -0.021346403285861015, ...","© IRENA 2017 Unless otherwise stated, material..."


In [39]:
def search_embeddings(user_query, top_k=5):
    """Find the documents most similar to the query within its country.

    The documents are first narrowed with ``filter_country``; their stored
    embeddings are loaded into an inner-product FAISS index and searched
    with the embedding of ``user_query``.

    Args:
        user_query: Free-text question from the user.
        top_k: Maximum number of documents to retrieve (capped at the number
            of documents available for the country).

    Returns:
        tuple: ``(df_filtered, distances, indices)`` where ``distances`` and
        ``indices`` are the arrays returned by ``index.search`` (one row for
        the single query). ``indices`` are positions into ``df_filtered``
        (use ``.iloc``). When no document matches the country, both arrays
        have shape ``(1, 0)`` and no embedding request is made.
    """
    df_filtered = filter_country(user_query)
    if df_filtered.empty:
        # Nothing to index; return empty results with the same layout.
        return (
            df_filtered,
            np.empty((1, 0), dtype='float32'),
            np.empty((1, 0), dtype='int64'),
        )

    # FAISS works on float32 matrices; Python floats would give float64.
    doc_vectors = np.asarray(df_filtered['Embedding'].tolist(), dtype='float32')
    index = faiss.IndexFlatIP(doc_vectors.shape[1])
    index.add(doc_vectors)

    user_query_embedding = client.embeddings.create(
        input=user_query, model=embedding_model
    ).data[0].embedding
    query_vector = np.asarray([user_query_embedding], dtype='float32')

    k = min(top_k, len(df_filtered))
    distances, indices = index.search(query_vector, k)
    return df_filtered, distances, indices


#Example 
# search_embedding = search_embeddings(user_query)
# search_embedding

Philippines


(                  Code Status Country Name Country Code Category  \
 507    PHL-CPD-2018-EN    NaN  Philippines          PHL      CPD   
 511   PHL-NEEP-2017-EN    NaN  Philippines          PHL     NEEP   
 512   PHL-NEEP-2014-EN    NaN  Philippines          PHL     NEEP   
 513    PHL-NEP-2020-EN    NaN  Philippines          PHL      NEP   
 514    PHL-NEP-2018-EN    NaN  Philippines          PHL      NEP   
 515    PHL-NEP-2007-EN    NaN  Philippines          PHL      NEP   
 516    PHL-NEP-2012-EN    NaN  Philippines          PHL      NEP   
 517    PHL-NEP-2016-EN    NaN  Philippines          PHL      NEP   
 518  PHL-NEPro-2022-EN    NaN  Philippines          PHL    NEPro   
 519   PHL-NREP-2019-EN    NaN  Philippines          PHL     NREP   
 
      KeyWord to Search Document Title Exists?  Type      Publication Date  \
 507                NaN            NaN       Y  Text  4 - 7 September 2018   
 511                NaN            NaN       Y  Text                  2017   
 512 

In [53]:

def get_answer(user_question, content):
    """Ask the chat deployment to answer a question from a document excerpt.

    A system instruction restricts the model to the supplied context; the
    question and the excerpt are sent as two separate user messages. The
    request goes through the module-level ``openai`` configuration with
    ``temperature=0``.

    Returns:
        The text content of the first completion choice.
    """
    instructions = "You are a system that answer user questions based on excerpts from PDF documents that are provided for context. You must only answer the question if the answer can be found in the provided context. Do not make up the answer, and if you cannot find the answer in the context just say that you cannot find the answer"
    conversation = [
        dict(role='system', content=instructions),
        dict(role='user', content=user_question),
        dict(role='user', content=content),
    ]

    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=conversation,
    )
    return completion.choices[0].message.content
        

In [54]:

def response_generating(user_query):
    """Answer the query from the most relevant document that contains an answer.

    Candidate documents come from ``search_embeddings``. They are queried one
    at a time, most similar first, and the first answer that does not look
    like a "not found" reply is returned.

    Args:
        user_query: Free-text question from the user.

    Returns:
        str | None: The model's answer, or ``None`` when no candidate
        document yields an answer.
    """
    candidates, distances, indices = search_embeddings(user_query)

    not_found_phrases = ['not mention', 'not mentioned', 'I did not find', 'not found', 'no information', 'not contain', 'cannot be found', 'no mention']

    # FAISS returns hits ordered best-first, so iterate in the order given.
    for position in indices[0]:
        if position < 0:
            # FAISS uses -1 for result slots it could not fill.
            continue
        document = candidates.iloc[position]
        print("Searching document {} ({})...".format(document['Document Title'], document['Link']))

        # get_answer already returns the message text (a string), not the raw
        # API response object.
        answer = get_answer(user_query, document['content_cut'])

        # An empty/None answer is treated the same as a "not found" reply.
        if not answer or any(phrase.lower() in answer.lower() for phrase in not_found_phrases):
            print('Answer not found in this document')
            continue
        return answer

    return None
        