### Import dependencies

In [1]:
import pandas as pd
import numpy as np
import openai
import json
from dotenv import load_dotenv
import os
from openai import AzureOpenAI
import re
import tiktoken
import time
import spacy
nlp = spacy.load("en_core_web_sm")
import faiss




### Load documents data 

In [2]:

# Load the raw document records and wrap them in a DataFrame.
# The encoding is pinned to UTF-8 (the JSON interchange encoding): the corpus
# contains non-ASCII text, and relying on the platform default encoding can
# fail or mis-decode on systems whose default is not UTF-8.
with open('../data/documents_all.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
df = pd.DataFrame(data)
# Bare last expression so the notebook renders the frame as the cell output.
df

# nan_title_rows = df[df['Document Title'].isna()]
# # print(nan_title_rows)

# for index, row in nan_title_rows.iterrows():
#     print(json.dumps(data[index], indent=2))


Unnamed: 0,Code,Status,Country Name,Country Code,Category,KeyWord to Search,Document Title,Exists?,Publication Date,Publication Year,...,End Year,Language,Link,Content,Thumbnail,Region,Journal,Authors,Description,Resolution No
0,,,,,SEH,,Machine learning for a sustainable energy future,Y,2023-02-05 00:00:00,2023,...,2023,EN,https://www.nature.com/articles/s41578-022-004...,"0123456789();: The combustion of fossil fuels,...",,Global,Nature,"Zhenpeng Yao, Yanwei Lum, Andrew Johnston, Lui...",,
1,,,USA,USA,SEH,,Clean energy for all? Mapping inequity potenti...,Y,2023-02-05 00:00:00,2023,...,2023,EN,https://www.sciencedirect.com/science/article/...,Energy Research & Social Science 108 (2024) 10...,,North America,Energy Research & Social Science,"Huiting Chen, Sung-Gheel Jang, Yan Zhang, Yaol...",,
2,,,,,SEH,,Access to clean cooking services in energy and...,Y,2023-02-07 00:00:00,2021,...,2021,EN,https://www.nature.com/articles/s41560-021-009...,"AnAlysis 1Energy, Climate, and Environment Pr...",,Global,Nature Energy,"Shonali Pachauri, Miguel Poblete-Cazenave, Ard...",,
3,,,China,China,SEH,,The asymmetric impacts of artificial intellige...,Y,2023-02-05 00:00:00,2024,...,2024,EN,https://www.sciencedirect.com/science/article/...,Energy 291 (2024) 130197 Available online Jan...,,East Asia and Pacific,Energy,"Hongwei Zhang, Beixin Fang, Pengwei He, Wang Gao",,
4,,,,,SEH,,Is clean cooking affordable? A review,Y,2023-02-07 00:00:00,2021,...,2021,EN,https://www.sciencedirect.com/science/article/...,Renewable and Sustainable Energy Reviews 151 (...,,,Renewable and Sustainable Energy Reviews,"A Gill-Wiehl, I Ray, D Kammen",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584,AFG-NRES-2017-PR,,Afghanistan,AFG,NRES,,Renewable Energy Strategy for Rural Afghanista...,Y,2017,2017,...,2027,PR,https://policy.asiapacificenergy.org/sites/def...,ناتسناغفا یملاسا یروهمج تاهد فاشکنا ایحا ترا...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,,,
585,AFG-NREP-41365-EN,Completed,Afghanistan,AFG,NREP,,Afghanistan Rural Renewable Energy Policy,Y,2013-04-01 00:00:00,2013-04-01 00:00:00,...,2027,EN,https://cdn.climatepolicyradar.org/navigator/A...,ISLAMIC REPUBLIC OF AFGHANISTAN MINISTRY OF EN...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,,,
586,AFG-NRER-2017-EN,Completed,Afghanistan,AFG,NRER,,RENEWABLE ENERGY ROADMAP FOR AFGHANISTAN RER2032,Y,2017,2017,...,2032,EN,https://policy.asiapacificenergy.org/sites/def...,RENEWABLE ENERGY ROADMAP FOR AFGHANISTAN RER20...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,,,
587,COL-CPD-2020-SP,,Colombia,COL,CPD,,,Y,31 August – 4 September 2020,31 August – 4 September 2020,...,2024,SP,https://www.undp.org/sites/g/files/zskgke326/f...,Table of Contents: Distr general: Page: II Pr...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,,,,,


### Load environment variables

In [3]:
# Load key/value pairs from a .env file into the process environment so the
# os.getenv(...) lookups in the configuration cell can resolve them.
load_dotenv()

True

### OpenAI API configuration


In [4]:
 
# Module-level OpenAI settings; the chat call in get_answer goes through the
# `openai` module rather than through `client`.
# NOTE(review): `AzureOpenAI` belongs to the openai>=1.0 SDK, where the
# module-level endpoint setting is `azure_endpoint`/`base_url` rather than
# `api_base` — confirm that these assignments still take effect.
openai.api_type = "azure"
openai.api_base = os.getenv('OPENAI_API_BASE')
openai.api_version = "2023-05-15"
openai.api_key = os.getenv('OPENAI_API_KEY')
# Deployment name passed as `model` for chat completions.
openai_deployment = "sdgi-gpt-35-turbo-16k"

# Explicit Azure client; the embedding requests in this notebook use it.
# NOTE(review): it reads a different key variable ("api_key_azure") than the
# module-level configuration above ('OPENAI_API_KEY') — confirm both are set.
client = AzureOpenAI(
  api_key = os.getenv("api_key_azure"),  
  api_version = os.getenv("api_version"),
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)
# Tokenizer used to count and truncate tokens before embedding / prompting.
encoding = tiktoken.get_encoding('cl100k_base')
# Name of the embedding model/deployment, read from the environment.
embedding_model = os.getenv("USER_QUERY_EMBEDDING_ENGINE") 

### Embedding processing

In [5]:
def create_embedding(row):
    """Embed one document row and report its token count.

    The row's ``Content`` text has every whitespace run collapsed to a single
    space, is tokenized with the module-level ``encoding``, and is truncated
    to its first 8090 tokens when longer, before being sent to the embeddings
    endpoint through the module-level ``client``.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding a
            string under the ``'Content'`` key.

    Returns:
        tuple: ``(token_count, embedding)``. ``token_count`` is the number of
        tokens in the normalized text *before* truncation; ``embedding`` is
        the vector returned by the service.
    """
    # Fixed pause before every request (presumably to stay under the service
    # rate limit — confirm before changing).
    time.sleep(3)

    normalized = re.sub(r'\s+', ' ', row['Content'].replace("\n", " "))
    tokens = encoding.encode(normalized)
    token_count = len(tokens)

    # Only over-long documents are re-decoded from their first 8090 tokens.
    payload = normalized if token_count <= 8090 else encoding.decode(tokens[:8090])

    result = client.embeddings.create(input=payload, model=embedding_model)
    return token_count, result.data[0].embedding

# Embed every row, then unzip the per-row (token_count, embedding) pairs into
# two new columns.
df['token_length'], df['Embedding'] = zip(*df.apply(create_embedding, axis=1))

In [6]:
def cut_context(row):
    """Return the row's text limited to 15000 tokens.

    Rows whose ``token_length`` is below 15000 return ``Content`` untouched.
    Longer rows have their whitespace runs collapsed to single spaces, are
    tokenized with the module-level ``encoding``, and are decoded back from
    the first 15000 tokens only.

    Args:
        row: Mapping-like record with ``'token_length'`` and ``'Content'``.

    Returns:
        str: The original or the truncated text.
    """
    if row['token_length'] >= 15000:
        flattened = re.sub(r'\s+', ' ', row['Content'].replace("\n", " "))
        return encoding.decode(encoding.encode(flattened)[0:15000])
    return row['Content']

# Derive the prompt-sized text column for every row, then preview it.
df['content_cut'] = df.apply(cut_context, axis=1)

print(df['content_cut'])

0      0123456789();: The combustion of fossil fuels,...
1      Energy Research & Social Science 108 (2024) 10...
2      AnAlysis  1Energy, Climate, and Environment Pr...
3      Energy 291 (2024) 130197 Available online Janu...
4      Renewable and Sustainable Energy Reviews 151 (...
                             ...                        
584    ناتسناغفا یملاسا یروهمج تاهد فاشکنا ایحا ترازو...
585    ISLAMIC REPUBLIC OF AFGHANISTAN MINISTRY OF EN...
586    RENEWABLE ENERGY ROADMAP FOR AFGHANISTAN RER20...
587    Table of Contents: Distr general: Page:  II Pr...
588    Table of Contents:  UNDP within the United Nat...
Name: content_cut, Length: 589, dtype: object


In [7]:
# Persist only the English-language rows (including their embeddings) so they
# can be reloaded without recomputing.
english_rows = df['Language'] == 'EN'
df[english_rows].to_pickle('df_embed_EN_All_V3.pkl')

### Testing the model

In [None]:
# Reload a previously saved embeddings frame so the cells below can run
# without recomputing embeddings. This rebinds `df`.
# NOTE(review): this reads '../models/df_embed_EN_All_V2.pkl', whereas the save
# cell writes 'df_embed_EN_All_V3.pkl' in the working directory — confirm
# which file is intended. Only unpickle files from a trusted source.
df = pd.read_pickle('../models/df_embed_EN_All_V2.pkl')
df

In [10]:
# Sample question used by the commented-out examples in the cells below.
user_query = 'Give me a summary of the goals UNDP wants to achieve in 10 years and the energy plans for Philippines'

In [11]:
def find_mentioned_countries(text):
    """Return the distinct geopolitical entities spaCy detects in ``text``.

    Entities are reported in order of first appearance. The previous
    ``list(set(...))`` produced an arbitrary order, which made callers that
    take the first element pick an unpredictable entity when several places
    are mentioned.

    Note that spaCy's ``GPE`` label covers countries, cities and states, so
    the result is not guaranteed to contain only countries.

    Args:
        text: Free-form text to scan.

    Returns:
        list[str]: Unique ``GPE`` entity texts, first mention first; empty
        when none are found.
    """
    doc = nlp(text)
    gpe_mentions = (ent.text for ent in doc.ents if ent.label_ == "GPE")  # GPE = "Geopolitical Entity"
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(gpe_mentions))

#Example 
# mentioned_countries = find_mentioned_countries(user_query)
# mentioned_countries

In [12]:
def filter_country(user_query):
    """Restrict the module-level ``df`` to the first country named in the query.

    Args:
        user_query: The user's question.

    Returns:
        pandas.DataFrame: Rows of ``df`` whose ``'Country Name'`` equals the
        first place returned by ``find_mentioned_countries``. When the query
        names no place at all, the whole ``df`` is returned so the search can
        still run over every document (previously this case raised
        ``IndexError``).
    """
    countries = find_mentioned_countries(user_query)
    if not countries:
        print('No country detected in the query; searching all documents')
        return df

    country = countries[0]
    print(country)
    return df[df['Country Name'] == country]


#Example 
# filtered_country = filter_country(user_query)
# filtered_country

In [13]:
def search_embeddings(user_query, top_k=5):
    """Find the documents most similar to the query by inner product.

    The corpus is first narrowed with ``filter_country``; the remaining
    ``'Embedding'`` vectors are loaded into a flat inner-product faiss index
    and searched with the embedding of ``user_query``.

    Args:
        user_query: The user's question.
        top_k: Maximum number of hits to return (default 5, matching the
            previous hard-coded limit).

    Returns:
        tuple: ``(df_filtered, distances, indices)``. ``distances`` and
        ``indices`` have shape ``(1, k)`` with ``k = min(top_k, len(df_filtered))``;
        ``indices`` are positional (use ``df_filtered.iloc``). When no document
        matches the filter, both arrays have shape ``(1, 0)`` and no embedding
        request is made.
    """
    df_filtered = filter_country(user_query)

    # The old `len(df_filtered.head())` was already capped at 5, so its
    # `length > 5` branch could never run; this states the same limit directly.
    k = max(0, min(top_k, len(df_filtered)))
    if k == 0:
        # An empty frame previously crashed on `.shape[1]`.
        return (
            df_filtered,
            np.empty((1, 0), dtype=np.float32),
            np.empty((1, 0), dtype=np.int64),
        )

    # faiss works on float32; cast explicitly instead of handing it the
    # float64 array NumPy builds from Python floats.
    doc_vectors = np.asarray(list(df_filtered['Embedding']), dtype=np.float32)
    index = faiss.IndexFlatIP(doc_vectors.shape[1])
    index.add(doc_vectors)

    user_query_embedding = client.embeddings.create(
        input=user_query, model=embedding_model
    ).data[0].embedding
    query_vector = np.asarray([user_query_embedding], dtype=np.float32)

    distances, indices = index.search(query_vector, k)
    return df_filtered, distances, indices


#Example 
# search_embedding = search_embeddings(user_query)
# search_embedding

In [53]:

def get_answer(user_question, content):
    """Ask the chat deployment to answer a question from the supplied context.

    A fixed system prompt instructs the model to answer only from the given
    excerpts; the question and the context are sent as two consecutive user
    messages, with temperature 0.

    NOTE(review): the request goes through the module-level ``openai``
    configuration, not the ``client`` object used for embeddings — confirm
    that both are configured for the same resource.

    Args:
        user_question: The question to answer.
        content: Document text supplied as context.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    system_prompt = "You are a system that answer user questions based on excerpts from PDF documents that are provided for context. You must only answer the question if the answer can be found in the provided context. Do not make up the answer, and if you cannot find the answer in the context just say that you cannot find the answer"
    conversation = [
        dict(role='system', content=system_prompt),
        dict(role='user', content=user_question),
        dict(role='user', content=content),
    ]

    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=conversation,
    )
    return completion.choices[0].message.content
        

In [54]:

def response_generating(user_query):
    """Answer the query from the most relevant document that contains an answer.

    Candidate documents come from ``search_embeddings`` and are tried in
    ranked order, best match first. The first answer that does not look like
    a "not found" reply is returned.

    Args:
        user_query: The user's question.

    Returns:
        str | None: The first usable answer, or ``None`` when no candidate
        document yields one (or when there are no candidates).
    """
    # Local name chosen so the notebook-level `df` is not shadowed.
    docs, _distances, indices = search_embeddings(user_query)

    not_found_phrases = ['not mention', 'not mentioned', 'I did not find', 'not found', 'no information', 'not contain', 'cannot be found', 'no mention']
    lowered_phrases = [phrase.lower() for phrase in not_found_phrases]

    # faiss returns inner-product hits in decreasing similarity, so iterate in
    # the order given; the earlier `[::-1]` queried the weakest match first.
    for position in indices[0]:
        if position < 0:
            # faiss pads missing results with -1.
            continue

        doc = docs.iloc[position]
        print("Searching document {} ({})...".format(doc['Document Title'], doc['Link']))

        # get_answer already returns the message text; indexing it like a raw
        # API response (response['choices'][0]...) raised TypeError.
        answer = get_answer(user_query, doc['content_cut'])

        if not answer or any(phrase in answer.lower() for phrase in lowered_phrases):
            print('Answer not found in this document')
            continue
        return answer

    return None
        