In [177]:

import streamlit as st
import os, toml, requests
import requests
import datetime as dt
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Union
from langchain.utilities import TextRequestsWrapper
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import PydanticOutputParser, YamlOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.document_loaders import WebBaseLoader
from langchain.schema.prompt_template import format_document
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai


# GOOGLE_API_KEY = "AIzaSyC4NWD6EqPQ-uM4xDX3MQ-Y7fgzQ1jrxU4"  # add your GOOGLE API key here
# os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
# llm = ChatGoogleGenerativeAI(model="gemini-pro")

# Definicion de rutas y constantes
PATH_CWD = "c:/wom/1_VIU/TFM/app-eventos-procolombia/"
PATH_DATA = PATH_CWD + "/src/data/"

FN_KEYW = 'db_eventos_keyw.csv'
FN_EVENTS = 'events_data.xlsx'
FN_KEYW_JSON = 'app_config.json'
ACCESS_PATH = PATH_CWD + "/.scrts/access.toml"
#
MODELS_DICT = {'Gemini':0, 'GROG-LLAMA2':1}


def cargar_contraseñas(nombre_archivo):
    with open(nombre_archivo, 'r') as f:
        contraseñas = toml.load(f)
    return contraseñas

# Define your desired data structure.
class event(BaseModel):
    resume: str = Field(description="The resume of the context in few words")
    there_is_event: str = Field(description="Defines if any asociative event is mentioned. If so answer 'Yes', if not answer 'No'")
    title: str = Field(description="The name of the event, dont use Acronyms, dont use punctuation marks, if not sure keep blank")
    general_title: Optional[str] = Field(description="The name of the event, dont use Acronyms, don't use punctuation marks, don't specify the version of the event, if not sure keep blank")
    date: Optional[str] = None 
    year: Optional[str] = Field(description="The year of the event, if not sure keep blank")
    description: Optional[str] = Field(description="The description of the event,don't use punctuation marks, if not sure keep blank")
    country: Optional[str] = Field(description="The location of the event, if not sure keep blank")
    city: Optional[str] = Field(description="The city of the event, if not sure keep blank")
    place: Optional[str] = Field(description="The name of the place where the event takes place, if not sure keep blank")
    key_words: Optional[str] = Field(description="Only five key words of thats describe de event, separated by comma")
    

class Event(BaseModel):
    title: str  = Field(description="The name of the event, dont use initials, dont use punctuation marks")
    year: Optional[str]   = Field(description="The year of the event")
    country: Optional[str] = Field(description="The location of the event")

class json_resp_events(BaseModel):
    events: List[Event] = Field(..., description="The Event details")

class eventAsist(BaseModel):
    title: str  = Field(description="The name of the event, dont use initials, dont use punctuation marks")
    participants: Optional[str]   = Field(description="The resume of the information in few words about event participation, if not information or you are not sure put None")


def query_google_search(google_query, page, search_engine_keys, add_params = {}):
  """
  Query the Google Custom Search API and return the results in a dictionary.

  Args:
      google_query (str): The query to search for.
      page (int): The page number to retrieve.
  Returns:
      A dictionary containing the search results
  """

  # using the first page
  page = page
  start = (page - 1) * 10 + 1

#   url = f"https://www.googleapis.com/customsearch/v1?key={search_engine_keys['KEY']}&cx={search_engine_keys['ID']}&q={google_query}&start={start}" + add_args
  url = "https://www.googleapis.com/customsearch/v1"
  params = {
      'key' : search_engine_keys['KEY'],
      'cx' : search_engine_keys['ID'],
      'q' : google_query
  }
  params.update(add_params)
  print(url)
  print(params)
  try:
      # Make the GET request to the Google Custom Search API
      google_response = requests.get(url, params=params)
      print(google_response.url)
      # Check if the request was successful (status code 200)
      if google_response.status_code == 200:
          # Parse the JSON response
          google_response_data = google_response.json()
          google_response_items = {}
          # get the result items
          search_items = google_response_data.get("items")
          # iterate over 10 results found
          for i, search_item in enumerate(search_items, start=1):
              try:
                  long_description = search_item["pagemap"]["metatags"][0]["og:description"]
              except KeyError:
                  long_description = "N/A"
              # get the page title
              title = search_item.get("title")
              # page snippet
              snippet = search_item.get("snippet")
              # alternatively, you can get the HTML snippet (bolded keywords)
              html_snippet = search_item.get("htmlSnippet")
              # extract the page url
              link = search_item.get("link")
              google_response_items[i] = {
                  'title': title,
                  'snippet': snippet,
                  'long_description': long_description,
                  'link': link
              }
          return google_response_items

      else:
          print(f"Error: {google_response.status_code}")
          return None
  except Exception as e:
      print(f"An error occurred: {e}")
      return None

def web_scrapper(url):
  """

  """
  # Initializing variable
  lang_request = TextRequestsWrapper()
  try:
    lang_request.get(url)
  except:
    print('ERR', 'Error scrappiing the url')
    return None
  # Initializing variable
  result = lang_request.get(url)
  # Initializing variable
  bs_result = BeautifulSoup(result)
  # Calculating result
  text = bs_result.get_text()
  text = text.replace("\n", " ")
  text = text.replace("\t", " ")
  return text

def es_archivo_pdf(url):
    try:
        # Realizar una solicitud HEAD para obtener solo los encabezados de la respuesta
        response = requests.head(url)
        
        # Verificar si la respuesta tiene el tipo de contenido "application/pdf"
        if 'application/pdf' in response.headers.get('Content-Type', ''):
            return True
        else:
            return False
    except requests.exceptions.RequestException as e:
        print("Error al hacer la solicitud:", e)
        return False

def preprocess(sentence):
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('punkt')
        nltk.download('stopwords')
    stop_words = list(set(stopwords.words('english'))) + list(set(stopwords.words('spanish')))
    word_tokens = word_tokenize(sentence.lower())
    return [word for word in word_tokens if word.isalnum() and word not in stop_words ]

def jaccard_similarity(sentence1, sentence2):
    words1 = set(preprocess(sentence1))
    words2 = set(preprocess(sentence2))
    intersection = len(words1.intersection(words2))
    try:
        if len(words1) == len(intersection):
            return 1
        else:
            union = len(words1.union(words2))
            return intersection / union if union != 0 else 0 
    except:
        union = len(words1.union(words2))
        return intersection / union if union != 0 else 0 

def check_similar (new_key, old_keys):
    for old_key in old_keys:
        similarity_score = jaccard_similarity(new_key, old_key)
        if similarity_score >= 0.7:
            return True
        else:
            continue
    return False

def extraer_informacion_general_gemini(url, API_KEY_GEMINI):
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "context":{context_str}
    \n{format_instructions}\n
    """
    # parser = YamlOutputParser(pydantic_object=event)
    parser = YamlOutputParser(pydantic_object=event)

    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
        
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result

def extraer_informacion_eventos_rel_gemini(url, event, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "event information":{event_str}
    "context":{context_str}
    \n{format_instructions}\n
    """
    parser = YamlOutputParser(pydantic_object=json_resp_events)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
        
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result

def rel_events_parser(yaml_events, df_hist_rel_events, event_key):
    df_rel_events = pd.DataFrame(columns=['event_key', 'rel_event_link', 'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country'])

    for event in yaml_events.events:
        events_related_parsed = {}
        rel_event_key = event.title + " | " + event.country + " | " + str(event.year)
        print(check_similar(event_key, [rel_event_key]) , check_similar(rel_event_key, df_hist_rel_events['rel_event_key']))
        if not check_similar(event_key, [rel_event_key]) and not check_similar(rel_event_key, df_hist_rel_events['rel_event_key']):
            if int(event.year) > dt.datetime.today().year -20:
                events_related_parsed['event_key'] = event_key
                events_related_parsed['rel_event_key'] = event.title + " | " + event.country + " | " + str(event.year)
                events_related_parsed['rel_event_title'] = event.title
                events_related_parsed['rel_event_country'] = event.country
                events_related_parsed['rel_event_year'] = event.year
                df_events_related_parsed = pd.DataFrame(events_related_parsed, index=[0])
                df_rel_events = pd.concat([df_rel_events, df_events_related_parsed])
    return df_rel_events

def buscar_eventos_relacionados(llm_result_event, contraseñas):

    df_rel_events = pd.DataFrame(columns=['event_key',  'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country','rel_event_link'])
    if llm_result_event.there_is_event == "True":

        for i in range(3):
            print(i)
            
            if i == 0:
                add_args = {
                    'lr': 'lang_eng|lang_esp'
                }
            elif i == 1:
                add_args = {
                    'lr': 'lang_esp'
                }
            elif i == 2:
                add_args = {
                    'lr': 'lang_esp',
                    'cr': 'countryCO'
                }
            if llm_result_event.title !="" and llm_result_event.general_title !=None:
                link_or_name = llm_result_event.general_title
                search_pattern = f"related: {link_or_name} "
            else:
                link_or_name = llm_result_event.title
                search_pattern = f"related: {link_or_name} "
            print("Criterio Busqueda:{}".format(search_pattern))
            google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"],add_args)
            for url in google_query_result:
                if es_archivo_pdf(google_query_result[url]['link']):
                    continue
                else:
                    print(google_query_result[url]['link'], search_pattern)
                    ref_event_info = "title:" + llm_result_event.title + "|" +"resume:" + llm_result_event.resume + "|"+"country:" + llm_result_event.country  + "|"+"year:" + llm_result_event.year
                    ref_event_key = llm_result_event.title + " | " + llm_result_event.country + " | " + llm_result_event.year 
                    try:   
                        yaml_events_related = extraer_informacion_eventos_rel_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                        df_events_related_link = rel_events_parser(yaml_events_related, df_rel_events, ref_event_key)
                        df_events_related_link ['rel_event_link'] = google_query_result[url]['link']
                        df_rel_events = pd.concat([df_rel_events, df_events_related_link])
                        if len(df_rel_events) >= 5:
                            return df_rel_events
                    except Exception as e:
                        print(e)
                        continue
    return df_rel_events
    
def extraer_informacion_asistentes_gemini(url, event, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Tu tarea es extraer de "context" la informacion disponible del numero de asistentes al evento {event_str} en el idioma del contexto". 
    "context":{context_str}
    \n{format_instructions}\n
    """
    parser = YamlOutputParser(pydantic_object=eventAsist)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result
    
def buscar_informacion_asistentes(llm_result_event, contraseñas):
    asistants_list = []
    for i in range(3):
        print(i)
        
        if llm_result_event.place !=None and llm_result_event.place !="":
            location = llm_result_event.place
        elif llm_result_event.country !=None and llm_result_event.country !="":
            location = llm_result_event.country
        else:
            location = ""
        search_pattern = f"{llm_result_event.general_title} {location}"
        if i == 0:
            add_args = {
                'lr': 'lang_eng|lang_esp'
            }
        elif i == 1:
            add_args = {
                'lr': 'lang_esp'
            }
        elif i == 2:
            add_args = {
                'lr': 'lang_esp',
                'cr': 'countryCO'
            }
        google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"], add_args)
  
        for url in google_query_result:
            if es_archivo_pdf(google_query_result[url]['link']):
                continue
            else:
                print(google_query_result[url]['link'], search_pattern)
                ref_event_info = "title:" + llm_result_event.title + "|" +"resume:" + llm_result_event.resume + "|"+"country:" + llm_result_event.country  + "|"+"year:" + llm_result_event.year
                try:   
                    yaml_envent_asistants = extraer_informacion_asistentes_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                    if yaml_envent_asistants.participants not in [None, 'None', '', ' '] and not check_similar(yaml_envent_asistants.participants, asistants_list):
                        asistants_list.append(yaml_envent_asistants.participants)
                        print(asistants_list)
                        if len(asistants_list) >=3:
                            return "|".join(asistants_list)  
                except Exception as e:
                    print(e)
                    continue
    return "|".join(asistants_list) 


In [150]:
dt.datetime.today().year -10

2014

In [7]:
contraseñas = cargar_contraseñas(ACCESS_PATH)
contraseñas

{'api_google_search': {'KEY': 'AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI',
  'ID': '525a973bdd364421f'},
 'api_gemini': {'KEY': 'AIzaSyC4NWD6EqPQ-uM4xDX3MQ-Y7fgzQ1jrxU4'}}

In [8]:
contraseñas['api_google_search']['ID']

'525a973bdd364421f'

In [39]:
url= "https://ieawc2023.org/"
# http://www.scielo.org.co/scielo.php?script=sci_abstract&pid=S0122-06672019000100059    ## URL con errores Bloqueo por informacion sensible
# https://www.unbosque.edu.co/educacion-continuada/evento/congreso-iise-region-centroamerica-y-suramerica-2023  ## Retry
# https://mtci.bvsalud.org/en/contributions-of-the-traditional-complementary-and-integrative-medicines-in-the-context-of-the-covid-19-pandemic
llm_result = extraer_informacion_general_gemini(url, contraseñas["api_gemini"]['KEY'])

llm_result


 11 - 15 December 2023 | Medellin, Colombia        Day One       Day Two         Day Three       Day Four         Day Five          See photo gallery            Plenary “Caught at a crossroads: can the world economy avoid a turn for the worse?"Speaker: Gita Gopinath (IMF)Chair: Dani Rodrik (Harvard University) See more          Plenary “The Persistence Paradox: Critical Junctures and Our Shared Future”Speaker: Nathan Nunn (UBC)Chair: Silvana Tenreyro (London School of Economics) See more          Plenary “Asia’s New Growth Trajectory: Navigating an International Economic Landscape Fractured by Great Power Rivalry”Speaker: Danny Quah (National University of Singapore)Chair: Wendy Carlin (University College London, CEPR and Santa Fe Institute) See more          Plenary “A New Global Economic Order”Speakers: Justin Yifu Lin (Peking University),Jayati Ghosh (UMass Amherst),Jose Antonio Ocampo (Columbia University),Lili Yan Ing (ERIA )Chair: Dani Rodrik (Harvard University) See more        

event(resume='The 20th World Congress of the International Economic Association (IEA) will be held in Medellin, Colombia from December 11 to 15, 2023.', there_is_event='True', title='20th World Congress of the International Economic Association', general_title='World Congress of the International Economic Association', date='December 11 to 15, 2023', year='2023', description='The congress will feature research presentations by leading economists from across the globe. More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions. There will also be a number of policy sessions and keynote lectures, with participants from governments and the corporate world.', country='Colombia', city='Medellin', place='Universidad EAFIT', key_words='Economics, Development, Globalization, Inequality, Sustainability')

In [63]:
llm_result

event(resume='The 20th World Congress of the International Economic Association (IEA) will be held in Medellin, Colombia from December 11 to 15, 2023.', there_is_event='True', title='20th World Congress of the International Economic Association', general_title='World Congress of the International Economic Association', date='December 11 to 15, 2023', year='2023', description='The congress will feature research presentations by leading economists from across the globe. More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions. There will also be a number of policy sessions and keynote lectures, with participants from governments and the corporate world.', country='Colombia', city='Medellin', place='Universidad EAFIT', key_words='Economics, Development, Globalization, Inequality, Sustainability')

In [157]:
llm_result

event(resume='The 20th World Congress of the International Economic Association (IEA) will be held in Medellin, Colombia from December 11 to 15, 2023.', there_is_event='True', title='20th World Congress of the International Economic Association', general_title='World Congress of the International Economic Association', date='December 11 to 15, 2023', year='2023', description='The congress will feature research presentations by leading economists from across the globe. More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions. There will also be a number of policy sessions and keynote lectures, with participants from governments and the corporate world.', country='Colombia', city='Medellin', place='Universidad EAFIT', key_words='Economics, Development, Globalization, Inequality, Sustainability')

In [156]:
def buscar_eventos_relacionados(llm_result_event, contraseñas):

    df_rel_events = pd.DataFrame(columns=['event_key',  'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country','rel_event_link'])
    if llm_result.there_is_event == "True":

        for i in range(3):
            print(i)
            
            if i == 0:
                add_args = {
                    'lr': 'lang_eng|lang_esp'
                }
            elif i == 1:
                add_args = {
                    'lr': 'lang_esp'
                }
            elif i == 2:
                add_args = {
                    'lr': 'lang_esp',
                    'cr': 'countryCO'
                }
            if llm_result.title !="" and llm_result.general_title !=None:
                link_or_name = llm_result.general_title
                search_pattern = f"related: {link_or_name} "
            else:
                link_or_name = llm_result.title
                search_pattern = f"related: {link_or_name} "
            print("Criterio Busqueda:{}".format(search_pattern))
            google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"],add_args)
            for url in google_query_result:
                if es_archivo_pdf(google_query_result[url]['link']):
                    continue
                else:
                    print(google_query_result[url]['link'], search_pattern)
                    ref_event_info = "title:" + llm_result.title + "|" +"resume:" + llm_result.resume + "|"+"country:" + llm_result.country  + "|"+"year:" + llm_result.year
                    ref_event_key = llm_result.title + " | " + llm_result.country + " | " + llm_result.year 
                    try:   
                        yaml_events_related = extraer_informacion_eventos_rel_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                        df_events_related_link = rel_events_parser(yaml_events_related, df_rel_events, ref_event_key)
                        df_events_related_link ['rel_event_link'] = google_query_result[url]['link']
                        df_rel_events = pd.concat([df_rel_events, df_events_related_link])
                        if len(df_rel_events) >= 5:
                            return df_rel_events
                    except Exception as e:
                        print(e)
                        continue
    return df_rel_events
        
buscar_eventos_relacionados(llm_result, contraseñas)


0
Criterio Busqueda:related: World Congress of the International Economic Association 
https://www.googleapis.com/customsearch/v1
{'q': 'related: World Congress of the International Economic Association ', 'key': 'AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI', 'cx': '525a973bdd364421f', 'lr': 'lang_eng|lang_esp'}
https://iea-world.org/ related: World Congress of the International Economic Association 
True False
https://ieawc2023.org/ related: World Congress of the International Economic Association 
True False
https://iea-world.org/congresses/past-congresses/ related: World Congress of the International Economic Association 
True False
https://www.imf.org/en/News/Articles/2023/12/11/sp121123-cold-war-ii-preserving-economic-cooperation-amid-geoeconomic-fragmentation related: World Congress of the International Economic Association 
True False
https://iea-world.org/the-17th-world-congress-of-the-international-economic-association/ related: World Congress of the International Economic Associa

Unnamed: 0,event_key,rel_event_key,rel_event_title,rel_event_year,rel_event_country,rel_event_link
0,20th World Congress of the International Econo...,17th World Congress of the International Econo...,17th World Congress of the International Econo...,2014,Jordan,https://iea-world.org/the-17th-world-congress-...
0,20th World Congress of the International Econo...,15th IHEA World Congress on Health Economics |...,15th IHEA World Congress on Health Economics,2023,South Africa,https://healtheconomics.org/
0,20th World Congress of the International Econo...,16th IHEA World Congress | Canada | 2025,16th IHEA World Congress,2025,Canada,https://healtheconomics.org/
0,20th World Congress of the International Econo...,19th World Congress of the International Econo...,19th World Congress of the International Econo...,2021,Indonesia,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,18th World Congress of the International Econo...,18th World Congress of the International Econo...,2017,Mexico,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,16th World Congress of International Economic ...,16th World Congress of International Economic ...,2011,China,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,15th World Congress | Turkey | 2008,15th World Congress,2008,Turkey,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,14th World Congress | Morocco | 2005,14th World Congress,2005,Morocco,https://iea-world.org/congresses/past-congresses/


In [98]:
llm_result.title

'20th World Congress of the International Economic Association'

In [211]:
ref_event_key = llm_result.title  +" " + llm_result.year + " " + llm_result.country 

search_pattern = f'+Congreso+Colombia+Universidad del Bosque'
# add_args = {
#     'lr': 'lang_esp|lang_eng',
#     'daterestrict':'y[10]',

# }
add_args = {
    'lr': 'lang_esp|lang_eng',
    'dateRestrict':'y[10]',
    'fileType': '-pdf',
    # 'sort': 'date',
    # 'orTerms':'Mundial|Internacional|Iberoamiricano|Panamericano|Latinoamericano'
    'exactTerms': '(Congreso).(Latinoamericano)'
}
google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"], add_args)
google_query_result

https://www.googleapis.com/customsearch/v1
{'key': 'AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI', 'cx': '525a973bdd364421f', 'q': '+Congreso+Colombia+Universidad del Bosque', 'lr': 'lang_esp|lang_eng', 'dateRestrict': 'y[10]', 'fileType': '-pdf', 'exactTerms': '(Congreso).(Latinoamericano)'}
https://www.googleapis.com/customsearch/v1?key=AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI&cx=525a973bdd364421f&q=%2BCongreso%2BColombia%2BUniversidad+del+Bosque&lr=lang_esp%7Clang_eng&dateRestrict=y%5B10%5D&fileType=-pdf&exactTerms=%28Congreso%29.%28Latinoamericano%29


{1: {'title': 'X Simposio Colombiano de Virología & VI Congreso ...',
  'snippet': 'X Simposio Colombiano de Virología & VI Congreso Latinoamericano ... El X Simposio Colombiano & VI Congreso ... Castellanos,, Universidad El Bosque, Bogotá,\xa0...',
  'long_description': 'N/A',
  'link': 'https://virologiacolombia2023.com/'},
 2: {'title': 'Inicio | Congreso IISE 2023 Región 16',
  'snippet': 'El congreso latinoamericano es un espacio en el que capítulos de diferentes partes del mundo se reúnen para mejorar las relaciones entre profesionales y\xa0...',
  'long_description': 'El Congreso IISE 2023 Región 16 es un espacio en el que capítulos de diferentes partes del mundo se reúnen para mejorar las relaciones entre profesionales y futuros ingenieros, promover el intercambio de conocimiento y crear experiencias que aporten a su desarrollo profesional.',
  'link': 'https://iise16region2023.wixsite.com/my-site'},
 3: {'title': 'CFP',
  'snippet': 'El congreso latinoamericano es un espacio e

In [146]:
def buscar_informacion_asistentes(llm_result_event, contraseñas):
    asistants_list = []
    for i in range(3):
        print(i)
        
        if llm_result_event.place !=None and llm_result_event.place !="":
            location = llm_result_event.place
        elif llm_result_event.country !=None and llm_result_event.country !="":
            location = llm_result_event.country
        else:
            location = ""
        search_pattern = f"{llm_result_event.general_title} {location}"
        if i == 0:
            add_args = {
                'lr': 'lang_eng|lang_esp'
            }
        elif i == 1:
            add_args = {
                'lr': 'lang_esp'
            }
        elif i == 2:
            add_args = {
                'lr': 'lang_esp',
                'cr': 'countryCO'
            }
        google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"], add_args)
  
        for url in google_query_result:
            if es_archivo_pdf(google_query_result[url]['link']):
                continue
            else:
                print(google_query_result[url]['link'], search_pattern)
                ref_event_key = llm_result.title + " | " + llm_result.country + " | " + llm_result.year 
                try:   
                    yaml_events_related = extraer_informacion_asistentes_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                    if yaml_events_related.participants not in [None, 'None', '', ' '] and not check_similar(yaml_events_related.participants, asistants_list):
                        asistants_list.append(yaml_events_related.participants)
                        print(asistants_list)
                        if len(asistants_list) >=3:
                            return "|".join(asistants_list)  
                except Exception as e:
                    print(e)
                    continue
    return "|".join(asistants_list)            
buscar_informacion_asistentes(llm_result, contraseñas)         


0
https://www.googleapis.com/customsearch/v1
{'q': 'World Congress of the International Economic Association Universidad EAFIT', 'key': 'AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI', 'cx': '525a973bdd364421f', 'lr': 'lang_eng|lang_esp'}
https://iea-world.org/ World Congress of the International Economic Association Universidad EAFIT
3278
https://ieawc2023.org/ World Congress of the International Economic Association Universidad EAFIT
1868
['More than 500 economists from all over the world are expected to attend.']
https://www.aeaweb.org/news/call-for-papers-iea-world-congress-feb-14-2023 World Congress of the International Economic Association Universidad EAFIT
1142
['More than 500 economists from all over the world are expected to attend.', 'More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions.']
https://ieawc2023.org/paper-submission/ World Congress of the International Economic Associa

['More than 500 economists from all over the world are expected to attend.',
 'More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions.',
 'Over 1,000 participants from 57 countries attended the 20th World Congress of the International Economic Association, including two Nobel Prize winners and representatives from nearly 40 partner organizations.']

In [123]:
asistants_list

['More than 500 economists from all over the world are expected to attend and over 400 papers will be presented in stand-alone and parallel academic sessions.',
 'Information about the number of attendees is not available.',
 'During five days, you, the more than 1.000 participants of the vigésimo congreso mundial de IEA made aún more potente to this community of talento that is EAFIT.',
 'over 1,000 participants']

In [61]:
yaml_events_related = extraer_informacion_asistentes_gemini("https://www.eafit.edu.co/ieawc2023", ref_event_info , contraseñas["api_gemini"]['KEY'])

In [62]:
yaml_events_related

eventAsist(title='20th World Congress of the International Economic Association', asistants='During five days, you, the more than 1,000 participants of the twentieth world congress of IEA made even more powerful this community of talent that is EAFIT.')

In [38]:
ref_event_info = "title:" + llm_result.title + "|" +"resume:" + llm_result.resume + "|"+"country:" + llm_result.country  + "|"+"year:" + llm_result.year
yaml_events_related = extraer_informacion_eventos_rel_gemini("https://iea-world.org/congresses/past-congresses/", ref_event_info , contraseñas["api_gemini"]['KEY'])
yaml_events_related

json_resp_events(events=[Event(title='19th World Congress of the International Economic Association', year='2021', country='Indonesia'), Event(title='18th World Congress of the International Economic Association', year='2017', country='Mexico'), Event(title='17th World Congress of the International Economic Association', year='2014', country='Jordan'), Event(title='16th World Congress of International Economic Association', year='2011', country='China'), Event(title='15th World Congress', year='2008', country='Turkey'), Event(title='14th World Congress', year='2005', country='Morocco'), Event(title='13th World Congress of the International Economic Association', year='2002', country='Portugal'), Event(title='12th Congress of the International Economic Association', year='1999', country='Argentina'), Event(title='11th World Congress of the International Economic Association', year='1995', country='Tunisia'), Event(title='10th World Congress of the International Economic Association', ye

In [34]:
web_scrapper("https://iea-world.org/congresses/past-congresses/")

'Not Acceptable!Not Acceptable!An appropriate representation of the requested resource could not be found on this server. This error was generated by Mod_Security.'

In [35]:
loader = WebBaseLoader('https://iea-world.org/congresses/past-congresses/')
docs = loader.load()

In [36]:
docs

[Document(page_content='\n\n\n\n\n\n\nPast Congresses | IEA\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\nAbout the IEA\n\nWelcome\nGeneral Information\nGovernance\nDiversity Committee\nCode of Conduct\n\n\nMembers\n\nBecome a Member\nMember Associations\nAssociate Members\nMembers News\n\n\nActivities\n\nFeatured Economists\nDiversifying Economics Globally\n\nIssues and Recommendations\nDiscussion Panels\nRemote Student Opportunities\nOnline Courses\nMentoring Opportunities\nCORE\n\n\nWorld Economy Working Group\nTechnology, Jobs, and Development Working Group\nDealing with Debt Working Group\nLecture Series\n\nEnvironmental Economics\n\n\nG20 Report\nIndustrial Policies\nCall for Papers\nRIDGE\nPast Events\nPublications\n\n\nIEA-WE\n\nAmplifying Women’s Voices\nGeneral info\nResearch Papers\nContribute to IEA-WE Amplifying Voices Initiative\n\n\nCongresses\n\nCongress 2023 Colombia – Plenaries\nCongress 2023 Col

In [30]:
yaml_events_related

json_resp_events(events=[Event(title='20th World Congress of the International Economic Association (IEA)', year='2023', country='Colombia')])

In [19]:
from urllib.parse import urlparse

def obtener_url_base(url):
    parsed_url = urlparse(url)
    return urlparse(url).scheme + "://" + urlparse(url).netloc

# Ejemplo de uso
url_completa = "https://www.ejemplo.com/ruta/de/ejemplo"
url_base = obtener_url_base(url_completa)
print("URL base:", url_base)

URL base: https://www.ejemplo.com


In [108]:
params = {
      'key' : 'a',
      'cx' : 'b'
  }

add_params = {
    
    'q': 'query'
}

params.update(add_params)

In [141]:
url = "https://iea-world.org/"
os.environ["GOOGLE_API_KEY"] = contraseñas["api_gemini"]['KEY']
llm = ChatGoogleGenerativeAI(model="gemini-pro")
model = genai.GenerativeModel('gemini-pro')
llm_prompt_template = """Tu tarea es extraer de "context" la informacion disponible del numero de asistentes al evento {event_str} en el idioma del contexto". 
"context":{context_str}
\n{format_instructions}\n
"""
parser = YamlOutputParser(pydantic_object=eventAsist)

# To extract data from WebBaseLoader
doc_prompt = PromptTemplate.from_template("{page_content}")

# Realizar el query a Gemini
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

llm_prompt = PromptTemplate(
    template=llm_prompt_template,
    input_variables=["context_str", "event_str"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
context = web_scrapper(url)
if context.startswith('Not Acceptable!'):
    loader = WebBaseLoader(url)
    docs = loader.load()
    doc_prompt = PromptTemplate.from_template("{page_content}")
    context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
print(tokens_size)
if tokens_size > 30000:
    print("Max tokens exceded")
else:
    stuff_chain = llm_prompt | llm | parser
    # llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )


3275
