In [69]:

import streamlit as st
import os, toml, requests
from typing import List, Dict, Optional, Union
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import PydanticOutputParser, YamlOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain_google_genai import ChatGoogleGenerativeAI
import pandas as pd

GOOGLE_API_KEY = "AIzaSyC4NWD6EqPQ-uM4xDX3MQ-Y7fgzQ1jrxU4"  # add your GOOGLE API key here
# os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
# llm = ChatGoogleGenerativeAI(model="gemini-pro")

# Definicion de rutas y constantes
PATH_CWD = "c:/wom/1_VIU/TFM/app-eventos-procolombia/"
PATH_DATA = PATH_CWD + "/src/data/"

FN_KEYW = 'db_eventos_keyw.csv'
FN_EVENTS = 'events_data.xlsx'
FN_KEYW_JSON = 'app_config.json'
ACCESS_PATH = PATH_CWD + "/.scrts/access.toml"
#
MODELS_DICT = {'Gemini':0, 'GROG-LLAMA2':1}


def cargar_contraseñas(nombre_archivo):
    with open(nombre_archivo, 'r') as f:
        contraseñas = toml.load(f)
    return contraseñas

# Define your desired data structure.
class json_resp(BaseModel):
    resume: str = Field(description="The resume of the context in few words")
    there_is_event: str = Field(description="Defines if any asociative event is mentioned. If so answer 'Yes', if not answer 'No'")
    title: str = Field(description="The name of the event, dont use initials, dont use punctuation marks, if not sure keep blank")
    general_title: str = Field(description="The name of the event, dont use initials, don't use punctuation marks, don't specify the version of the event, if not sure keep blank")
    date: Optional[date] = Field(description="The date of the event in format 'YY-MM-DD', if not sure keep blank")
    year: str = Field(description="The year of the event, if not sure keep blank")
    description: str = Field(description="The description of the event, if not sure keep blank")
    country: str = Field(description="The location of the event, if not sure keep blank")
    city: str = Field(description="The city of the event, if not sure keep blank")
    place: Optional[str] = Field(description="The name of the place where the event takes place, if not sure keep blank")
    key_words: str = Field(description="Only five key words of thats describe de event, separated by comma")
    

class Event(BaseModel):
    title: str  = Field(description="The name of the event, dont use initials, dont use punctuation marks")
    year: str   = Field(description="The year of the event")
    country: str = Field(description="The location of the event")

class json_resp_events(BaseModel):
    events: List[Event] = Field(..., description="The Event details")

def extraer_informacion_general_gemini(url, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    llm_prompt_template = """Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, answer the query.
    \n{format_instructions}\n{query}\n
    """
    
    loader = WebBaseLoader(url)
    
    docs = loader.load()
    parser = JsonOutputParser(pydantic_object=json_resp)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
    stuff_chain = llm_prompt | llm | parser 
    llm_result = stuff_chain.invoke({"context_str": context, "query": "Is There Any event in the document?"} )

    
    return llm_result

def extraer_informacion_general_gemini_esp(url, API_KEY_GEMINI):
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    llm_prompt_template = """Tu tarea es extraer informacion de cualquier evento mostrado en el siguiente "Context". 
    "context":{context_str}
    \n{format_instructions}\n
    """

    loader = WebBaseLoader(url)
    
    docs = loader.load()
    parser = JsonOutputParser(pydantic_object=json_resp)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
    print(context)
    stuff_chain = llm_prompt | llm | parser
    llm_result = stuff_chain.invoke({"context_str": context} )
    return llm_result

def query_google_search(google_query, page, search_engine_keys):
  """
  Query the Google Custom Search API and return the results in a dictionary.

  Args:
      google_query (str): The query to search for.
      page (int): The page number to retrieve.
  Returns:
      A dictionary containing the search results
  """

  # using the first page
  page = page
  start = (page - 1) * 10 + 1

  url = f"https://www.googleapis.com/customsearch/v1?key={search_engine_keys['KEY']}&cx={search_engine_keys['ID']}&q={google_query}&start={start}"
  print(url)
  try:
      # Make the GET request to the Google Custom Search API
      google_response = requests.get(url)

      # Check if the request was successful (status code 200)
      if google_response.status_code == 200:
          # Parse the JSON response
          google_response_data = google_response.json()
          google_response_items = {}
          # get the result items
          search_items = google_response_data.get("items")
          # iterate over 10 results found
          for i, search_item in enumerate(search_items, start=1):
              try:
                  long_description = search_item["pagemap"]["metatags"][0]["og:description"]
              except KeyError:
                  long_description = "N/A"
              # get the page title
              title = search_item.get("title")
              # page snippet
              snippet = search_item.get("snippet")
              # alternatively, you can get the HTML snippet (bolded keywords)
              html_snippet = search_item.get("htmlSnippet")
              # extract the page url
              link = search_item.get("link")
              google_response_items[i] = {
                  'title': title,
                  'snippet': snippet,
                  'long_description': long_description,
                  'link': link
              }
          return google_response_items

      else:
          print(f"Error: {google_response.status_code}")
          return None
  except Exception as e:
      print(f"An error occurred: {e}")
      return None

def extraer_informacion_eventos_rel_gemini(url, event, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")

    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "event information":{event_str}
    "context":{context_str}
    \n{format_instructions}\n
    """

    loader = WebBaseLoader(url)
    
    docs = loader.load()
    parser = YamlOutputParser(pydantic_object=json_resp_events)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
    print(context)
    stuff_chain = llm_prompt | llm | parser
    llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
    return llm_result


    
def es_archivo_pdf(url):
    try:
        # Realizar una solicitud HEAD para obtener solo los encabezados de la respuesta
        response = requests.head(url)
        
        # Verificar si la respuesta tiene el tipo de contenido "application/pdf"
        if 'application/pdf' in response.headers.get('Content-Type', ''):
            return True
        else:
            return False
    except requests.exceptions.RequestException as e:
        print("Error al hacer la solicitud:", e)
        return False
    



TypeError: typing.Optional requires a single type. Got FieldInfo(default=PydanticUndefined, description="The date of the event in format 'YY-MM-DD', if not.

In [2]:
contraseñas = cargar_contraseñas(ACCESS_PATH)
contraseñas

{'api_google_search': {'KEY': 'AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI',
  'ID': '525a973bdd364421f'},
 'api_gemini': {'KEY': 'AIzaSyC4NWD6EqPQ-uM4xDX3MQ-Y7fgzQ1jrxU4'}}

In [3]:
contraseñas['api_google_search']['ID']

'525a973bdd364421f'

In [54]:
url= "https://www.unbosque.edu.co/educacion-continuada/evento/congreso-iise-region-centroamerica-y-suramerica-2023"
# llm_result = extraer_informacion_general_gemini(url, contraseñas["api_gemini"]['KEY'])
llm_result = extraer_informacion_general_gemini_esp(url, contraseñas["api_gemini"]['KEY'])

llm_result



403 Forbidden

403 Forbidden





{'resume': 'The provided context does not mention any event.',
 'there_is_event': 'No',
 'title': None,
 'general_title': None,
 'date': None,
 'year': None,
 'description': None,
 'country': None,
 'city': None,
 'place': None,
 'key_words': None}

In [85]:
url= "https://www.unbosque.edu.co/educacion-continuada/evento/congreso-iise-region-centroamerica-y-suramerica-2023"

def web_scrapper(url):
  """

  """
  # Initializing variable
  lang_request = TextRequestsWrapper()
  try:
    lang_request.get(url)
  except:
    print('ERR', 'Error scrappiing the url')
    return None
  # Initializing variable
  result = lang_request.get(url)
  # Initializing variable
  bs_result = BeautifulSoup(result)
  # Calculating result
  text = bs_result.get_text()
  text = text.replace("\n", " ")
  text = text.replace("\t", " ")
  return text# Define your desired data structure.

class event(BaseModel):
    resume: str = Field(description="The resume of the context in few words")
    there_is_event: str = Field(description="Defines if any asociative event is mentioned. If so answer 'Yes', if not answer 'No'")
    title: str = Field(description="The name of the event, dont use initials, dont use punctuation marks, if not sure keep blank")
    general_title: Optional[str] = Field(description="The name of the event, dont use initials, don't use punctuation marks, don't specify the version of the event, if not sure keep blank")
    date: Optional[str] = None 
    year: Optional[str] = Field(description="The year of the event, if not sure keep blank")
    description: Optional[str] = Field(description="The description of the event,don't use punctuation marks, if not sure keep blank")
    country: Optional[str] = Field(description="The location of the event, if not sure keep blank")
    city: Optional[str] = Field(description="The city of the event, if not sure keep blank")
    place: Optional[str] = Field(description="The name of the place where the event takes place, if not sure keep blank")
    key_words: Optional[str] = Field(description="Only five key words of thats describe de event, separated by comma")
    
    
def extraer_informacion_general_gemini_esp(url, API_KEY_GEMINI):
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "context":{context_str}
    \n{format_instructions}\n
    """
    # parser = YamlOutputParser(pydantic_object=event)
    parser = YamlOutputParser(pydantic_object=event)

    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    print(context)
    stuff_chain = llm_prompt | llm | parser
    llm_result = stuff_chain.invoke({"context_str": context} )
    return llm_result
  
url= "https://www.unbosque.edu.co/educacion-continuada/evento/congreso-iise-region-centroamerica-y-suramerica-2023"
url = "https://urosario.edu.co/convocatoria/congreso-internacional-de-jovenes-investigadoras-e-investigadores-calas-7441"
llm_result = extraer_informacion_general_gemini_esp(url, contraseñas["api_gemini"]['KEY'])
llm_result

 Línea de atención a estudiantes: (601) 297 0200 opción 1 y 1 I Calle 12C Nº 6-25 - Bogotá D.C. Colombia. Términos de uso - Registra aquí: URnaconstructiva       Chatea con nosotros               Institución de educación superior sujeta a la inspección y vigilancia del Mineducación. | Personería Jurídica: Resolución 58 del 16 de septiembre de 1895 expedida por el Ministerio de Gobierno. | Notificaciones judiciales en juridica@urosario.edu.co  Transparencia y acceso a la información pública  Gobierno Universitario | Proyecto Educativo Institucional | Informe de Gestión | Boletín Estadístico | Régimen Tributario | Código de Ética                                    d de Jurisprudencia   Pregrados   Especializaciones   Maestrías   Maestrías GSB   Doctorado     Facultad de Ciencias Naturales    Facultad de Ciencias Naturales   Pregrados   Especialización   Maestrías   Doctorados     Facultad de Creación    Facultad de Creación   Pregrados     Escuela de Ciencias Humanas    Escuela de Cienci

event(resume='International congress for young researchers.', there_is_event='True', title='Congreso Internacional de Jóvenes Investigadoras e Investigadores (CALAS)', general_title='Congreso Internacional de Jóvenes Investigadoras e Investigadores', date='12-14 de junio de 2023', year='2023', description=None, country=None, city=None, place=None, key_words='researchers, latin america, futures, society, human sciences')

event(resume='The 2023 Central and South America Region Congress of the Institute of Industrial and Systems Engineers (IISE)', there_is_event='True', title='IISE Region Central America and South America Congress 2023', general_title='IISE Region Central America and South America Congress', date='September 6-8, 2023', year='2023', description='The IISE Latin American Congress is a space where chapters from different parts of the world come together to improve relationships between professionals and future engineers, promote the exchange of knowledge, and create experiences that contribute to their professional development. This year, Universidad El Bosque is pleased to host the congress, which will take place from September 6 to 8 in Bogotá, Colombia, bringing attendees a 3-day event on the university campus with plenaries, presentations, competitions, and workshops. This congress seeks to build bridges between the Latin American community of the Institute of Industrial and Systems Engi

In [56]:

import pandas as pd
import numpy as np
import requests
import datetime as dt

from langchain.utilities import TextRequestsWrapper

# Web scrapper
from bs4 import BeautifulSoup
import re

def web_scrapper(url):
  """

  """
  # Initializing variable
  lang_request = TextRequestsWrapper()
  try:
    lang_request.get(url)
  except:
    print('ERR', 'Error scrappiing the url')
    return None

  # Initializing variable
  result = lang_request.get(url)

  # Initializing variable
  bs_result = BeautifulSoup(result)

  # Calculating result
  text = bs_result.get_text()
  text = text.replace("\n", " ")
  text = text.replace("\t", " ")
  return text
url= "https://www.unbosque.edu.co/educacion-continuada/evento/congreso-iise-region-centroamerica-y-suramerica-2023"
web_scrapper(url)

'                                       CONGRESO\xa0IISE\xa0Región Centroamérica y Suramérica 2023  | Universidad El Bosque                 Cerrar               Menu Superior Home Izquierdo       Estudiantes   Acádemicos   Administrativos   Graduados           Menu Superior Home Derecho       FACULTADES   El Bosque en digital   Directorio   Idioma   Pagos      Buscar                                              ¡Estudia en otros países! Viaja y conoce todo lo que el mundo puede ofrecerte.       INTERNACIONALIZACIÓN             Menú Facultades Home       Facultad de Ciencias   Facultad de Ciencias Económicas y Administrativas   Facultad de Ciencias Jurídicas y Políticas   Facultad de Creación y Comunicación   Facultad de Educación   Facultad de Enfermería   Facultad de Ingeniería   Facultad de Medicina   Facultad de Odontología   Facultad de Psicología   Departamento de Humanidades   Departamento de Bioética                                        Adquiere más información sobre tus temas

In [44]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Verificar si los recursos necesarios de NLTK están descargados
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')
# Función para preprocesar y obtener las palabras clave de una oración
def preprocess(sentence):
    stop_words = list(set(stopwords.words('english'))) + list(set(stopwords.words('spanish')))
    word_tokens = word_tokenize(sentence.lower())
    return [word for word in word_tokens if word.isalnum() and word not in stop_words ]

# Función para calcular la similitud de Jaccard entre dos conjuntos de palabras
def jaccard_similarity(sentence1, sentence2):
    words1 = set(preprocess(sentence1))
    words2 = set(preprocess(sentence2))
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))
    return intersection / union if union != 0 else 0  # Evitar división por cero

def check_similar (new_key, old_keys):
    for old_key in old_keys:
        similarity_score = jaccard_similarity(new_key, old_key)
        if similarity_score >= 0.7:
            return True
        else:
            continue
    return False

def rel_events_parser(yaml_events, df_hist_rel_events, event_key):
    df_rel_events = pd.DataFrame(columns=['event_key', 'rel_event_link', 'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country'])

    for event in yaml_events.events:
        events_related_parsed = {}
        event_key = llm_result['title'] + " | " +llm_result['country'] + " | " +llm_result['year']
        rel_event_key = event.title + " | " + event.country + " | " + str(event.year)
        print(check_similar(event_key, [rel_event_key]) , check_similar(rel_event_key, df_hist_rel_events['rel_event_key']))
        if not check_similar(event_key, [rel_event_key]) and not check_similar(rel_event_key, df_hist_rel_events['rel_event_key']):
            events_related_parsed['event_key'] = event_key
            events_related_parsed['rel_event_key'] = event.title + " | " + event.country + " | " + str(event.year)
            events_related_parsed['rel_event_title'] = event.title
            events_related_parsed['rel_event_country'] = event.country
            events_related_parsed['rel_event_year'] = event.year
            df_events_related_parsed = pd.DataFrame(events_related_parsed, index=[0])
            df_rel_events = pd.concat([df_rel_events, df_events_related_parsed])
    return df_rel_events

In [45]:
df_rel_events = pd.DataFrame(columns=['event_key',  'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country','rel_event_link'])
if llm_result['there_is_event'] == "Yes":

    if llm_result['general_title'] !="" and llm_result['general_title'] !=None:
        link_or_name = llm_result['general_title']
        search_pattern = f"related: {link_or_name} "
    else:
        link_or_name = llm_result['title']
        search_pattern = f"related: {link_or_name} "
    print("Criterio Busqueda:{}".format(search_pattern))
    google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"])
    for url in google_query_result:
        if es_archivo_pdf(google_query_result[url]['link']):
            continue
        else:
            print(google_query_result[url]['link'], search_pattern)
            ref_event_info = "title:" + llm_result['title'] + "|" +"resume:" + llm_result['resume'] + "|"+"country:" + llm_result['country']  + "|"+"year:" + llm_result['year']
            ref_event_key = llm_result['title'] + " | " + llm_result['country'] + " | " + llm_result['year'] 
            try:   
                yaml_events_related = extraer_informacion_eventos_rel_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                df_events_related_link = rel_events_parser(yaml_events_related, df_rel_events, ref_event_key)
                df_events_related_link ['rel_event_link'] = google_query_result[url]['link']
                df_rel_events = pd.concat([df_rel_events, df_events_related_link])
            except:
                continue


Criterio Busqueda:related: World Congress of the International Economic Association 
https://www.googleapis.com/customsearch/v1?key=AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI&cx=525a973bdd364421f&q=related: World Congress of the International Economic Association &start=1
https://iea-world.org/ related: World Congress of the International Economic Association 
True False
https://ieawc2023.org/ related: World Congress of the International Economic Association 
True False
https://iea-world.org/congresses/past-congresses/ related: World Congress of the International Economic Association 
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
https://www.imf.org/en/News/Articles/2023/12/11/sp121123-cold-war-ii-preserving-economic-cooperation-amid-geoeconomic-fragmentation related: World Congress of the International Economic Association 
True False
https://iea-world.org/the-17th-world-congress-of-the-international-economic-assoc

In [38]:
df_rel_events

Unnamed: 0,event_key,rel_event_key,rel_event_title,rel_event_year,rel_event_country,rel_event_link
0,20th World Congress of the International Econo...,19th World Congress of the International Econo...,19th World Congress of the International Econo...,2021,Indonesia,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,18th World Congress of the International Econo...,18th World Congress of the International Econo...,2017,Mexico,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,17th World Congress of the International Econo...,17th World Congress of the International Econo...,2014,Jordan,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,16th World Congress of International Economic ...,16th World Congress of International Economic ...,2011,China,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,15th World Congress | Turkey | 2008,15th World Congress,2008,Turkey,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,14th World Congress | Morocco | 2005,14th World Congress,2005,Morocco,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,XIII World Congress of the International Econo...,XIII World Congress of the International Econo...,2002,Portugal,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Twelfth Congress of the International Economic...,Twelfth Congress of the International Economic...,1999,Argentina,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Eleventh World Congress of the International E...,Eleventh World Congress of the International E...,1995,Tunisia,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Tenth World Congress of the International Econ...,Tenth World Congress of the International Econ...,1992,Russia,https://iea-world.org/congresses/past-congresses/


In [291]:
df_rel_events

Unnamed: 0,event_key,rel_event_key,rel_event_title,rel_event_year,rel_event_country,rel_event_link
0,20th World Congress of the International Econo...,19th World Congress of the International Econo...,19th World Congress of the International Econo...,2021,Indonesia,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,18th World Congress of the International Econo...,18th World Congress of the International Econo...,2017,Mexico,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,17th World Congress of the International Econo...,17th World Congress of the International Econo...,2014,Jordan,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,16th World Congress of International Economic ...,16th World Congress of International Economic ...,2011,China,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,15th World Congress | Turkey | 2008,15th World Congress,2008,Turkey,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,14th World Congress | Morocco | 2005,14th World Congress,2005,Morocco,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,XIII World Congress of the International Econo...,XIII World Congress of the International Econo...,2002,Portugal,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Twelfth Congress of the International Economic...,Twelfth Congress of the International Economic...,1999,Argentina,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Eleventh World Congress of the International E...,Eleventh World Congress of the International E...,1995,Tunisia,https://iea-world.org/congresses/past-congresses/
0,20th World Congress of the International Econo...,Tenth World Congress of the International Econ...,Tenth World Congress of the International Econ...,1992,Moscow,https://iea-world.org/congresses/past-congresses/


In [35]:
event_info = "title:" + llm_result['title'] + "|" +"resume:" + llm_result['resume'] + "|"+"country:" + llm_result['country']  + "|"+"year:" + llm_result['year'] 
yaml_events_related = extraer_informacion_eventos_rel_gemini("https://iea-world.org/congresses/past-congresses/", event_info , contraseñas["api_gemini"]['KEY'])
yaml_events_related

json_resp_events(events=[Event(title='19th World Congress of the International Economic Association', year='2021', country='Indonesia'), Event(title='18th World Congress of the International Economic Association', year='2017', country='Mexico'), Event(title='17th World Congress of the International Economic Association', year='2014', country='Jordan'), Event(title='16th World Congress of International Economic Association', year='2011', country='China'), Event(title='15th World Congress', year='2008', country='Turkey'), Event(title='14th World Congress', year='2005', country='Morocco'), Event(title='XIII World Congress of the International Economic Association', year='2002', country='Portugal'), Event(title='Twelfth Congress of the International Economic Association', year='1999', country='Argentina'), Event(title='Eleventh World Congress of the International Economic Association', year='1995', country='Tunisia'), Event(title='Tenth World Congress of the International Economic Associat

In [30]:
yaml_events_related

json_resp_events(events=[Event(title='20th World Congress of the International Economic Association (IEA)', year='2023', country='Colombia')])

In [286]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Verificar si los recursos necesarios de NLTK están descargados
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')
# Función para preprocesar y obtener las palabras clave de una oración
def preprocess(sentence):
    stop_words = list(set(stopwords.words('english'))) + list(set(stopwords.words('spanish')))
    word_tokens = word_tokenize(sentence.lower())
    return [word for word in word_tokens if word.isalnum() and word not in stop_words ]

# Función para calcular la similitud de Jaccard entre dos conjuntos de palabras
def jaccard_similarity(sentence1, sentence2):
    words1 = set(preprocess(sentence1))
    words2 = set(preprocess(sentence2))
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))
    return intersection / union if union != 0 else 0  # Evitar división por cero

def check_similar (new_key, old_keys):
    for old_key in old_keys:
        similarity_score = jaccard_similarity(new_key, old_key)
        if similarity_score >= 0.7:
            return True
        else:
            continue
    return False

def rel_events_parser(yaml_events, df_hist_rel_events):
    df_rel_events = pd.DataFrame(columns=['event_key', 'rel_event_link', 'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country'])
    list_events_related_parsed = []
    # list_events_related_parsed['events']
    for event in yaml_events.events:
        events_related_parsed = {}
        event_key = llm_result['title'] + " | " +llm_result['country'] + " | " +llm_result['year']
        rel_event_key = event.title + " | " + event.country + " | " + str(event.year)
        
        if not check_similar(event_key, [rel_event_key]) and not check_similar(rel_event_key, df_hist_rel_events['rel_event_key']):
            events_related_parsed['event_key'] = event_key
            events_related_parsed['rel_event_key'] = event.title + " | " + event.country + " | " + str(event.year)
            events_related_parsed['rel_event_title'] = event.title
            events_related_parsed['rel_event_country'] = event.country
            events_related_parsed['rel_event_year'] = event.year
            df_events_related_parsed = pd.DataFrame(events_related_parsed, index=[0])
            df_rel_events = pd.concat([df_rel_events, df_events_related_parsed])
    return df_rel_events

df_rel_events = pd.DataFrame(columns=['event_key', 'rel_event_link', 'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country'])
df_events_related_link = rel_events_parser(yaml_events_related, df_rel_events)

In [288]:
df_events_related_link

Unnamed: 0,event_key,rel_event_link,rel_event_key,rel_event_title,rel_event_year,rel_event_country


In [123]:
print("Criterio Busqueda:{}".format(search_pattern))
google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"])

Criterio Busqueda:related: World Congress of the International Economic Association 
https://www.googleapis.com/customsearch/v1?key=AIzaSyA2c1UUoLgsLUYDxkkOvh-UGzIZbkEziQI&cx=525a973bdd364421f&q=related: World Congress of the International Economic Association &start=1


In [145]:
from pydantic import BaseModel, Field
from typing import List

# Definir el modelo para un evento
class Event(BaseModel):
    title: str
    year: str
    country: str

# Definir el modelo para el JSON completo
class JsonRespEvents(BaseModel):
    events: List[Event]

# JSON proporcionado
json_data = '''
{"events": [{"title": "19th World Congress of the International Economic Association", "year": "2021", "country": "Indonesia"}, {"title": "18th World Congress of the International Economic Association", "year": "2017", "country": "Mexico"}, {"title": "17th World Congress of the International Economic Association", "year": "2014", "country": "Jordan"}, {"title": "16th World Congress of International Economic Association", "year": "2011", "country": "China"}, {"title": "15th World Congress", "year": "2008", "country": "Turkey"}, {"title": "14th World Congress", "year": "2005", "country": "Marrakech"}, {"title": "XIII World Congress of the International Economic Association", "year": "2002", "country": "Portugal"}, {"title": "Twelfth Congress of the International Economic Association", "year": "1999", "country": "Argentina"}, {"title": "Eleventh World Congress of the International Economic Association", "year": "1995", "country": "Tunisia"}, {"title": "Tenth World Congress of the International Economic Association", "year": "1992", "country": "Moscow"}]}
'''

# Crear una instancia del modelo utilizando el JSON
json_resp_events = JsonRespEvents.parse_raw(json_data)

# Validar y obtener el JSON resultante
validated_json = json_resp_events.json()

print(validated_json)

{"events":[{"title":"19th World Congress of the International Economic Association","year":"2021","country":"Indonesia"},{"title":"18th World Congress of the International Economic Association","year":"2017","country":"Mexico"},{"title":"17th World Congress of the International Economic Association","year":"2014","country":"Jordan"},{"title":"16th World Congress of International Economic Association","year":"2011","country":"China"},{"title":"15th World Congress","year":"2008","country":"Turkey"},{"title":"14th World Congress","year":"2005","country":"Marrakech"},{"title":"XIII World Congress of the International Economic Association","year":"2002","country":"Portugal"},{"title":"Twelfth Congress of the International Economic Association","year":"1999","country":"Argentina"},{"title":"Eleventh World Congress of the International Economic Association","year":"1995","country":"Tunisia"},{"title":"Tenth World Congress of the International Economic Association","year":"1992","country":"Mos

In [195]:
class Event(BaseModel):
    title: str = Field(description="The title of the event, if not sure keep blank")
    year: str = Field(description="The year of the event, if not sure keep blank")
    country: str = Field(description="The location of the event, if not sure keep blank")

class Events(BaseModel):
    events: List[Event] = Field(..., description="The Event details")
    
output_parser = PydanticOutputParser(pydantic_object = Events)
misformatted = """ {"events": [{"title": "19th World Congress of the International Economic Association", "year": "2021", "country": "Indonesia"}, {"title": "18th World Congress of the International Economic Association", "year": "2017", "country": "Mexico"}, {"title": "17th World Congress of the International Economic Association", "year": "2014", "country": "Jordan"}, {"title": "16th World Congress of International Economic Association", "year": "2011", "country": "China"}, {"title": "15th World Congress", "year": "2008", "country": "Turkey"}, {"title": "14th World Congress", "year": "2005", "country": "Morocco"}, {"title": "XIII World Congress of the International Economic Association", "year": "2002", "country": "Portugal"}, {"title": "Twelfth Congress of the International Economic Association", "year": "1999", "country": "Argentina"}, {"title": "Eleventh World Congress of the International Economic Association", "year": "1995", "country": "Tunisia"}, {"title": "Tenth World Congress of the International Economic Association", "year": "1992", "country": "Moscow"}]}"""
output_parser.parse(misformatted)

Events(events=[Event(title='19th World Congress of the International Economic Association', year='2021', country='Indonesia'), Event(title='18th World Congress of the International Economic Association', year='2017', country='Mexico'), Event(title='17th World Congress of the International Economic Association', year='2014', country='Jordan'), Event(title='16th World Congress of International Economic Association', year='2011', country='China'), Event(title='15th World Congress', year='2008', country='Turkey'), Event(title='14th World Congress', year='2005', country='Morocco'), Event(title='XIII World Congress of the International Economic Association', year='2002', country='Portugal'), Event(title='Twelfth Congress of the International Economic Association', year='1999', country='Argentina'), Event(title='Eleventh World Congress of the International Economic Association', year='1995', country='Tunisia'), Event(title='Tenth World Congress of the International Economic Association', year

In [252]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Verificar si los recursos necesarios de NLTK están descargados
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

# Función para preprocesar y obtener las palabras clave de una oración
def preprocess(sentence):
    stop_words = list(set(stopwords.words('english'))) + list(set(stopwords.words('spanish')))
    word_tokens = word_tokenize(sentence.lower())
    return [word for word in word_tokens if word.isalnum() and word not in stop_words ]

# Función para calcular la similitud de Jaccard entre dos conjuntos de palabras
def jaccard_similarity(sentence1, sentence2):
    words1 = set(preprocess(sentence1))
    print(words1)
    words2 = set(preprocess(sentence2))
    print(words2)
    intersection = len(words1.intersection(words2))
    print(words1.intersection(words2))
    union = len(words1.union(words2))
    print(words1.union(words2))
    return intersection / union if union != 0 else 0  # Evitar división por cero

# Ejemplo de uso
sentence1 = "19th World Congress of the International Economic Association Association Association"
sentence2 = "XIII WORLD CONGRESS OF THE INTERNATIONAL ECONOMIC ASSOCIATION"
similarity_score = jaccard_similarity(sentence1, sentence2)
print("Similitud de Jaccard entre las oraciones:")
print(similarity_score)

{'economic', 'world', 'international', '19th', 'association', 'congress'}
{'economic', 'world', 'international', 'xiii', 'association', 'congress'}
{'economic', 'world', 'international', 'association', 'congress'}
{'economic', 'world', 'international', 'xiii', '19th', 'association', 'congress'}
Similitud de Jaccard entre las oraciones:
0.7142857142857143


In [19]:
from urllib.parse import urlparse

def obtener_url_base(url):
    parsed_url = urlparse(url)
    return urlparse(url).scheme + "://" + urlparse(url).netloc

# Ejemplo de uso
url_completa = "https://www.ejemplo.com/ruta/de/ejemplo"
url_base = obtener_url_base(url_completa)
print("URL base:", url_base)

URL base: https://www.ejemplo.com
