In [1]:

import streamlit as st
import os, toml, requests
import requests
import datetime as dt
import pandas as pd
import nltk, json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Union
from langchain.utilities import TextRequestsWrapper
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import PydanticOutputParser, YamlOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain.document_loaders import WebBaseLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain.schema.prompt_template import format_document
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from menu import menu

# GOOGLE_API_KEY = "AIzaSyC4NWD6EqPQ-uM4xDX3MQ-Y7fgzQ1jrxU4"  # add your GOOGLE API key here
# os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
# llm = ChatGoogleGenerativeAI(model="gemini-pro")

# Definicion de rutas y constantes
PATH_CWD = "C:/wom/1_VIU/TFM/app-eventos-procolombia"
PATH_DATA = PATH_CWD + "/src/data/"

FN_KEYW = 'db_eventos_keyw.csv'
FN_EVENTS = 'events_data.xlsx'
FN_KEYW_JSON = 'app_config.json'
ACCESS_PATH = PATH_CWD + "/.scrts/access.toml"
#
MODELS_DICT = {'Gemini':0, 'GROG-LLAMA2':1}

# Configuracion de la pagina

# Define your desired data structure.
class event(BaseModel):
    resume: str = Field(description="The resume of the context in few words")
    there_is_event: str = Field(description="Defines if any asociative event is mentioned. If so answer 'Yes', if not answer 'No'")
    title: str = Field(description="The name of the event, dont use Acronyms, dont use colon punctuation, if not sure keep blank")
    general_title: Optional[str] = Field(description="The name of the event, dont use Acronyms, don't use colon punctuation, don't specify the version of the event, if not sure keep blank")
    date: Optional[str] = None 
    year: Optional[str] = Field(description="The year of the event, if not sure keep blank")
    description: Optional[str] = Field(description="The description of the event, don't use colon punctuation, if not sure keep blank")
    country: Optional[str] = Field(description="The location of the event, if not sure keep blank")
    city: Optional[str] = Field(description="The city of the event, if not sure keep blank")
    place: Optional[str] = Field(description="The name of the place where the event takes place, if not sure keep blank")
    key_words: Optional[str] = Field(description="Only five key words of thats describe de event, separated by comma")
    
class Event(BaseModel):
    title: str  = Field(description="The name of the event, dont use initials, dont use punctuation marks")
    year: Optional[str]   = Field(description="The year of the event")
    country: Optional[str] = Field(description="The location of the event")

class json_resp_events(BaseModel):
    events: List[Event] = Field(..., description="The Event details")

class eventAsist(BaseModel):
    title: str  = Field(description="The name of the event, dont use initials, dont use punctuation marks")
    participants: Optional[str]   = Field(description="The resume of the information in few words about event participation, if not information or you are not sure put None")

def cargar_contraseñas(nombre_archivo):
    with open(nombre_archivo, 'r') as f:
        contraseñas = toml.load(f)
    return contraseñas

def cargar_llm(GEMINI_API):

    os.environ["GOOGLE_API_KEY"] = GEMINI_API
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    return llm

def cargar_configuracion():
    
    if not os.path.exists(PATH_DATA + FN_KEYW_JSON):
        configuracion = {
        "modelo": "Gemini",
        "paginas":1,
        "criterios": ["World Congress-Colombia", "Eventos-Colombia-Bogota"],
        "patrones_busqueda":{
                "Esp":{"alcance": ["Mundial", "Internacional"], "tipo_evento": ["Congreso", "Simposio"]}, 
                "Ing":{"alcance": ["World", "International"], "tipo_evento": ["Congress", "Simposium"]}
                  },
        "lugares_busqueda":{
                "Esp":["", "Universidad del Bosque"],
                "Ing":["", "Universidad del Bosque"]
                }
        }
        with open(PATH_DATA + FN_KEYW_JSON, "w") as archivo:
            json.dump(configuracion, archivo, indent=4)

    else:
        with open(PATH_DATA + FN_KEYW_JSON, 'r') as archivo:
            configuracion = json.load(archivo)
        # lista_criterios = []
        # for criterio in configuracion['criterios']:
        #     palabras = []
        #     palabras = ['"{}"'.format(cadena) for cadena in criterio.split("-")]
        #     lista_criterios.append(' + '.join(palabras))
        # configuracion['criterios'] = lista_criterios
    return configuracion

def obtener_criterios_busqueda(config):
    list_search_params = []
    for idioma in config['patrones_busqueda']:
        for alcance in config['patrones_busqueda'][idioma]['alcance']:
            for tipo_evento in config['patrones_busqueda'][idioma]['tipo_evento']:
                if idioma == "Eng":
                    for lugar in config['lugares_busqueda']['Eng']:
                        search_params = {
                                        'q': f'+{tipo_evento}+Colombia+{lugar}',
                                        'lr': 'lang_eng',
                                        'exactTerms': f'({tipo_evento}).({alcance})'
                                        }
                        # search_params = {
                        #                 'tipo_evento':tipo_evento,
                        #                 'alcance':alcance,
                        #                 'lugar': lugar,
                        #                 'lang':'lang_eng'
                        #                 }
                        list_search_params.append(search_params)
                if idioma == "Esp":
                    for lugar in config['lugares_busqueda']['Esp']:
                        search_params = {
                                        'q': f'+{tipo_evento}+colombia+{lugar}',
                                        'lr': 'lang_esp',
                                        'exactTerms': f'({tipo_evento}).({alcance})'
                                        }
                        # search_params = {
                        #                 'tipo_evento':tipo_evento,
                        #                 'alcance':alcance,
                        #                 'lugar': lugar,
                        #                 'lang':'lang_esp'
                        #                 }
                        list_search_params.append(search_params)
    return list_search_params

def actualizar_configuracion(configuracion):
    with open(PATH_DATA + FN_KEYW_JSON, "w") as archivo:
            json.dump(configuracion, archivo, indent=4)

def cargar_eventos_procesados_archivo():
    if not os.path.exists(PATH_DATA + FN_EVENTS):
        st.warning('Archivo con la base de eventos no encontrada, se creara uno en blanco en la ruta "{}".'.format(PATH_DATA), icon="⚠️")
        cols = ['resume', 'there_is_event', 'title', 'date', 'year', 'description', 'country', 'city', 'key_words', 'google_title', 'google_snippet', 'google_long_description', 'google_url', 'key_word', 'date_processed']
        df_events = pd.DataFrame(columns = cols)
        df_events.to_excel(PATH_DATA + FN_EVENTS, index=False)

    df_events = pd.read_excel(PATH_DATA + FN_EVENTS)
    # df_events["llm_event_flag"] = df_events["llm_event_flag"].astype(bool)
    # st.write(df_events)
    return df_events

def query_google_search(page=1, search_engine_keys=None, add_params = {}):
  """
  Query the Google Custom Search API and return the results in a dictionary.

  Args:
      google_query (str): The query to search for.
      page (int): The page number to retrieve.
  Returns:
      A dictionary containing the search results
  """

  # using the first page
  page = page
  start = (page - 1) * 10 + 1

#   url = f"https://www.googleapis.com/customsearch/v1?key={search_engine_keys['KEY']}&cx={search_engine_keys['ID']}&q={google_query}&start={start}" + add_args
  url = "https://www.googleapis.com/customsearch/v1"
  params = {
    'key' : search_engine_keys['KEY'],
    'cx' : search_engine_keys['ID'],
    'dateRestrict':'y[10]',
    'fileType': '-pdf',
  }
  params.update(add_params)
  print(url)
  print(params)
  try:
      # Make the GET request to the Google Custom Search API
      google_response = requests.get(url, params=params)
      print(google_response.url)
      # Check if the request was successful (status code 200)
      if google_response.status_code == 200:
          # Parse the JSON response
          google_response_data = google_response.json()
          google_response_items = {}
          # get the result items
          search_items = google_response_data.get("items")
          # iterate over 10 results found
          for i, search_item in enumerate(search_items, start=1):
              try:
                  long_description = search_item["pagemap"]["metatags"][0]["og:description"]
              except KeyError:
                  long_description = "N/A"
              # get the page title
              title = search_item.get("title")
              # page snippet
              snippet = search_item.get("snippet")
              # alternatively, you can get the HTML snippet (bolded keywords)
              html_snippet = search_item.get("htmlSnippet")
              # extract the page url
              link = search_item.get("link")
              google_response_items[i] = {
                  'title': title,
                  'snippet': snippet,
                  'long_description': long_description,
                  'link': link
              }
          return google_response_items

      else:
          print(f"Error: {google_response.status_code}")
          return None
  except Exception as e:
      print(f"An error occurred: {e}")
      return None

def web_scrapper(url):
  """

  """
  # Initializing variable
  lang_request = TextRequestsWrapper()
  try:
    lang_request.get(url)
  except:
    print('ERR', 'Error scrappiing the url')
    return None
  # Initializing variable
  result = lang_request.get(url)
  # Initializing variable
  bs_result = BeautifulSoup(result, features="html.parser")
  # Calculating result
  text = bs_result.get_text()
  text = text.replace("\n", " ")
  text = text.replace("\t", " ")
  return text

def es_archivo_pdf(url):
    try:
        # Realizar una solicitud HEAD para obtener solo los encabezados de la respuesta
        response = requests.head(url)
        
        # Verificar si la respuesta tiene el tipo de contenido "application/pdf"
        if 'application/pdf' in response.headers.get('Content-Type', ''):
            return True
        else:
            return False
    except requests.exceptions.RequestException as e:
        print("Error al hacer la solicitud:", e)
        return False

def preprocess(sentence):
    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('punkt')
        nltk.download('stopwords')
    stop_words = list(set(stopwords.words('english'))) + list(set(stopwords.words('spanish')))
    word_tokens = word_tokenize(sentence.lower())
    return [word for word in word_tokens if word.isalnum() and word not in stop_words ]

def jaccard_similarity(sentence1, sentence2):
    words1 = set(preprocess(sentence1))
    words2 = set(preprocess(sentence2))
    intersection = len(words1.intersection(words2))
    try:
        if len(words1) == len(intersection):
            return 1
        else:
            union = len(words1.union(words2))
            return intersection / union if union != 0 else 0 
    except:
        union = len(words1.union(words2))
        return intersection / union if union != 0 else 0 

def check_similar (new_key, old_keys):
    for old_key in old_keys:
        similarity_score = jaccard_similarity(new_key, old_key)
        if similarity_score >= 0.7:
            return True
        else:
            continue
    return False

def extraer_informacion_general_gemini(url, API_KEY_GEMINI):
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "context":{context_str}
    \n{format_instructions}\n
    """
    # parser = YamlOutputParser(pydantic_object=event)
    parser = YamlOutputParser(pydantic_object=event)

    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
        
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result

def buscar_eventos(contraseñas = None, pages=2, list_key_w= None):
    date =  dt.datetime.today().date().strftime("%Y-%m-%d")
    # latest_iteration = st.empty()
    df_events_hist = cargar_eventos_procesados_archivo()
    df_events_busqueda = pd.DataFrame()
    step =  int(100 / (10 * (pages) ))
 
    # Buscar Paginas asociadas a los criterios
    for key_W in list_key_w:
        i = 0
        # latest_iteration = st.empty()
        # latest_iteration.text(f'Progreso 0 %')
        # bar = st.progress(0)
        for page in range(1, pages+1):
            google_query_result = query_google_search( page, contraseñas["api_google_search"], key_W)

            for item in google_query_result.keys():
                list_hist_links = df_events_hist['google_url'].to_list()
                list_hist_title = df_events_hist['google_title'].to_list()
                url = google_query_result[item]['link']
                title = google_query_result[item]['title']
                # bar.progress(i+step)
                i = i+step
                if url in list_hist_links or title in list_hist_title:
                    continue
                else:
                    # latest_iteration.text('Criterio: {}, Progreso: {} %,\nLink: {}'.format(key_W,i+step, url))
                    
                    try:
                        llm_result = extraer_informacion_general_gemini(url, contraseñas["api_gemini"]['KEY'])
                        # if llm_result['there_is_event'] =="Yes":
                        #     extraer_informacion_eventos_rel_gemini(url, contraseñas["api_gemini"]['KEY']):
                                
                    except Exception as e:
                        continue
                    df_event_info = pd.DataFrame([llm_result.__dict__])
                    df_event_info['google_title'] = google_query_result[item]['title']
                    df_event_info['google_snippet'] = google_query_result[item]['snippet']
                    df_event_info['google_long_description'] = google_query_result[item]['long_description']
                    df_event_info['google_url'] = google_query_result[item]['link']
                    df_event_info['search_criteria'] =  str(key_W)
                    df_event_info['date_processed'] =  date
                    df_events_busqueda = pd.concat([df_events_busqueda, df_event_info])
                    df_events_hist = pd.concat([df_events_hist, df_event_info])
                    df_events_hist.to_excel(PATH_DATA + "events_data.xlsx", index=False)
    return df_events_busqueda

def extraer_informacion_eventos_rel_gemini(url, event, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Your Task is to extract any event showed in following "Context" that can be related to the "event information". 
    "event information":{event_str}
    "context":{context_str}
    \n{format_instructions}\n
    """
    parser = YamlOutputParser(pydantic_object=json_resp_events)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
        
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result

def rel_events_parser(yaml_events, df_hist_rel_events, event_key):
    df_rel_events = pd.DataFrame(columns=['event_key', 'rel_event_link', 'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country'])

    for event in yaml_events.events:
        events_related_parsed = {}
        rel_event_key = event.title + " | " + event.country + " | " + str(event.year)
        print(check_similar(event_key, [rel_event_key]) , check_similar(rel_event_key, df_hist_rel_events['rel_event_key']))
        if not check_similar(event_key, [rel_event_key]) and not check_similar(rel_event_key, df_hist_rel_events['rel_event_key']):
            if int(event.year) > dt.datetime.today().year -20:
                events_related_parsed['event_key'] = event_key
                events_related_parsed['rel_event_key'] = event.title + " | " + event.country + " | " + str(event.year)
                events_related_parsed['rel_event_title'] = event.title
                events_related_parsed['rel_event_country'] = event.country
                events_related_parsed['rel_event_year'] = event.year
                df_events_related_parsed = pd.DataFrame(events_related_parsed, index=[0])
                df_rel_events = pd.concat([df_rel_events, df_events_related_parsed])
    return df_rel_events

def buscar_eventos_relacionados(llm_result_event, contraseñas):

    df_rel_events = pd.DataFrame(columns=['event_key',  'rel_event_key','rel_event_title', 'rel_event_year', 'rel_event_country','rel_event_link'])
    if llm_result_event.there_is_event == "True":

        for i in range(3):
            print(i)
            
            if i == 0:
                add_args = {
                    'lr': 'lang_eng|lang_esp'
                }
            elif i == 1:
                add_args = {
                    'lr': 'lang_esp'
                }
            elif i == 2:
                add_args = {
                    'lr': 'lang_esp',
                    'cr': 'countryCO'
                }
            if llm_result_event.title !="" and llm_result_event.general_title !=None:
                link_or_name = llm_result_event.general_title
                search_pattern = f"related: {link_or_name} "
            else:
                link_or_name = llm_result_event.title
                search_pattern = f"related: {link_or_name} "
            print("Criterio Busqueda:{}".format(search_pattern))
            google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"],add_args)
            for url in google_query_result:
                if es_archivo_pdf(google_query_result[url]['link']):
                    continue
                else:
                    print(google_query_result[url]['link'], search_pattern)
                    ref_event_info = "title:" + llm_result_event.title + "|" +"resume:" + llm_result_event.resume + "|"+"country:" + llm_result_event.country  + "|"+"year:" + llm_result_event.year
                    ref_event_key = llm_result_event.title + " | " + llm_result_event.country + " | " + llm_result_event.year 
                    try:   
                        yaml_events_related = extraer_informacion_eventos_rel_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                        df_events_related_link = rel_events_parser(yaml_events_related, df_rel_events, ref_event_key)
                        df_events_related_link ['rel_event_link'] = google_query_result[url]['link']
                        df_rel_events = pd.concat([df_rel_events, df_events_related_link])
                        if len(df_rel_events) >= 5:
                            return df_rel_events
                    except Exception as e:
                        print(e)
                        continue
    return df_rel_events
    
def extraer_informacion_asistentes_gemini(url, event, API_KEY_GEMINI):
    
    os.environ["GOOGLE_API_KEY"] = API_KEY_GEMINI
    llm = ChatGoogleGenerativeAI(model="gemini-pro")
    model = genai.GenerativeModel('gemini-pro')
    llm_prompt_template = """Tu tarea es extraer de "context" la informacion disponible del numero de asistentes al evento {event_str} en el idioma del contexto". 
    "context":{context_str}
    \n{format_instructions}\n
    """
    parser = YamlOutputParser(pydantic_object=eventAsist)

    # To extract data from WebBaseLoader
    doc_prompt = PromptTemplate.from_template("{page_content}")
    
    # Realizar el query a Gemini
    llm_prompt = PromptTemplate.from_template(llm_prompt_template)

    llm_prompt = PromptTemplate(
        template=llm_prompt_template,
        input_variables=["context_str", "event_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    context = web_scrapper(url)
    if context.startswith('Not Acceptable!'):
        loader = WebBaseLoader(url)
        docs = loader.load()
        doc_prompt = PromptTemplate.from_template("{page_content}")
        context = "\n\n".join(format_document(doc, doc_prompt) for doc in docs)
    tokens_size = int(model.count_tokens(str(llm_prompt) + context).total_tokens)
    if tokens_size > 30000:
        return None
    else:
        stuff_chain = llm_prompt | llm | parser
        llm_result = stuff_chain.invoke({"context_str": context, "event_str": event} )
        return llm_result
    
def buscar_informacion_asistentes(llm_result_event, contraseñas):
    asistants_list = []
    for i in range(3):
        print(i)
        
        if llm_result_event.place !=None and llm_result_event.place !="":
            location = llm_result_event.place
        elif llm_result_event.country !=None and llm_result_event.country !="":
            location = llm_result_event.country
        else:
            location = ""
        search_pattern = f"{llm_result_event.general_title} {location}"
        if i == 0:
            add_args = {
                'lr': 'lang_eng|lang_esp'
            }
        elif i == 1:
            add_args = {
                'lr': 'lang_esp'
            }
        elif i == 2:
            add_args = {
                'lr': 'lang_esp',
                'cr': 'countryCO'
            }
        google_query_result = query_google_search(search_pattern, 1, contraseñas["api_google_search"], add_args)
  
        for url in google_query_result:
            if es_archivo_pdf(google_query_result[url]['link']):
                continue
            else:
                print(google_query_result[url]['link'], search_pattern)
                ref_event_info = "title:" + llm_result_event.title + "|" +"resume:" + llm_result_event.resume + "|"+"country:" + llm_result_event.country  + "|"+"year:" + llm_result_event.year
                try:   
                    yaml_envent_asistants = extraer_informacion_asistentes_gemini(google_query_result[url]['link'], ref_event_info , contraseñas["api_gemini"]['KEY'])
                    if yaml_envent_asistants.participants not in [None, 'None', '', ' '] and not check_similar(yaml_envent_asistants.participants, asistants_list):
                        asistants_list.append(yaml_envent_asistants.participants)
                        print(asistants_list)
                        if len(asistants_list) >=3:
                            return "|".join(asistants_list)  
                except Exception as e:
                    print(e)
                    continue
    return "|".join(asistants_list) 



  from .autonotebook import tqdm as notebook_tqdm


In [122]:
contraseñas = cargar_contraseñas(ACCESS_PATH)
config = cargar_configuracion()
criterios = obtener_criterios_busqueda(config)

In [128]:
str(criterios[1])

"{'q': '+Congreso+colombia+Universidad del Bosque', 'lr': 'lang_esp', 'exactTerms': '(Congreso).(Mundial)'}"

In [121]:
print(dt.datetime.today().year)

2024


In [129]:
google_query_result = query_google_search(page=1, search_engine_keys=contraseñas['api_google_search'], add_params = criterios[1])
google_query_result

https://www.googleapis.com/customsearch/v1
{'key': 'AIzaSyD10aDIGJL5jVypOwtOnHWc15GyOni3Hf0', 'cx': 'a3d39ff176a444e98', 'dateRestrict': 'y[10]', 'fileType': '-pdf', 'q': '+Congreso+colombia+Universidad del Bosque', 'lr': 'lang_esp', 'exactTerms': '(Congreso).(Mundial)'}
https://www.googleapis.com/customsearch/v1?key=AIzaSyD10aDIGJL5jVypOwtOnHWc15GyOni3Hf0&cx=a3d39ff176a444e98&dateRestrict=y%5B10%5D&fileType=-pdf&q=%2BCongreso%2Bcolombia%2BUniversidad+del+Bosque&lr=lang_esp&exactTerms=%28Congreso%29.%28Mundial%29


{1: {'title': 'Exitosa participación de El Bosque en congreso médico ...',
  'snippet': 'Oct 19, 2023 ... Docentes de la Facultad de Medicina participaron en el Segundo Congreso Mundial sobre Educación Médica, organizado por la Asociación\xa0...',
  'long_description': 'N/A',
  'link': 'https://www.unbosque.edu.co/centro-informacion/noticias/exitosa-participacion-de-el-bosque-en-congreso-medico'},
 2: {'title': 'Nacionales – Congreso Mundial 2023',
  'snippet': 'Jonathan Alexander Guezguan Pérez. Asociación Nacional de Internos y Residentes (ANIR) Universidad Pedagógica y Tecnológica de Colombia Universidad de\xa0...',
  'long_description': 'N/A',
  'link': 'https://ascofame.org.co/congreso2023/index.php/speakers-style-two/'},
 3: {'title': 'Dra. Jenny Correa - Dione Skin Clinic',
  'snippet': '2019, FORMACIÓN, ACTUALIZACIÓN, DIPLOMADO Y CONGRESO · Congreso IMCAS -Cartagena, Agosto 2019 · Congreso mundial de medicina estética – AMWC – Medellín, Octubre\xa0...',
  'long_description': 'N

In [94]:
url = google_query_result[1]['link']
llm_result = extraer_informacion_general_gemini(url, contraseñas["api_gemini"]['KEY'])

In [95]:
df = pd.DataFrame([llm_result.__dict__])

In [112]:
buscar_eventos(contraseñas, pages=config['paginas'], list_key_w= criterios)

https://www.googleapis.com/customsearch/v1
{'key': 'AIzaSyD10aDIGJL5jVypOwtOnHWc15GyOni3Hf0', 'cx': 'a3d39ff176a444e98', 'dateRestrict': 'y[10]', 'fileType': '-pdf', 'q': '+Congreso+colombia+', 'lr': 'lang_esp', 'exactTerms': '(Congreso).(Mundial)'}
https://www.googleapis.com/customsearch/v1?key=AIzaSyD10aDIGJL5jVypOwtOnHWc15GyOni3Hf0&cx=a3d39ff176a444e98&dateRestrict=y%5B10%5D&fileType=-pdf&q=%2BCongreso%2Bcolombia%2B&lr=lang_esp&exactTerms=%28Congreso%29.%28Mundial%29
ERR Error scrappiing the url


Unnamed: 0,resume,there_is_event,title,general_title,date,year,description,country,city,place,key_words,google_title,google_snippet,google_long_description,google_url,search_criteria,date_processed
0,The Minister of ICT consolidated significant a...,True,Mobile World Congress,Mobile World Congress,27 February 2023,2023,"The Minister of ICT, Sandra Milena Urrutia, co...",Colombia,,,"ICT, Connectivity, 5G, Digitalization, Rural D...","En el Congreso Mundial de Telefonía Móvil, la ...","Feb 27, 2023 ... En el Congreso Mundial de Tel...",,https://www.mintic.gov.co/portal/715/w3-articl...,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,The 20th World Congress of the International E...,True,IEA World Congress 2023 Colombia,IEA World Congress,"December, 2023",2023,The 20th World Congress of the International E...,Colombia,Medellin,EAFIT University,"Economics, Sustainability, Development, Inequa...",Así va el congreso - IEA World Congress 2023 C...,"Durante cinco días, ustedes, los más de 1.000 ...",,https://www.eafit.edu.co/ieawc2023,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,Colombia will host the World Congress on Passi...,True,Congreso mundial de pasifloras,Congreso mundial de pasifloras,2017,2017,The event will be the first time that this int...,Colombia,Not mentioned,Not mentioned,"Passiflora, Colombia, Latin America, Experts, ...",Colombia será anfitrión del Congreso mundial d...,"Oct 10, 2016 ... Colombia será anfitrión del C...",,https://www.minagricultura.gov.co/noticias/Pag...,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,The event is the World Economic Congress that ...,True,World Economic Congress,World Economic Congress,11th and 15th of December,2023,The article refers to the World Economic Congr...,Colombia,Medellin,Not mentioned,"World Economic Congress, UNAULA, Colombia, Eco...","Delegación de UNAULA, aliada del Congreso Mund...","Dec 14, 2023 ... ... Congreso Mundial de Econo...",,https://www.unaula.edu.co/node/3960,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,Colombia participated in the Second World Cong...,True,Second World Congress on Indigenous Tourism,World Congress on Indigenous Tourism,13th to 16th March,2023,The event was an opportunity to share best pra...,Australia,Perth,,"Indigenous tourism, Colombia, Australia, best ...",Participación Activa de Colombia en el Congres...,"Mar 19, 2023 ... Colombia participó en la Segu...",,http://australia.embajada.gov.co/newsroom/news...,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,A convention of witches and other spiritualist...,True,1º Congreso Mundial de Brujería,Congreso Mundial de Brujería,1975,1975,"In 1975, the most important witches, clairvoya...",Colombia,Bogotá,,"Witchcraft, Colombia, Convention, Spirituality...",El diablo no esta invitado | 1º Congreso Mundi...,Pruébala y encuentra más información en lupa.a...,"En 1975, los brujos, clarividentes, psíquicos,...",https://radioambulante.org/audio/el-diablo-no-...,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31
0,The Embassy of Colombia facilitated the partic...,True,World Architecture Congress,World Architecture Congress,2-6 July 2023,2023,The World Architecture Congress is a triennial...,Denmark,Copenhagen,,"Architecture, Sustainability, Innovation, Educ...",Colombia se destaca en el Congreso Mundial de ...,"Jul 14, 2023 ... Colombia tuvo un papel destac...",,https://dinamarca.embajada.gov.co/newsroom/new...,"{'q': '+Congreso+colombia+', 'lr': 'lang_esp',...",2024-03-31


In [5]:
config = cargar_configuracion()
config

{'modelo': 'Gemini',
 'paginas': 1,
 'criterios': ['World Congress-Colombia', 'Eventos-Colombia-Bogota'],
 'patrones_busqueda': {'Esp': {'alcance': ['Mundial', 'Internacional'],
   'tipo_evento': ['Congreso', 'Simposio']},
  'Eng': {'alcance': ['World', 'International'],
   'tipo_evento': ['Congress', 'Simposium']}},
 'lugares_busqueda': {'Esp': ['', 'Universidad del Bosque'], 'Eng': ['']}}

In [8]:
config['patrones_busqueda']['Esp']['alcance']

['Mundial', 'Internacional']