In [3]:
#Imports
import math
import requests
import csv
import pandas as pd
import re
import os
from pathlib import Path
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentSplitter
from haystack.utils import Secret
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from pathlib import Path
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
import configparser


In [33]:
# Helper that indexes local PDF files into an in-memory document store
def process_local_pdfs(pdf_folder_path="recipe_files", model="modelo"):
    """Convert, clean, split and embed local PDFs into an in-memory document store.

    Every ``*.pdf`` file found recursively under ``pdf_folder_path`` is sent
    through a Haystack pipeline: file-type router -> PyPDF converter ->
    cleaner -> sentence splitter -> sentence-transformers embedder -> writer.

    Args:
        pdf_folder_path: Folder that is searched recursively for PDF files.
        model: Currently unused; the embedder always loads
            ``"all-mpnet-base-v2"``. The parameter is kept so existing calls
            keep working.

    Returns:
        The ``InMemoryDocumentStore`` instance the pipeline wrote into.
    """
    document_store = InMemoryDocumentStore()
    file_type_router = FileTypeRouter(mime_types=["application/pdf"])
    pdf_converter = PyPDFToDocument()

    # Text fragments removed by the cleaner. The alternatives are joined with "|"
    # into a single expression.
    cleanup_patterns = [
        r"https?://(?:www\.)?[^\s/$.?#].[^\s]*",            # URLs
        r"\d{1,4}-\d{1,4},\s\d{4}\.",                       # hyphenated dates
        r"(?:[A-Z]\.\s?)+[A-Z][a-z]+(?:,\s|$)",             # initials followed by a surname
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",  # e-mail addresses
        # Everything from the start of the text up to the introduction heading.
        # DOTALL is scoped to this alternative with (?s:...): a global "(?s)"
        # in the middle of a pattern is rejected by `re` on Python 3.11+.
        r"(?s:^.*?(Introducti?on|NTRODUCTI?ON))",
        r"(?<=\bREFERENCES\b)\s*[\s\S]*$",                  # everything after REFERENCES
    ]
    combined_pattern = "|".join(cleanup_patterns)

    document_cleaner = DocumentCleaner(
        remove_repeated_substrings=True,
        remove_empty_lines=True,
        ascii_only=True,
        remove_regex=combined_pattern,
    )

    document_splitter = DocumentSplitter(
        split_by="sentence", split_length=7, split_overlap=3, split_threshold=3
    )
    embedder = SentenceTransformersDocumentEmbedder(model="all-mpnet-base-v2")
    document_writer = DocumentWriter(document_store)

    pipeline = Pipeline()
    pipeline.add_component(instance=file_type_router, name="file_type_router")
    pipeline.add_component(instance=pdf_converter, name="pdf_converter")
    pipeline.add_component(instance=document_cleaner, name="document_cleaner")
    pipeline.add_component(instance=document_splitter, name="document_splitter")
    pipeline.add_component(instance=embedder, name="document_embeder")
    pipeline.add_component(instance=document_writer, name="document_writer")

    # Wire the components in processing order.
    pipeline.connect("file_type_router.application/pdf", "pdf_converter.sources")
    pipeline.connect("pdf_converter", "document_cleaner")
    pipeline.connect("document_cleaner", "document_splitter")
    pipeline.connect("document_splitter", "document_embeder")
    pipeline.connect("document_embeder", "document_writer")

    # Recursive search, so PDFs in sub-folders are included as well.
    pdf_files = list(Path(pdf_folder_path).glob("**/*.pdf"))

    pipeline.run({"file_type_router": {"sources": pdf_files}})

    return document_store

# Index the PDFs stored under "pdfs/" and read every stored document back from the store
document_store = process_local_pdfs(pdf_folder_path="pdfs/", model="hol")
all_documents = document_store.filter_documents()


In [34]:
# Inspect the documents returned by the store (display only)
all_documents

In [35]:
# Number of documents returned by the store
len(all_documents)

In [52]:
import pandas as pd

# One record per document: identifier, text, embedding and selected metadata fields
data = [
    {
        "id": chunk.id,
        "content": chunk.content,
        "vector": chunk.embedding,
        **{
            key: chunk.meta.get(key)
            for key in ("file_path", "source_id", "page_number", "split_id", "split_idx_start")
        },
    }
    for chunk in all_documents
]

# Tabular view of the records; the CSV export happens in a later cell
df = pd.DataFrame(data)


In [53]:
def _eid_from_file_path(file_path):
    """Return the PDF file name without its directory and ``.pdf`` extension.

    The previous fixed slice ``x[5:-4]`` only worked when every path started
    with a five-character folder prefix such as ``pdfs/`` and raised on missing
    values. Working on the path components gives the same result for
    ``pdfs/<name>.pdf`` and also copes with bare file names or other folders.
    For files in sub-folders only the file name is kept.
    """
    if not isinstance(file_path, str):
        return file_path  # leave missing values (None/NaN) untouched
    path = Path(file_path)
    # Only strip a real ".pdf" suffix: identifiers may themselves contain dots.
    return path.stem if path.suffix.lower() == ".pdf" else path.name


df['file_path'] = df['file_path'].apply(_eid_from_file_path)


In [54]:
# The column now holds the value derived from the file name, so expose it as `eid`
df.rename(columns=dict(file_path="eid"), inplace=True)

In [55]:
# Display the fragment table after the `eid` rename
df

In [56]:
# Write the fragment table to disk as UTF-8 CSV, without the index column
df.to_csv("fragmentos_documentos.csv", index=False, encoding="utf-8")

In [57]:
# Reload the fragment table from the CSV written above.
# NOTE(review): after a CSV round trip the `vector` column presumably comes back as text
# rather than a list of floats - confirm before using it numerically.
df = pd.read_csv("fragmentos_documentos.csv")


In [58]:
# Display the table as reloaded from CSV
df

In [59]:
import matplotlib.pyplot as plt

# Words per fragment: content is stringified, then split on whitespace
df["word_count"] = df["content"].apply(lambda x: len(str(x).split()))

# One bin per integer word count. A sequence passed as `bins` is a list of bin
# EDGES, and the last bin is closed on both sides, so the edges must run up to
# max + 1; stopping at max would merge the two largest counts into one bar.
bin_edges = range(1, df["word_count"].max() + 2)
plt.hist(df["word_count"], bins=bin_edges, edgecolor="black")
plt.xlabel("Cantidad de palabras")
plt.ylabel("Frecuencia")
plt.title("Histograma de la cantidad de palabras por fila")
plt.show()

Mayores a 250

In [24]:
# Fragments with more than 250 words, exported to their own CSV; the cell shows how many there are
dfMas250 = df.loc[df["word_count"].gt(250)]
dfMas250.to_csv(
    "dfMas250.csv",
    encoding="utf-8",
    index=False,
)
len(dfMas250)

Menores a 30

In [25]:
# Fragments with fewer than 30 words. The threshold was 40, which contradicted the
# variable name, the output file name and the complementary ">= 30" filter used for
# the middle bucket (fragments with 30-39 words landed in two buckets).
dfMenos30 = df[df["word_count"] < 30]
dfMenos30.to_csv("dfMenos30.csv", index=False, encoding="utf-8")
len(dfMenos30)

Entre 30 y 250

In [61]:
# Fragments whose word count lies in the closed range [30, 250], exported to their own CSV
dfEntre30y250 = df.loc[df["word_count"].between(30, 250)]
dfEntre30y250.to_csv(
    "dfEntre30y250.csv",
    encoding="utf-8",
    index=False,
)
len(dfEntre30y250)

## Creación de la base de datos vectorial en Hugging Face

In [51]:
from huggingface_hub import HfApi, login
import os

# Credentials live in a local INI file so they are not stored in the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed; check it so a missing file fails with a clear message
# instead of an opaque KeyError on the lookups below.
if not config.read("variables.ini"):
    raise FileNotFoundError(
        "variables.ini was not found or could not be read; "
        "it must define HF_Token and HF_Usuario in its [DEFAULT] section"
    )

# Authenticate with the Hugging Face access token
HF_Token = config['DEFAULT']['HF_Token']
login(HF_Token)


# Target dataset repository: <user>/<repo_name>
HF_Usuario = config['DEFAULT']['HF_Usuario']

repo_name = "fragmentos_documentos_61_all-mpnet-base-v2"  # change this to the desired dataset name
repo_id = f"{HF_Usuario}/{repo_name}"

# Hugging Face Hub API client
api = HfApi()

# Create the dataset repository; exist_ok=True makes re-running the cell safe
api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)

# Upload the fragments CSV to the repository
api.upload_file(
    path_or_fileobj="fragmentos_documentos.csv",
    path_in_repo="fragmentos_documentos.csv",
    repo_id=repo_id,
    repo_type="dataset"
)


### MODELOS

In [13]:
import unicodedata
import re

# Sample passage as extracted from a PDF: it still contains hard line breaks,
# end-of-line hyphenation and typographic ligatures.
document = """Stability assessments are
usually performed by means of ofﬂine time-domain simula-
tions (TDS) [11], following a worst-case approach, perform-
ing simulations for only a limited set of relevant operating
points (OP) and contingencies. These critical conditions and
contingencies are usually selected based on the historical
performance of the system and the experience of the operator
and planner [12]. For instance, frequency problems are more
likely to arise during periods of low net load, where only a
limited number of SGs are available to support frequency
response. The critical contingency considered for the simu-
lations is the sudden outage of the largest online generation
unit [13]. In this context, stability assessments of large power
systems with thousands of buses and hundreds of genera-
tors and contingencies, and for a large number of typically
encountered operating conditions are not realistically feasi-
ble, especially for real-time, online stability assessments, due
to computational limitations [14].
Although worst-case scenarios used for assessing system
stability are usually well deﬁned, in power systems dom-
inated by CGTs, traditional approaches for deﬁning these
scenarios may no longer be valid. With high levels of CGTs,
power system dynamics change in new ways, thus making the
process of deﬁning worst-case scenarios even more challeng-
ing. Additionally, the high uncertainty of CGTs may not only
result in a shift of the critical operating conditions, but also in
an increase of the number of risky conditions in which sys-
tem stability may be threaten [15]. Consequently, currently
widely accepted criteria for deﬁning critical scenarios for
stability assessments may fail to cover all critical operating
points and contingencies that might result in power system
instabilities [13]."""

# Drop stray spacing accents left behind by the PDF extraction
document = document.replace(" ´", "")
# NFKD decomposes ligatures and accented letters; encoding to ASCII with "ignore"
# then drops whatever has no ASCII equivalent
document = unicodedata.normalize('NFKD', document).encode('ascii', 'ignore').decode('ascii')

# Re-join words that were hyphenated across a line break ("simula-\ntions" ->
# "simulations"). This must run before whitespace is collapsed; otherwise the
# fragments survive as "simula- tions". Hyphens inside a line are left untouched.
document = re.sub(r'(?<=[A-Za-z])-[ \t]*\n[ \t]*(?=[a-z])', '', document)

# Collapse line breaks and runs of whitespace into single spaces
document = re.sub(r'\s+', ' ', document)

# Keep only letters, digits, whitespace and basic punctuation
document = re.sub(r'[^A-Za-z0-9\s.,:;()\[\]-]','', document)

# Prompt variant 1: asks for RDF triples with entity categories, returned as CSV.
# The cleaned `document` text is appended after the instructions.
# NOTE(review): this is a plain string, not an f-string, so "{Document}" is sent to the
# model literally rather than being substituted.
prompt1 = """
You are an expert in knowledge engineering and will give you a {Document},
          First, with that information identify the RDF triples (subject, predicate, object) of each relevant sentence.
          Second, Normalizes/lemmatizes the text of the elements identified in each triple.
          Third, Recognizes the entities on subject and object and provide the category of each one, e.g.subject and object and normalizes text.
          Fourth, Only returns the output in csv format. Don't explain the process.
          Format:
          Subject,predicate,object,category of subject, category of object.
          Example:
          Tim Berner-Lee,interestIn,Fog Computing,PERSON,TOPIC
          Document to be analyze:\n
""" + document

# Prompt variant 2: first asks for named entities, then for the RDF triples they take part
# in, returned as CSV (or "NO" when nothing is found). The cleaned `document` text is
# appended after the instructions.
prompt2 = """
You are an expert in knowledge engineering and will give you a Document to by analyze.
          First, from the document's content identifies or recognizes an Named entities only if exist, 
          such as PERSON, ORGANIZATION, TOPIC/SUBJECT, etc.
          Second, For relevant entities identify the underlying RDF triples in which them are implied 
          (subject, predicate, object).
          Third, Normalizes/lemmatizes the text of the elements identified in each triple.
          Fourth, Recognizes the entities on subject and object and provide the category of each one, 
          e.g.subject and object and normalizes text.
          fifth, Only returns the output in csv format if not exist any returno NO. Don't explain the process.
          Format:
          Subject,predicate,object,category of subject, category of object.
          Example:
          Tim Berner-Lee,interestIn,Fog Computing,PERSON,TOPIC
          Document to be analyze:\n
""" + document

# Prompt variant 3: asks for head/relation/tail records as a list of JSON objects, restricted
# to the node types in `allowed_nodes`. The cleaned `document` text is appended at the end.
# NOTE(review): the examples use the types "Company" and "Award", which are not in
# `allowed_nodes`, and the text refers to a list of allowed relations that is never given.
prompt3 = """
        You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph only return that information about IA.
        Your task is to identify the entities and relations specified in the user prompt from a given text and produce the output in JSON format.
        This output should be a list of JSON objects, with each object containing the following keys:

        •⁠  ⁠*"head"*: The text of the extracted entity, which must match one of the types specified in the user prompt.
        •⁠  ⁠*"head_type"*: The type of the extracted head entity, selected from the specified list of types.
        •⁠  ⁠*"relation"*: The type of relation between the "head" and the "tail," chosen from the list of allowed relations.
        •⁠  ⁠*"tail"*: The text of the entity representing the tail of the relation.
        •⁠  ⁠*"tail_type"*: The type of the tail entity, also selected from the provided list of types.

        Extract as many entities and relationships as possible.

        *Entity Consistency*: Ensure consistency in entity representation. If an entity, like "John Doe," appears multiple times in the text under different names or pronouns (e.g., "Joe," "he"), use the most complete identifier consistently. 

        *Important Notes*:
        •⁠  ⁠Do not add any extra explanations or text.
        •⁠  ⁠If no one relation can be obtained only return NO

        allowed_nodes = ["Person", "Organization", "Location", "Method", "ResearchField","Technology","Metric","DataSet","Group","Disease"]

        
        examples = [
    {
        "head": "Adam",
        "head_type": "Person",
        "relation": "WORKS_FOR",
        "tail": "Microsoft",
        "tail_type": "Company",
    },
    {
        "head": "Adam",
        "head_type": "Person",
        "relation": "HAS_AWARD",
        "tail": "Best Talent",
        "tail_type": "Award",
    }
]
Document:
""" + document

## Gemini

In [68]:
import os
import google.generativeai as genai


# Read the key from the environment instead of pasting it into the notebook, so it
# is never saved (or committed) together with the file.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this cell"
    )
genai.configure(api_key=api_key)


# Generation settings: temperature 0 for repeatable extractions, plain-text replies
generation_config = {
  "temperature": 0,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 1024,
  "response_mime_type": "text/plain",
}

# Create the model
model = genai.GenerativeModel(
  model_name="gemini-1.5-pro",
  generation_config=generation_config,
)

# Start a chat whose history is seeded with a single user turn holding the extraction
# instructions, then send the cleaned `document` as the next message and print the reply.
# NOTE(review): the instruction text repeats `prompt3` (without the appended document)
# with small wording differences, e.g. "optained" here versus "obtained" there.
chat_session = model.start_chat(
  history=[
    {
      "role": "user",
      "parts": [
          """
          You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph only return that information about IA.
        Your task is to identify the entities and relations specified in the user prompt from a given text and produce the output in JSON format.
        This output should be a list of JSON objects, with each object containing the following keys:

        •⁠  ⁠*"head"*: The text of the extracted entity, which must match one of the types specified in the user prompt.
        •⁠  ⁠*"head_type"*: The type of the extracted head entity, selected from the specified list of types.
        •⁠  ⁠*"relation"*: The type of relation between the "head" and the "tail," chosen from the list of allowed relations.
        •⁠  ⁠*"tail"*: The text of the entity representing the tail of the relation.
        •⁠  ⁠*"tail_type"*: The type of the tail entity, also selected from the provided list of types.

        Extract as many entities and relationships as possible.

        *Entity Consistency*: Ensure consistency in entity representation. If an entity, like "John Doe," appears multiple times in the text under different names or pronouns (e.g., "Joe," "he"), use the most complete identifier consistently. 

        *Important Notes*:
        •⁠  ⁠Do not add any extra explanations or text.
        •⁠  ⁠If no one relation can be optained only return NO

        allowed_nodes = ["Person", "Organization", "Location", "Method", "ResearchField","Technology","Metric","DataSet","Group","Disease"]

        
        examples = [
            {
                "head": "Adam",
                "head_type": "Person",
                "relation": "WORKS_FOR",
                "tail": "Microsoft",
                "tail_type": "Company",
            },
            {
                "head": "Adam",
                "head_type": "Person",
                "relation": "HAS_AWARD",
                "tail": "Best Talent",
                "tail_type": "Award",
            }
        ]
        Document:
          """
      ],
    },
  ]
)

# The document itself goes in as the follow-up message
response = chat_session.send_message(document)

print(response.text)

## Llama

Pruebas

In [14]:
import os

from openai import OpenAI

# OpenAI-compatible client pointed at the Hugging Face Inference API.
# The token is read from the HF_TOKEN environment variable so it is not stored in
# the notebook; when the variable is unset this falls back to the previous empty string.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.environ.get("HF_TOKEN", ""),
)

# Single user turn: extraction instructions followed by the document (prompt3)
messages = [
    {
        "role": "user",
        "content": prompt3
    }
]

completion1 = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=messages,
    max_tokens=1024,
    temperature=0.1
)

output = completion1.choices[0].message.content  # text of the first choice
jsonString = output.replace("\\n", "\n")  # turn literal "\n" sequences into real line breaks

print(jsonString)


## Gemma

In [8]:
import os

from openai import OpenAI

# OpenAI-compatible client pointed at the Hugging Face Inference API.
# The token is read from the HF_TOKEN environment variable instead of being typed
# into the notebook.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.environ.get("HF_TOKEN", ""),
)

# Single user turn: extraction instructions followed by the document (prompt3)
messages = [
    {
        "role": "user",
        "content": prompt3
    }
]

completion2 = client.chat.completions.create(
    model="google/gemma-1.1-2b-it",
    messages=messages,
    max_tokens=500
)

output = completion2.choices[0].message.content  # text of the first choice
# This statement had been swallowed into the comment on the previous line, so the
# replacement never ran; restored to match the Llama cell.
formatted_output = output.replace("\\n", "\n")  # turn literal "\n" sequences into real line breaks
print(formatted_output)

openai: <clave de API eliminada — revocar esta clave de inmediato y cargarla desde una variable de entorno (p. ej. OPENAI_API_KEY), nunca guardarla en el notebook>

