# Agregar Archivos

In [2]:
import os
import chromadb
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Embedding model that add_files_to_vectordb passes to Chroma when storing chunks.
# NOTE(review): HuggingFaceEmbeddings is imported from two different packages in this
# file (langchain_community and langchain_huggingface); which one is bound here depends
# on the order the cells were run — confirm.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
# Client for the on-disk Chroma store under ./database; it is the object handed to
# get_unique_sources_list at the end of the file.
persistent_client = chromadb.PersistentClient(path='./database')

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Second embedding configuration: the all-mpnet-base-v2 sentence-transformers model,
# run on CPU, with embedding normalization turned off.
# NOTE(review): `hf` is not referenced anywhere else in the visible code —
# add_files_to_vectordb uses the `embeddings` object instead. Confirm whether this
# configuration is still needed.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [5]:
def add_files_to_vectordb(filepath):
    """Load a PDF, split it into chunks and store them in the Chroma database.

    The file is read with ``PyPDFLoader``, split into chunks of 500 characters
    with 50 characters of overlap, embedded with the module-level
    ``embeddings`` model and written to the Chroma store persisted under
    ``./database``.

    Every chunk receives a deterministic id built from ``filepath`` (exactly
    as given) and the chunk's position in the file.  Ingesting the same path
    again therefore rewrites the same ids instead of appending a second copy
    of every chunk.  If a file shrinks between runs, chunks beyond its new
    length are not removed.

    This is a best-effort helper: any failure is reported with ``print`` and
    swallowed so that a batch of calls keeps going.

    Args:
        filepath: Path of the PDF file to ingest.

    Returns:
        None.
    """
    import uuid  # local import: only needed to build the chunk ids

    try:
        docs = PyPDFLoader(filepath).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        splits = text_splitter.split_documents(docs)

        # A PDF with no extractable text produces no chunks; report it instead
        # of claiming the file was added.
        if not splits:
            print(f"No se extrajo texto del archivo, no se agregó nada: {filepath}")
            return

        # uuid5 is a pure function of its input, so the ids are stable across
        # runs and keep the UUID string format.
        ids = [
            str(uuid.uuid5(uuid.NAMESPACE_URL, f"{filepath}#{position}"))
            for position in range(len(splits))
        ]
        Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            ids=ids,
            persist_directory='./database',
        )
        print(f"Archivo procesado y agregado a la base de vectores: {filepath}")
    except Exception as e:  # deliberate: one bad file must not stop the batch
        print(f"Error al procesar el archivo {filepath}: {e}")

In [None]:
import os

# Folder that holds the files to ingest.
folder_path = './data'

# Every entry (files and sub-folders) directly inside the folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# generated listing reproducible.
files_and_dirs = sorted(os.listdir(folder_path))

# Keep regular files only; sub-folders are ignored.
files = [f for f in files_and_dirs if os.path.isfile(os.path.join(folder_path, f))]

# Print one ready-to-paste add_files_to_vectordb(...) call per file.
# !r emits a valid Python string literal even when the name contains quotes
# or backslashes; for ordinary names the output is the same single-quoted path.
for file in files:
    file_path = os.path.join(folder_path, file)
    print(f"add_files_to_vectordb({file_path!r})")


In [None]:
# Ingest the "Caso N" PDFs one at a time.
for pdf_path in (
    './data/Caso 1.pdf',
    './data/Caso 10.pdf',
    './data/Caso 2.pdf',
    './data/Caso 3.pdf',
    './data/Caso 4.pdf',
    './data/Caso 5.pdf',
    './data/Caso 6.pdf',
    './data/Caso 7.pdf',
    './data/Caso 8.pdf',
    './data/Caso 9.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest synthetic case PDFs 01 through 10, one at a time.
for pdf_path in (
    './data/Caso_sintetico_01.pdf',
    './data/Caso_sintetico_02.pdf',
    './data/Caso_sintetico_03.pdf',
    './data/Caso_sintetico_04.pdf',
    './data/Caso_sintetico_05.pdf',
    './data/Caso_sintetico_06.pdf',
    './data/Caso_sintetico_07.pdf',
    './data/Caso_sintetico_08.pdf',
    './data/Caso_sintetico_09.pdf',
    './data/Caso_sintetico_10.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest synthetic case PDFs 11 through 20, one at a time.
for pdf_path in (
    './data/Caso_sintetico_11.pdf',
    './data/Caso_sintetico_12.pdf',
    './data/Caso_sintetico_13.pdf',
    './data/Caso_sintetico_14.pdf',
    './data/Caso_sintetico_15.pdf',
    './data/Caso_sintetico_16.pdf',
    './data/Caso_sintetico_17.pdf',
    './data/Caso_sintetico_18.pdf',
    './data/Caso_sintetico_19.pdf',
    './data/Caso_sintetico_20.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest synthetic case PDFs 21 through 30, one at a time.
for pdf_path in (
    './data/Caso_sintetico_21.pdf',
    './data/Caso_sintetico_22.pdf',
    './data/Caso_sintetico_23.pdf',
    './data/Caso_sintetico_24.pdf',
    './data/Caso_sintetico_25.pdf',
    './data/Caso_sintetico_26.pdf',
    './data/Caso_sintetico_27.pdf',
    './data/Caso_sintetico_28.pdf',
    './data/Caso_sintetico_29.pdf',
    './data/Caso_sintetico_30.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest synthetic case PDFs 31 through 40, one at a time.
for pdf_path in (
    './data/Caso_sintetico_31.pdf',
    './data/Caso_sintetico_32.pdf',
    './data/Caso_sintetico_33.pdf',
    './data/Caso_sintetico_34.pdf',
    './data/Caso_sintetico_35.pdf',
    './data/Caso_sintetico_36.pdf',
    './data/Caso_sintetico_37.pdf',
    './data/Caso_sintetico_38.pdf',
    './data/Caso_sintetico_39.pdf',
    './data/Caso_sintetico_40.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest the first batch of reference documents, one at a time.
for pdf_path in (
    './data/CMMI.pdf',
    './data/COBIT.pdf',
    './data/Convenio 108 del Consejo de Europa.pdf',
    './data/decalogo.pdf',
    './data/Decálogo sobre el ISO-IEC 27701.pdf',
    './data/Directrices APEC.pdf',
    './data/GDPR.pdf',
    './data/Glosario Empresarial.pdf',
    './data/Guia-para-garantizar-derechos-ARCO.pdf',
    './data/Guia_para_ejercer_derechos_ARCO.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest the second batch of reference documents, one at a time.
for pdf_path in (
    './data/SGSI.pdf',
    './data/SoluciónCaso10.docx.pdf',
    './data/Tema 1.pdf',
    './data/Tema 2.pdf',
    './data/Tema 3.pdf',
    './data/The_Legal_Implications_of_Data_Privacy_Laws_Cybers (1).pdf',
    './data/NIST SP 800.pdf',
    './data/UCAGS_Presentacion_OFR.pdf',
    './data/s05_gpa_icdppc.pdf',
    './data/ISO-IEC 27701 - Sistema de Gestión de Privacidad de la Información.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# Ingest the remaining reference documents, one at a time.
# NOTE(review): the first two paths also appear in the preceding cell; confirm
# whether ingesting them a second time is intended.
for pdf_path in (
    './data/s05_gpa_icdppc.pdf',
    './data/ISO-IEC 27701 - Sistema de Gestión de Privacidad de la Información.pdf',
    './data/OCDE.pdf',
):
    add_files_to_vectordb(pdf_path)

In [None]:
# ERROR: the calls below are disabled — presumably these PDFs failed to ingest (TODO: confirm the cause).
#add_files_to_vectordb('./data/Policy and information security management based on ISO 27001.pdf')
#add_files_to_vectordb('./data/Metodologia_de_analisis_y_evaluacion_de_riesgos_aplicados.pdf')
#add_files_to_vectordb('./data/Policy and information security management based on ISO 27001.pdf')


## Ver si se agregaron

In [2]:
def get_unique_sources_list(chroma_settings, collection_name='langchain'):
    """Return the distinct file names recorded as ``source`` in a Chroma collection.

    Args:
        chroma_settings: Object exposing ``get_collection(name)``; the
            returned collection must support ``get(include=[...])`` and
            yield a mapping with a ``'metadatas'`` entry.
        collection_name: Name of the collection to inspect.  Defaults to
            ``'langchain'``.

    Returns:
        A sorted list of unique file names, i.e. the last path component of
        every non-empty ``source`` value.  Entries without metadata or
        without a ``source`` are skipped.
    """
    # Only the metadata is read below, so do not ask the store to load every
    # embedding vector and chunk text as well.
    collection_data = chroma_settings.get_collection(collection_name).get(include=['metadatas'])

    file_names = set()
    for metadata in collection_data['metadatas'] or []:
        # An entry can be None or empty when a chunk was stored without metadata.
        source = metadata.get('source') if metadata else None
        if not source:
            continue
        # Accept both '/' and '\\' so paths recorded on Windows also reduce
        # to the bare file name.
        file_names.add(source.replace('\\', '/').rsplit('/', 1)[-1])

    # Sorted so the result is deterministic from run to run.
    return sorted(file_names)

In [None]:
# List the file names recorded in the collection reached through persistent_client.
get_unique_sources_list(persistent_client)