In [1]:
from constants import PINECONE, GROQ

In [4]:
import os
from odf.opendocument import load
from odf.text import P

def leer_documentos_odt(carpeta):
    """
    Read every .odt file in a folder and return its text keyed by file name.

    Entries are visited in sorted order so the resulting dict (and anything
    derived from its order, such as sequential chunk IDs) is the same on every
    run; ``os.listdir`` on its own returns entries in arbitrary order. The
    extension check is case-insensitive, and entries that are not regular
    files (for example a directory whose name ends in ".odt") are skipped
    instead of being handed to the ODF loader.

    Args:
        carpeta (str): Path to the folder that contains the .odt files.

    Returns:
        dict: Maps each file name to the text of its paragraphs, joined with
            newlines. Empty when the folder holds no .odt files.
    """
    documentos = {}
    for archivo in sorted(os.listdir(carpeta)):
        ruta_archivo = os.path.join(carpeta, archivo)
        if not archivo.lower().endswith(".odt") or not os.path.isfile(ruta_archivo):
            continue
        # Load the .odt file
        documento = load(ruta_archivo)
        # str() on each paragraph element gives that paragraph's text
        parrafos = [str(elemento) for elemento in documento.getElementsByType(P)]
        documentos[archivo] = "\n".join(parrafos)
    return documentos


# Example usage
carpeta = "docs/"  # Replace with the path to your folder
full_documents = leer_documentos_odt(carpeta)

# Show each file name together with a short preview of its text
separador = "-" * 40
for nombre, contenido in full_documents.items():
    muestra = contenido[:100]
    print(f"Archivo: {nombre}")
    print(f"Contenido (primeros 100 caracteres): {muestra}")
    print(separador)

Archivo: octavio_cv.odt
Contenido (primeros 100 caracteres): 
Octavio Deshays
Mechatronics Engineer - National University of Cuyo
Mendoza, Argentina
22/12/1997
+
----------------------------------------


In [5]:
# Inspect the full mapping of file name -> extracted text
full_documents

{'octavio_cv.odt': "\nOctavio Deshays\nMechatronics Engineer - National University of Cuyo\nMendoza, Argentina\n22/12/1997\n+54 9 2615538396\noctaviodeshays@gmail.com\nln: Octavio Deshays Moreno\nEXPERIENCE\nMARVIK, Uruguay — Machine Learning Engineer\nDecember 2022 - Present\nMarvik is a hands-on ML consulting firm. In my role, I am involved in the entire process of developing an AI solution, from identifying the customer's problem to implementing the solution.\nProjects: \nPhotoStudio Editor: an app to allow sellers from the largest E-Commerce in LatinAmerica to edit their products images using Stable Diffusion, generating attractive backgrounds for each product. Currently in production being used by thousands of users every hour.\nFashion Recommendation System: designed and built MVP for a Tinder like recsys for a fashion company. Involved building a feature extraction pipeline for garments using CLIP based classifiers and a Reinforcement Learning algorithm.\nVirtual Try On: for thi

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """
    Split documents into overlapping chunks.

    Args:
        docs (list): Documents handed to the splitter's ``split_documents``.
        chunk_size (int): Forwarded to the splitter as ``chunk_size``.
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# Wrap each file's text in a Document, recording the file name as its source
docs = []
for nombre, contenido in full_documents.items():
    docs.append(Document(page_content=contenido, metadata={"source": nombre}))

# Split the documents with chunk_size=1000 and chunk_overlap=50
documents = chunk_data(docs=docs, chunk_size=1000, chunk_overlap=50)

documents

[Document(metadata={'source': 'octavio_cv.odt'}, page_content="Octavio Deshays\nMechatronics Engineer - National University of Cuyo\nMendoza, Argentina\n22/12/1997\n+54 9 2615538396\noctaviodeshays@gmail.com\nln: Octavio Deshays Moreno\nEXPERIENCE\nMARVIK, Uruguay — Machine Learning Engineer\nDecember 2022 - Present\nMarvik is a hands-on ML consulting firm. In my role, I am involved in the entire process of developing an AI solution, from identifying the customer's problem to implementing the solution.\nProjects: \nPhotoStudio Editor: an app to allow sellers from the largest E-Commerce in LatinAmerica to edit their products images using Stable Diffusion, generating attractive backgrounds for each product. Currently in production being used by thousands of users every hour.\nFashion Recommendation System: designed and built MVP for a Tinder like recsys for a fashion company. Involved building a feature extraction pipeline for garments using CLIP based classifiers and a Reinforcement Lea

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def generar_embeddings(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Generate one embedding per chunk with a HuggingFace model.

    All chunk texts go through ``embed_documents`` in a single call, which is
    the embeddings method meant for passages that will be stored;
    ``embed_query`` is meant for search queries.

    Args:
        chunks (list): Chunks to embed; each must expose its text as
            ``page_content``.
        model_name (str): Name of the HuggingFace model to load.

    Returns:
        list: One embedding per chunk, in the same order as ``chunks``. An
            empty input returns ``[]`` without loading the model.

    Raises:
        ImportError: If the ``sentence_transformers`` package is not installed.
    """
    textos = [chunk.page_content for chunk in chunks]
    if not textos:
        # Nothing to embed, so skip loading the model.
        return []
    embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings_model.embed_documents(textos)

# Embed every chunk with the default HuggingFace model
embeddings = generar_embeddings(documents)

# Display the resulting list of embeddings
embeddings



  embeddings_model = HuggingFaceEmbeddings(model_name=model_name)


ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

### Embeddings with Pinecone

In [12]:
# Confirm the key was loaded without echoing the secret itself into the saved output:
# prints None when the key is missing and a fixed placeholder when it is set.
print(None if PINECONE is None else "<PINECONE API key set>")

None


In [7]:
from pinecone import Pinecone
from pinecone import ServerlessSpec
import time



# Create the Pinecone client from the API key imported from constants
pc = Pinecone(api_key=PINECONE)

# Pair every chunk's text with a sequential ID: vec1, vec2, ...
data = []
for i, doc in enumerate(documents):
    data.append({"id": f"vec{i+1}", "text": doc.page_content})

# Embed the chunk texts as passages through Pinecone's inference API
passage_texts = [record['text'] for record in data]
embed_parameters = {"input_type": "passage", "truncate": "END"}
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=passage_texts,
    parameters=embed_parameters,
)

print(embeddings)

  from tqdm.autonotebook import tqdm


EmbeddingsList(
  model='multilingual-e5-large',
  vector_type='dense',
  data=[
    {'vector_type': dense, 'values': [0.031158447265625, -0.016265869140625, ..., -0.0426025390625, -0.01401519775390625]},
    {'vector_type': dense, 'values': [0.002185821533203125, -0.040802001953125, ..., -0.0263671875, -0.00390625]},
    ... (2 more embeddings) ...,
    {'vector_type': dense, 'values': [0.012176513671875, -0.029937744140625, ..., -0.01898193359375, -0.0016937255859375]},
    {'vector_type': dense, 'values': [0.0273590087890625, -0.02130126953125, ..., -0.0164031982421875, -0.0007061958312988281]}
  ],
  usage={'total_tokens': 1097}
)


In [10]:
# Dimensionality of the first embedding vector
len(embeddings.data[0]['values'])

1024

In [11]:
# Create the serverless index only if it does not exist yet
index_name = "example-index"

index_exists = pc.has_index(index_name)
if not index_exists:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        # The index dimension must match the length of the embedding vectors
        dimension=len(embeddings.data[0]['values']),
        metric="cosine",
        spec=serverless_spec,
    )

# Poll once per second until the index reports itself as ready
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

In [12]:
# List the names of the existing indexes
pc.list_indexes().names()

['example-index']

In [13]:
# Reuse index_name from the index-creation cell so both cells always target the same index
index = pc.Index(index_name)

# zip() stops at the shorter input, so a length mismatch would silently drop records
assert len(data) == len(embeddings.data), "each text needs exactly one embedding"

# Prepare the records for upsert
# Each contains an 'id', the embedding 'values', and the original text as 'metadata'
records = [
    {
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    }
    for d, e in zip(data, embeddings)
]

# Upsert the records into the index
index.upsert(
    vectors=records,
    namespace="example-namespace"
)

{'upserted_count': 6}

In [14]:
# Show index statistics (dimension, namespaces and vector counts)
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'example-namespace': {'vector_count': 6}},
 'total_vector_count': 6}


In [18]:
# Free-text question to look up in the index
query = "work in marvik"

# Embed the query with the same model that was used for the passages
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={"input_type": "query"},
)
query_vector = query_embedding[0].values

# Fetch the single closest match (top_k=1), returning its metadata but not its vector
results = index.query(
    namespace="example-namespace",
    vector=query_vector,
    top_k=1,
    include_values=False,
    include_metadata=True,
)

print(results)

{'matches': [{'id': 'vec1',
              'metadata': {'text': 'Octavio Deshays\n'
                                   'Mechatronics Engineer - National '
                                   'University of Cuyo\n'
                                   'Mendoza, Argentina\n'
                                   '22/12/1997\n'
                                   '+54 9 2615538396\n'
                                   'octaviodeshays@gmail.com\n'
                                   'ln: Octavio Deshays Moreno\n'
                                   'EXPERIENCE\n'
                                   'MARVIK, Uruguay — Machine Learning '
                                   'Engineer\n'
                                   'December 2022 - Present\n'
                                   'Marvik is a hands-on ML consulting firm. '
                                   'In my role, I am involved in the entire '
                                   'process of developing an AI solution, from '
                      