In [2]:
from flask import Flask, request, jsonify 
import os
import re
import pypdf
import pandas as pd
import chromadb
import google.generativeai as palm
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
from typing import List
import speech_recognition as sr
import json

# SECURITY: never store an API key in the notebook. A key that was ever saved in
# this file must be treated as leaked and revoked in the provider console.
# The key is taken from the environment; if it is missing, ask for it
# interactively so it never ends up in the saved notebook source.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass  # local import: only needed when prompting

    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Read the text content of a PDF file and return it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The extracted text of every page, concatenated in page order.
      No separator is inserted between pages.
    """
    reader = PdfReader(file_path)

    # A page whose extraction yields None (or an empty value) contributes
    # nothing; without the guard, concatenating None to a str raises TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Extract the full text of the source PDF once and keep it in `pdf_text`.
# Replace "mati.pdf" with the path to your own file.
pdf_text = load_pdf(file_path="mati.pdf")

In [4]:
def split_text(text: str, pattern: str = '\n \n'):
    r"""
    Split a text string into a list of non-blank chunks.

    The default pattern "\n \n" (newline, space, newline) splits the document
    paragraph by paragraph for PDFs whose extracted text marks paragraph breaks
    that way. Pass another regular expression (for example r"\n\s*\n") when the
    extracted text uses a different blank-line layout.

    Parameters:
    - text (str): The input text to be split.
    - pattern (str): Regular expression used as the chunk separator.

    Returns:
    - List[str]: The chunks in document order. Chunks that are empty or contain
      only whitespace are dropped, since they carry nothing worth embedding.
    """
    pieces = re.split(pattern, text)
    # `piece` can be None when `pattern` has a capture group that did not match.
    return [piece for piece in pieces if piece and piece.strip()]

# Break the extracted PDF text into chunks using split_text's separator pattern.
chunked_text = split_text(text=pdf_text)

In [5]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function that delegates to the Gemini embedding API.

    Instances are callable: they take the documents handed over by Chroma and
    return the embeddings produced by `genai.embed_content`.
    """
    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed `input` with the "models/embedding-001" model.

        Parameters:
        - input (Documents): A collection of documents to be embedded.

        Returns:
        - Embeddings: The "embedding" entry of the API response.

        Raises:
        - ValueError: If the GEMINI_API_KEY environment variable is unset or empty.
        """
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

        # The client is (re)configured on every call, so the key is always read
        # from the current environment.
        genai.configure(api_key=api_key)
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [6]:
def create_chroma_db(documents:List, path:str, name:str, batch_size:int = 100):
    """
    Create (or reuse) a persistent Chroma collection and index the documents in it.

    The collection is obtained with `get_or_create_collection` and filled with
    `upsert`, so re-running the cell against an existing directory refreshes the
    entries instead of failing. Documents are written in batches rather than one
    call per document, which reduces the number of embedding requests.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
      Each document gets its position in the iterable (as a string) as its id.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.
    - batch_size (int): Maximum number of documents sent per upsert call.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the Chroma Collection and its name.

    Note: entries left over from an earlier, longer document list are not
    removed, because only ids 0..len(documents)-1 are rewritten.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_or_create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    docs = list(documents)
    step = max(1, batch_size)  # guard against a zero/negative batch size
    # An empty document list simply produces no upsert call.
    for start in range(0, len(docs), step):
        batch = docs[start:start + step]
        db.upsert(documents=batch,
                  ids=[str(start + offset) for offset in range(len(batch))])

    return db, name

# Index the chunks into a Chroma collection named "rag_experiment",
# persisted under the "RAG1" path.
db,name =create_chroma_db(documents=chunked_text, 
                          path="RAG1", #replace with your path
                          name="rag_experiment")

In [8]:
def load_chroma_collection(path, name):
    """
    Open an already-existing Chroma collection from a persistent directory.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The collection, bound to a GeminiEmbeddingFunction
      so that queries are embedded the same way as the stored documents.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

# Re-open the persisted "rag_experiment" collection from the "RAG1" path.
db=load_chroma_collection(path="RAG1", name="rag_experiment")

In [9]:
def get_relevant_passage(query, db, n_results):
    """
    Query the collection and return the document chunks retrieved for `query`.

    Parameters:
    - query (str): The text to search for.
    - db: The collection to query (anything exposing a Chroma-style `query`).
    - n_results (int): How many results to request.

    Returns:
    - The list of documents returned for the query.
    """
    results = db.query(query_texts=[query], n_results=n_results)
    # Exactly one query text was sent, so its hits are the first 'documents' entry.
    return results['documents'][0]

# Example usage: request up to 3 chunks for the query "Intellykeys".
relevant_text = get_relevant_passage(query="Intellykeys",db=db,n_results=3)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


In [23]:
def escuchar_mic():
    """
    Return the "microphone context" text used when building the prompt.

    This is currently a stub: it returns a fixed Spanish sentence instead of
    listening to the microphone. The speech-recognition implementation is kept
    below, commented out, for reference.

    Returns:
    - str: The hard-coded context sentence.
    """
    # Disabled microphone implementation (speech_recognition, Spanish locale):
    # recognizer = sr.Recognizer()
    # with sr.Microphone() as source:
    #     print("Escuchando...")
    #     try:
    #         audio = recognizer.listen(source, phrase_time_limit=5)
    #         texto = recognizer.recognize_google(audio, language="es-ES")
    #         print(f"Texto reconocido: {texto}")
    #         return texto
    #     except sr.UnknownValueError:
    #         print("No se pudo reconocer el audio.")
    #         return None
    #     except sr.RequestError as e:
    #         print(f"Error en la solicitud de reconocimiento: {e}")
    #         return None
    return "¿Cómo ha influido el uso del dispositivo IntellyKeys USB en el proceso creativo y profesional de Matías Duncan Federico, y qué importancia tiene encontrar un dispositivo alternativo para continuar con su desarrollo en el diseño multimedial?"


In [24]:

def make_rag_prompt(query, texto_mic, relevant_passage):
    """
    Build the text-completion prompt sent to the generative model.

    The retrieved passage and the microphone context are stripped of quote
    characters and have newlines replaced by spaces before being placed in their
    single-quoted slots. The user's text (`query`) is inserted verbatim.

    Parameters:
    - query (str): The partial user text to be completed.
    - texto_mic (str): Context text coming from the microphone.
    - relevant_passage (str): Reference passage retrieved from the collection.

    Returns:
    - str: The fully formatted prompt, ending with the "COMPLETION:" marker.
    """
    def _flatten(text):
        # Remove both quote styles and flatten newlines so the value cannot
        # break out of the quoted slot it is placed in.
        return text.replace("'", "").replace('"', "").replace("\n", " ")

    escaped_passage = _flatten(relevant_passage)
    escaped_texto_mic = _flatten(texto_mic)
    # The trailing backslashes join the instruction lines into a single line.
    prompt = ("""You are a helpful SPANISH and knowledgeable assistant that helps complete text using the reference passage and microphone context included below. \
  Ensure your response is fluent, coherent, and relevant to the user's text, adding value and depth where appropriate. \
  You are assisting a general audience, so focus on clarity, providing thoughtful and well-structured continuations or expansions of the given text. Maintain a friendly, conversational tone. \
  Give considerable importance to the context provided by the microphone. Generate ALWAYS three options of the completed text for the user to choose from. \
  Each one should be unique and different from the others and NOT MUCH LONGER than the user's text. \
  If the passage does not provide relevant information, you may complete the user's text based on your understanding. \
  Your responses have to be like this one: \
  1. user_text + your response \
  2. user_text + your response \
  3. user_text + your response \
          
  USER TEXT: '{query}'
  PASSAGE: '{escaped_passage}'
  MICROPHONE CONTEXT: '{escaped_texto_mic}'

  COMPLETION:
  """).format(query=query, escaped_passage=escaped_passage, escaped_texto_mic=escaped_texto_mic)

    return prompt

In [25]:
import google.generativeai as genai
def generate_answer_by_prompt(prompt):
    """
    Send `prompt` to the 'gemini-pro' generative model and return its text reply.

    Parameters:
    - prompt (str): The complete prompt to submit.

    Returns:
    - The `text` attribute of the model's response.

    Raises:
    - ValueError: If the GEMINI_API_KEY environment variable is unset or empty.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

    genai.configure(api_key=api_key)
    response = genai.GenerativeModel('gemini-pro').generate_content(prompt)
    return response.text

In [26]:
def generate_answer(db, query, n_results=3):
    """
    Run the full pipeline: microphone context, retrieval, prompt, generation.

    Parameters:
    - db: The collection to retrieve reference chunks from.
    - query (str): The partial user text to be completed.
    - n_results (int): How many chunks to request from the collection (default 3).

    Returns:
    - The generated answer returned by `generate_answer_by_prompt`.
    """
    texto_mic = escuchar_mic()
    relevant_text = get_relevant_passage(query, db, n_results=n_results)
    # Join the chunks with a space so the last word of one chunk does not fuse
    # with the first word of the next inside the single passage.
    prompt = make_rag_prompt(query,
                             texto_mic,
                             relevant_passage=" ".join(relevant_text))
    return generate_answer_by_prompt(prompt)

In [27]:
# Re-open the persisted collection, then complete the partial text "El uso "
# through the full pipeline and print the generated options.
db=load_chroma_collection(path="RAG1", #replace with path of your persistent directory
                          name="rag_experiment") #replace with the collection name

answer = generate_answer(db,"El uso ")
print(answer)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


1. El uso del dispositivo IntellyKeys USB ha sido fundamental en el proceso creativo y profesional de Matías Duncan Federico, permitiéndole expresarse y comunicarse a través del diseño y el video. El dispositivo le ha dado la libertad de trabajar con independencia, adaptando nuevas tecnologías a sus necesidades. Encontrar un dispositivo alternativo es crucial para que pueda continuar desarrollando su potencial en el campo del diseño multimedial.
2. El dispositivo IntellyKeys USB ha brindado a Matías Duncan Federico una herramienta indispensable para crear y editar videos, así como para diseñar portadas atractivas. Le ha permitido superar las limitaciones físicas y expresar su creatividad sin barreras. Encontrar un dispositivo alternativo es esencial para que continúe su trayectoria profesional y alcance su máximo potencial en el diseño multimedial.
3. El dispositivo IntellyKeys USB ha sido un aliado invaluable para Matías Duncan Federico, permitiéndole superar las barreras físicas y de

In [32]:
from sklearn.metrics import accuracy_score

# Evaluation fixtures: each entry pairs a partial user text ("query") with the
# three-option completion expected for it ("expected_answer").
test_data = [
    {"query": "Hola, ", "expected_answer": """
    1. 'Hola Pedro,'
    2. '¡Hola Pedro!'
    3. 'Hola Pedro, soy'
    """},
    {"query": "¿Cómo estás?", "expected_answer": """
     1. Hola Pedro! ¿Cómo estás? 👋
    2. ¡Hola, Pedro! ¿Todo bien? 👌
    3. ¿Cómo te va, Pedro? ¿Qué tal todo? 😎
     """}
]

# Generate answers for every test case and score them against the expectations.
def evaluate_model(test_data, db):
    """
    Compute the exact-match accuracy of generated answers over `test_data`.

    Both the expected and the generated text are whitespace-normalised before
    comparison: the expected answers come from indented triple-quoted literals,
    so without normalisation their leading/trailing newlines and indentation
    would make a match impossible.

    Parameters:
    - test_data: Iterable of dicts with "query" and "expected_answer" keys.
    - db: The collection passed through to `generate_answer`.

    Returns:
    - The value returned by `accuracy_score` for the normalised answers.

    Note: every test case calls `generate_answer`, i.e. one model request each.
    """
    def _normalize(text):
        # Collapse every run of whitespace (including newlines) to one space.
        return " ".join(text.split())

    y_true = []
    y_pred = []

    for data in test_data:
        # Generate the answer with the pipeline under evaluation.
        generated_answer = generate_answer(db, data["query"])

        y_true.append(_normalize(data["expected_answer"]))
        y_pred.append(_normalize(generated_answer))

    # Fraction of cases whose generated text equals the expected text.
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Load the Chroma collection
db = load_chroma_collection(path="RAG1", name="rag_experiment")

# Evaluate the model. Each test case goes through generate_answer, so this
# issues API requests per case; the recorded run below ended in a
# 429 "Resource has been exhausted" quota error.
accuracy = evaluate_model(test_data, db)
print(f'Precisión del modelo: {accuracy}')

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).