# Procesado de PDFs y RAG

In [None]:
%pip install pymupdf==1.23.22
%pip install transformers tiktoken

# Librerías

In [2]:
import fitz
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import tiktoken

from dotenv import load_dotenv
from openai import AzureOpenAI

## Nos bajamos un pdf

In [None]:
# Get PDF document
local_filename = "human_nutrition.pdf"
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

# Download the PDF only when there is no local copy yet, so re-running the
# cell does not hit the network again.
if not os.path.exists(local_filename):
    print(f"File {local_filename} doesn't exist, downloading...")

    # GET request to the URL. requests applies no timeout by default, so pass
    # one explicitly: otherwise an unresponsive server blocks the cell forever.
    response = requests.get(url, timeout=60)

    if response.status_code == 200:
        # Save content (binary mode: the payload is a PDF, not text)
        with open(local_filename, "wb") as file:
            file.write(response.content)
        print(f"File saved as {local_filename}")
    else:
        print(f"Failed. Status code: {response.status_code}")
else:
    print(f"File {local_filename} exists.")

## Exploramos el archivo

In [None]:
def text_formatter(text: str) -> str:
    """Flatten *text* onto a single line.

    Every newline character becomes one space, then leading and trailing
    whitespace is trimmed. Interior runs of spaces are left as they are.
    The right clean-up may differ per document, so further formatting
    steps can be added here.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

# Only text
def open_and_read_pdf(pdf_path: str):
    """Extract the text of every page of a PDF together with simple size stats.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one dict per page, in document order, with the keys:
        ``page_number`` (0-based position of the page in the document),
        ``page_word_count`` (number of whitespace-separated words),
        ``page_token_count`` (number of tokens produced by the tiktoken
        encoding for ``"gpt-3.5-turbo"``) and ``text`` (the page text after
        ``text_formatter``).
    """
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    pages_and_texts = []

    # Open the document in a ``with`` block so it is closed once all pages
    # have been read, even if extraction or tokenization raises part-way.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc):
            # Plain text of the page, flattened by text_formatter
            text = text_formatter(page.get_text())

            # Tokenize text and calculate stats
            tokens = tokenizer.encode(text)
            word_count = len(text.split())  # Simple whitespace word count

            pages_and_texts.append({
                "page_number": page_number,
                "page_word_count": word_count,
                "page_token_count": len(tokens),
                "text": text
            })
    return pages_and_texts

# Extract every page of the downloaded PDF into a list of per-page records.
pages_and_texts = open_and_read_pdf(pdf_path=local_filename)

print(f"The file has {len(pages_and_texts)} pages")
# Trailing expression: preview the first two page records as the cell output.
pages_and_texts[:2]

## El archivo tiene muchas páginas, pero algunas no tienen texto.
### No tiene sentido indexar páginas vacías

In [None]:
# Keep only the pages whose text produced at least one token; pages without
# tokens have nothing to index.
pages_and_texts = [
    entry
    for entry in pages_and_texts
    if entry["page_token_count"] > 0
]
len(pages_and_texts)

In [None]:
# Tabular view of the page records, indexed by page number, followed by
# summary statistics rounded to one decimal place.
pages_df = pd.DataFrame(pages_and_texts).set_index("page_number")
pages_df.describe().round(1)

### Es importante analizar la distribución de tokens por página para ver si es necesario hacer chunking
El chunking es necesario cuando una página supera el máximo de tokens que admiten los modelos de embeddings (en torno a 8000 tokens).

In [None]:
# Histogram (30 bins) of the per-page token counts, to see how page sizes
# are distributed.
pages_df['page_token_count'].hist(bins=30, alpha=0.7, label='Page Token Count')
plt.legend()
plt.show()

## En este caso, estamos muy lejos del límite de tokens de los modelos
Tenemos que solapar las páginas, de tal manera que cada página incluya un trozo de la anterior y otro de la siguiente, para evitar que un bloque de texto quede cortado entre dos páginas. <br>
Un 20% es un valor de referencia común

In [8]:
def create_contextual_texts(input_texts, overlap_ratio=0.2):
    """
    Build a list of texts, each padded with context from its neighbours.

    Every output element is the tail of the previous text, the current text
    and the head of the next text, joined with single spaces and stripped.
    The overlap is split evenly: ``overlap_ratio / 2`` of the previous text's
    characters and ``overlap_ratio / 2`` of the next text's characters.
    The first element has no previous context and the last has no next
    context.

    Args:
        input_texts (list): List of texts (one per element).
        overlap_ratio (float): Total proportion of neighbouring text to
            include, half from each side (default 0.2 = 20%).

    Returns:
        list: Texts combined with their overlap, same length and order as
        ``input_texts``.
    """
    last_index = len(input_texts) - 1
    output_texts = []

    for i, current_text in enumerate(input_texts):
        # Previous context: half of the overlap, taken from the end of the
        # previous text when there is one.
        previous_context = ""
        if i > 0:
            previous_text = input_texts[i - 1]
            tail_length = int(len(previous_text) * overlap_ratio / 2)
            # A zero length must be handled explicitly: text[-0:] is the
            # WHOLE string, which would prepend the entire previous text
            # whenever it is short or overlap_ratio is 0.
            if tail_length > 0:
                previous_context = previous_text[-tail_length:]

        # Next context: half of the overlap, taken from the start of the
        # next text when there is one.
        next_context = ""
        if i < last_index:
            next_text = input_texts[i + 1]
            next_context = next_text[:int(len(next_text) * overlap_ratio / 2)]

        # Combine the contexts with the current text; strip() removes the
        # separator spaces left behind by an empty context.
        combined_text = f"{previous_context} {current_text} {next_context}".strip()
        output_texts.append(combined_text)

    return output_texts


In [None]:
# Total share of neighbouring text to borrow; passed to
# create_contextual_texts below.
overlap_ratio = 0.2

# Keep only the text of each page record.
input_texts = [page['text'] for page in pages_and_texts]

overlapped_texts = create_contextual_texts(input_texts, overlap_ratio)
# Show one example. Index 80 is presumably an arbitrary sample; it requires
# the document to have produced at least 81 texts.
overlapped_texts[80]

#### En este ejemplo, vemos que muchos textos tienen un número pequeño de tokens.
#### Concatenaremos páginas sucesivas con pocos tokens para obtener textos de longitud similar

In [None]:
# Tokenizer that tiktoken associates with the "gpt-3.5-turbo" model name.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") 

# Compute the token length of each overlapped text and keep it next to the text
input_texts_and_tokens = [ { 'text': text, 'token_size': len(tokenizer.encode(text)) } for text in overlapped_texts ]

# Trailing expression: preview the first three records as the cell output.
input_texts_and_tokens[:3]

In [None]:

# Token budget used when grouping consecutive texts into blocks
max_tokens = 1000

def concatenate_documents(docs, max_tokens):
    """
    Concatenate consecutive runs of documents until their token size reaches max_tokens.

    Documents are added to the current group, in order, while the sum of their
    ``token_size`` values stays within ``max_tokens``; the first document that
    would exceed the budget closes the group and starts a new one. The texts
    of a group are joined with a single space. A single document whose
    ``token_size`` is already above ``max_tokens`` is not split: it forms a
    group of its own. ``token_size`` of a group is the sum of its members'
    sizes, not a re-tokenization of the joined text.

    Args:
        docs (list): List of documents, each with 'text' and 'token_size'.
        max_tokens (int): Maximum token size for each concatenated group.

    Returns:
        list: A new list of concatenated documents (new dicts with 'text'
        and 'token_size'); the input dicts are not modified.
    """
    concatenated_docs = []
    group_texts = []
    group_tokens = 0

    for doc in docs:
        # The document still fits in the current group's budget
        if group_tokens + doc["token_size"] <= max_tokens:
            group_texts.append(doc["text"])
            group_tokens += doc["token_size"]
            continue

        # Budget exceeded: close the current group. It is empty only when the
        # very first document is already oversized, and an empty group must
        # not be emitted.
        if group_texts:
            concatenated_docs.append(
                {"text": " ".join(group_texts), "token_size": group_tokens}
            )
        # Start a new group with the current document
        group_texts = [doc["text"]]
        group_tokens = doc["token_size"]

    # Append the last group if it has content
    if group_tokens > 0:
        concatenated_docs.append(
            {"text": " ".join(group_texts), "token_size": group_tokens}
        )

    return concatenated_docs


# Group consecutive text records, using max_tokens as the per-group budget.
concatenated_input_texts_and_tokens = concatenate_documents(input_texts_and_tokens, max_tokens)

# Display result (first group only)
concatenated_input_texts_and_tokens[0]

In [None]:
# Report how many blocks the non-empty pages were merged into.
print(f"Hemos reducido un documento de {len(pages_and_texts)} paginas a {len(concatenated_input_texts_and_tokens)} bloques de un maximo de {max_tokens} tokens")

## Analizamos ahora la nueva distribución de tokens

In [None]:
# Token size of every concatenated block.
token_sizes = [entry["token_size"] for entry in concatenated_input_texts_and_tokens]

# Two side-by-side histograms sharing the y axis, so frequencies are
# directly comparable between the two panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

# Left panel: per-page token counts before concatenation.
axes[0].hist(pages_df['page_token_count'], bins=30, range=(0, 1000), edgecolor='black', alpha=0.7)
axes[0].set_title("Histogram of Token Sizes before optimization")
axes[0].set_xlabel("Token Size")
axes[0].set_ylabel("Frequency")

# Right panel: token sizes of the concatenated blocks. Both panels use the
# same 0-1000 range and bin count; values outside that range are not drawn.
axes[1].hist(token_sizes, bins=30, edgecolor='black', range=(0, 1000), alpha=0.7)
axes[1].set_title("Histogram of Token Sizes after optimization")
axes[1].set_xlabel("Token Size")
axes[1].set_ylabel("Frequency")

plt.tight_layout()
plt.show()

## Lo siguiente es calcular los embeddings de cada bloque

In [None]:
# Load variables from a .env file into the process environment.
# NOTE(review): override=True should make values from the file win over
# variables that are already set - confirm against python-dotenv docs.
load_dotenv(override=True)

# Connection settings are read from the environment; os.getenv returns None
# for any variable that is not defined.
az_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT_URL")
az_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
az_openai_embeddings_deployment_name = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME")

# Initialize the Azure OpenAI client
az_openai_client = AzureOpenAI(
    azure_endpoint=az_openai_endpoint,
    api_key=az_openai_api_key,
    api_version="2024-05-01-preview",
)

# Trailing expression: show which embeddings deployment name was picked up.
az_openai_embeddings_deployment_name

In [None]:
def calculate_embeddings(text, client=az_openai_client):
    """Return the embedding computed for *text*.

    Calls ``client.embeddings.create`` with the deployment name held in the
    module-level ``az_openai_embeddings_deployment_name`` and returns the
    ``embedding`` attribute of the first item in the response's ``data``.

    Args:
        text: Input passed as ``input`` to the embeddings endpoint.
        client: Client used for the request. The default is the
            ``az_openai_client`` object that existed when this function
            was defined.

    Returns:
        The ``embedding`` value of the first response item.
    """
    response = client.embeddings.create(
        model=az_openai_embeddings_deployment_name,
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding

# Smoke test: request the embedding of a short sample sentence
vector = calculate_embeddings("En un lugar de la Mancha")

# Trailing expression: display the returned embedding as the cell output.
vector

### Tenemos que calcular los embeddings de la lista de texto anterior

In [None]:
# One record per concatenated block: its position in the list, its text and
# the embedding computed for that text (one calculate_embeddings call each).
text_and_embeddings = [
    {
        "block_id": block_id,
        "text": block["text"],
        "embeddings": calculate_embeddings(block["text"]),
    }
    for block_id, block in enumerate(concatenated_input_texts_and_tokens)
]
text_and_embeddings[0]

## Salvamos los embeddings a un archivo

In [None]:
# Destination file, relative to the current working directory.
output_file = "text_and_embeddings.json"

# Write the list to a JSON file ("w" mode replaces any existing file)
with open(output_file, "w") as file:
    json.dump(text_and_embeddings, file)

print(f"Data saved to {output_file}")

In [None]:
def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.

    Args:
        vec1 (np.array): First vector (any 1-D array-like works).
        vec2 (np.array): Second vector, same length as ``vec1``.

    Returns:
        float: Cosine similarity score: the dot product divided by the
        product of the Euclidean norms. Returns 0.0 when either vector has
        zero norm, because the cosine is undefined there.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Without this guard a zero vector yields 0/0 = NaN (plus a
    # RuntimeWarning), and NaN scores make any later sort by similarity
    # unreliable.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

def find_most_similar(input_text_embedding, data):
    """
    Rank every entry of *data* by cosine similarity to an input embedding.

    Args:
        input_text_embedding (list or np.array): Embedding for the input text.
        data (list): List of dictionaries with 'text' and 'embeddings' fields.

    Returns:
        list: ``(text, similarity)`` tuples for all entries, ordered from the
        highest similarity to the lowest.
    """
    scored = [
        (item["text"], cosine_similarity(input_text_embedding, np.array(item["embeddings"])))
        for item in data
    ]

    # Best match first
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


# Example: retrieve the blocks closest to a sample query

# Embedding of the sample query text
input_text_embedding = calculate_embeddings("Consumo recomendado de proteinas")

# Rank all stored blocks against the query embedding
most_similar_results = find_most_similar(input_text_embedding, text_and_embeddings)

# Display the two best-scoring blocks with their similarity
for text, similarity in most_similar_results[:2]:
    print(f"Text: {text}, Similarity: {similarity:.4f}")


## Ahora podemos crear un RAG

In [None]:
# System message for the chat model: answer from the supplied context only,
# briefly, citing sources, in the language of the question.
SYSTEM_PROMPT = """
You are an expert of human nutrition. 
Provide feedback based on the context provided to the user questions.
Limit your responses to the context provided.
Be brief in your responses, citing sources. Ideally in 4 or 5 sentences if that number delivers a complete answer.
Respond in the same language as the user question
"""

# Embed the question with the same function used for the document blocks
user_question = "Cantidad maxima de grasa saturada diaria en adultos"
user_question_embedding = calculate_embeddings(user_question)

# All blocks ranked by similarity to the question, best first
most_similar_results_for_context = find_most_similar(user_question_embedding, text_and_embeddings)

# Trailing expression: show the best-matching (text, similarity) pair.
most_similar_results_for_context[0]

## Por curiosidad, representamos gráficamente la puntuación de similitud de los diferentes resultados

In [None]:
# Similarity component of each (text, similarity) pair, kept in ranked order.
similarity_scores = [text_and_score_tuple[1] for text_and_score_tuple in most_similar_results_for_context]


# Plot the values: one bar per ranked result, x positions starting at 1
plt.figure(figsize=(10, 5))
plt.bar(range(1, len(similarity_scores) +1), similarity_scores, color='skyblue', edgecolor='black')
plt.title("Bar Plot of Values")
plt.xlabel("Documents")
plt.ylabel("Values")
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.tight_layout()

# Show the plot
plt.show()

## Ya tenemos el contexto adecuado para la pregunta
### Llamamos a Azure OpenAI con grounding

In [None]:
# Number of top-ranked blocks whose text is passed to the model as context.
messages_in_context = 1

# Join the texts of the selected blocks into one context string.
context = ' '.join([text for text, score in most_similar_results_for_context[0:messages_in_context]])

# The question and the retrieved context travel together in the user message.
messages  = [
    {"role": "system", "content": SYSTEM_PROMPT },
    {"role": "user",   "content": f"QUESTION: {user_question} - CONTEXT: {context}"}
]

# Chat deployment name comes from the environment (None if it is not set).
az_openai_completions_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

response = az_openai_client.chat.completions.create(
            model=az_openai_completions_deployment_name,
            messages=messages
        )

# Trailing expression: display the text of the first returned choice.
response.choices[0].message.content