# Ejercicio 9: Uso de la API de Google Gemini

En este ejercicio vamos a aprender a utilizar la API de Google Gemini.

## 1. Uso básico

El siguiente código sirve para conectarse con la API de Google Gemini de forma básica

In [None]:
!pip install python-dotenv

In [None]:
import os

from dotenv import load_dotenv

# Load the variables declared in a local .env file (if there is one) into the
# process environment, so secrets stay out of the notebook itself.
load_dotenv()

# API key for the Gemini client; None when API_KEY is not defined.
api_key = os.environ.get("API_KEY")

In [None]:
import os

from google import genai

# Never hard-code credentials in a notebook: anything committed here is
# readable by everyone with access to the file or its history. The key is
# taken from the API_KEY environment variable instead (populated from .env by
# load_dotenv() in the previous cell).
# NOTE: the key that used to be written on this line must be treated as
# compromised and revoked in the Google AI Studio / Cloud console.
gemini_api_key = os.getenv("API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "API_KEY is not set; define it in the environment or in a .env file."
    )

# Client used by every Gemini request in this notebook.
client = genai.Client(api_key=gemini_api_key)

# Minimal smoke test: send a single text prompt and print the text answer.
response = client.models.generate_content(
    model="gemini-3-flash-preview",
    contents="Explain how AI works in a few words"
)

print(response.text)

## 2. Retrieval

### 2.1 Cargo el corpus de 20 Newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch both splits of the corpus ("all") and ask scikit-learn to strip
# headers, footers and quoted replies so only the message body remains.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)

# Raw documents; this is the collection indexed in the rest of the notebook.
newsgroupsdocs = newsgroups.data

In [None]:
# Sanity check: show the container type and how many documents were loaded.
type(newsgroupsdocs), len(newsgroupsdocs)

In [None]:
import pandas as pd

# One row per document, with the raw text stored in a single "text" column.
df = pd.DataFrame({"text": newsgroupsdocs})

# Preview the first rows.
df.head()

### 2.2 Transformo a embeddings

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re

# Discard rows whose text is missing, then renumber the rows from 0 so the
# index can be used as a document id when the chunks are built.
rows_with_text = df.dropna(subset=["text"])
df = rows_with_text.reset_index(drop=True)

# Basic clean-up
def normalize_text(s: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        s: Text to clean.

    Returns:
        The text with newlines, tabs and repeated blanks replaced by single
        spaces and without leading or trailing whitespace.
    """
    single_spaced = re.sub(r"\s+", " ", s)
    return single_spaced.strip()

# Make sure every value is a string, then store its whitespace-normalised
# version in a new "text_norm" column.
text_as_str = df["text"].astype(str)
df["text_norm"] = text_as_str.map(normalize_text)

# Preview the result.
df.head()

In [None]:
def chunk_text(text: str, max_chars: int = 800, overlap: int = 100) -> list[str]:
    """Split ``text`` into overlapping, character-based chunks.

    Each window covers at most ``max_chars`` characters and the next window
    starts ``overlap`` characters before the end of the previous one, which
    helps to avoid cutting an idea in half. Every chunk is stripped, and
    chunks that are empty after stripping are dropped.

    Args:
        text: Text to split.
        max_chars: Maximum window size in characters; values around
            600-1000 usually work well.
        overlap: Number of characters shared by two consecutive windows.
            Must be smaller than ``max_chars``.

    Returns:
        The list of chunks in document order (empty for empty or
        whitespace-only text).

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars``. With such values the window start
            would never advance and the loop below would not terminate.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if overlap >= max_chars:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    chunks = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == n:
            # The window reached the end of the text: nothing left to cover.
            break
        # Step back by `overlap` so consecutive chunks share some context.
        start = max(0, end - overlap)
    return chunks

# One record per chunk: the row index of the source document, the position of
# the chunk inside that document and the chunk text itself.
records = [
    {
        "doc_id": int(doc_idx),
        "chunk_id": chunk_idx,
        "text": piece
    }
    for doc_idx, doc_text in df["text_norm"].items()
    for chunk_idx, piece in enumerate(
        chunk_text(doc_text, max_chars=800, overlap=100)
    )
]

chunks_df = pd.DataFrame(records)

# Preview the first chunks together with the total number of chunks.
chunks_df.head(), len(chunks_df)

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model, recommended for retrieval.
MODEL_NAME = "intfloat/e5-base-v2"
model = SentenceTransformer(MODEL_NAME)

# Texts to index (passages): every chunk is prefixed with "passage: ", the
# counterpart of the "query: " prefix that embed_query puts on queries.
chunk_texts = chunks_df["text"].tolist()
passages = ["passage: " + chunk for chunk in chunk_texts]

In [None]:
# Embeddings matrix (N x D): one row per passage.
# normalize_embeddings=True is required so that similarity can be computed as
# cosine similarity.
encode_options = dict(
    batch_size=16,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
raw_embeddings = model.encode(passages, **encode_options)

# Store the vectors as float32.
embeddings = raw_embeddings.astype("float32")

In [None]:
# Sanity check: matrix shape and element type of the computed embeddings.
emb_shape, emb_dtype = embeddings.shape, embeddings.dtype
print(emb_shape, emb_dtype)

### 2.3 Creo una query y hago la búsqueda

In [None]:
def embed_query(query: str) -> np.ndarray:
    """Embed a search query with the same model used for the passages.

    The query is prefixed with "query: " (indexed chunks carry "passage: ")
    and encoded as a one-element batch with normalised embeddings.

    Args:
        query: Free-text search query.

    Returns:
        The float32 embedding of the query; because a one-element list is
        encoded, the result is expected to have one row.
    """
    prefixed_batch = ["query: " + query]
    encoded = model.encode(
        prefixed_batch,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return encoded.astype("float32")

# Free-text query used for the retrieval demo.
query_text = "Battery measuring"

# Embed the query so it can be compared against the chunk embeddings.
query_vec = embed_query(query_text)
# Display the shape of the query embedding.
query_vec.shape

Obtengo los 5 documentos más similares a mi query

In [None]:
!pip install faiss-cpu

import numpy as np
import faiss

# Embedding dimensionality (number of columns of the embeddings matrix).
D = embeddings.shape[1]

# Flat inner-product (IP) FAISS index. The embeddings were normalised when
# they were encoded, so the inner product is the cosine similarity.
index = faiss.IndexFlatIP(D)

# Add every chunk embedding; row i of the index corresponds to passages[i].
index.add(embeddings)

# Number of documents to retrieve.
k = 5

# Run the search: row 0 of each result holds the scores and the row ids of
# the best matches for our single query.
distances, indices = index.search(query_vec, k)

print(f"Top {k} documentos más relevantes para la consulta '{query_text}':")
for rank, (doc_index, score) in enumerate(zip(indices[0], distances[0]), start=1):
    # FAISS fills the result with id -1 when the index holds fewer than k
    # vectors; passages[-1] would silently print the last passage instead.
    if doc_index < 0:
        continue
    print(f"\n--- Documento {rank} (Score: {score:.4f}) ---")
    print(f"{passages[doc_index]}")

Uso del LLM (Gemini) con los resultados anteriores

In [None]:
# Gather the passages retrieved for the query, best match first. Ids equal to
# -1 are FAISS padding (returned when fewer than k vectors are available) and
# must be skipped: passages[-1] would inject an unrelated passage into the
# context given to the model.
context_docs = [
    passages[doc_index]
    for doc_index in indices[0][:k]
    if doc_index >= 0
]

# Separate the passages with a blank line inside the prompt.
context = "\n\n".join(context_docs)

prompt = f"Based on the following documents and the query '{query_text}', provide a concise summary.\n\nDocuments:\n{context}"

# Ask Gemini to summarise the retrieved passages with respect to the query.
response_gemini = client.models.generate_content(
    model="gemini-3-flash-preview",
    contents=prompt
)

print("Summary from Gemini:")
print(response_gemini.text)