# Taller 6 - Búsqueda por Similaridad Coseno (Mini Google)

## Imports

In [1]:
import os
import sys
from pathlib import Path
from typing import List, Tuple

import numpy as np
import re

import nltk
from nltk.corpus import reuters as nltk_reuters

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Funciones de Carga de Datos

In [2]:
def read_txt_directory(data_dir: str) -> Tuple[List[str], List[str]]:
    """Read every ``.txt`` file located directly inside a directory.

    The scan is not recursive and the extension test is case-insensitive.
    Files are decoded as UTF-8; undecodable bytes are dropped.

    Args:
        data_dir: Path of the directory to scan.

    Returns:
        A ``(doc_ids, docs)`` pair of parallel lists: the file names in
        sorted order and the text content of each file.

    Raises:
        FileNotFoundError: If ``data_dir`` is missing or is not a directory.
        RuntimeError: If no ``.txt`` file was found.
    """
    root = Path(data_dir)
    # is_dir() is already False for a path that does not exist.
    if not root.is_dir():
        raise FileNotFoundError(f"Directorio no encontrado: {data_dir}")

    doc_ids, docs = [], []
    for name in sorted(os.listdir(root)):
        entry = root / name
        # Skip entries that only look like text files (e.g. a sub-directory
        # named "x.txt"); opening one would abort the whole load.
        if not name.lower().endswith(".txt") or not entry.is_file():
            continue
        doc_ids.append(name)
        docs.append(entry.read_text(encoding="utf-8", errors="ignore"))
    if not docs:
        raise RuntimeError(f"No se encontraron .txt en {data_dir}")
    return doc_ids, docs


def read_reuters_plain(path: str) -> Tuple[List[str], List[str]]:
    """Parse a Reuters "plain" dump (``reut2-XXXX.plain``) into documents.

    The file is split on ``<REUTERS ID="n">`` markers. Anything before the
    first marker is discarded; the text following each marker, stripped of
    surrounding whitespace, becomes one document.

    Args:
        path: Location of the plain file (decoded as UTF-8, bad bytes dropped).

    Returns:
        A ``(doc_ids, texts)`` pair of parallel lists, where each id has the
        form ``reut-<n>``.
    """
    content = Path(path).read_text(encoding="utf-8", errors="ignore")
    # With one capturing group, re.split yields
    # [preamble, id1, body1, id2, body2, ...].
    pieces = re.split(r'<REUTERS ID="(\d+)">\s*', content)

    doc_ids: List[str] = []
    texts: List[str] = []
    for number, body in zip(pieces[1::2], pieces[2::2]):
        doc_ids.append(f"reut-{number}")
        texts.append(body.strip())
    return doc_ids, texts


def read_nltk_reuters() -> Tuple[List[str], List[str]]:
    """Load the Reuters corpus through NLTK, fetching missing resources.

    Both the ``reuters`` corpus and the ``punkt`` tokenizer data are looked
    up first and downloaded only when the lookup fails.

    Returns:
        A ``(fileids, texts)`` pair: the corpus file ids and the raw text of
        each file, in the same order.

    Raises:
        RuntimeError: If the ``nltk`` or ``nltk_reuters`` names are ``None``.
    """
    if nltk is None or nltk_reuters is None:
        raise RuntimeError("NLTK/Reuters no está disponible. Instale nltk y ejecute nltk.download('reuters', 'punkt').")

    # (resource path to probe, package name to download when it is absent)
    required = (
        ("corpora/reuters", "reuters"),
        ("tokenizers/punkt", "punkt"),
    )
    for resource, package in required:
        try:
            nltk.data.find(resource)
        except LookupError:
            nltk.download(package)

    fileids = nltk_reuters.fileids()
    return fileids, [nltk_reuters.raw(fid) for fid in fileids]

## Funciones de Vectorización y Búsqueda

In [3]:
def build_vectorizer(ngram_max: int = 1) -> TfidfVectorizer:
    """Create the (unfitted) TF-IDF vectorizer used by the search engine.

    Args:
        ngram_max: Upper bound of the word n-gram range; the lower bound is
            always 1.

    Returns:
        A ``TfidfVectorizer`` configured for lower-cased word tokens, the
        built-in English stop-word list, ``max_df=0.9``, ``min_df=2``,
        ``float32`` output and L2 normalization.
    """
    options = dict(
        lowercase=True,
        analyzer="word",
        ngram_range=(1, ngram_max),
        stop_words="english",
        max_df=0.9,  # document-frequency ceiling handed to scikit-learn
        min_df=2,  # document-frequency floor handed to scikit-learn
        dtype=np.float32,
        norm="l2",
    )
    return TfidfVectorizer(**options)


def sparsity_report(X) -> str:
    """Summarize how sparse a document-term matrix is.

    Args:
        X: Object exposing ``shape`` (rows, columns) and ``nnz`` (number of
            stored values), as the TF-IDF matrix does.

    Returns:
        A one-line report with the document count, vocabulary size, stored
        values and density (as a fraction and as a percentage). Density is
        reported as 0 when the matrix has no cells.
    """
    n_docs, n_terms = X.shape[0], X.shape[1]
    stored = X.nnz
    cells = n_docs * n_terms
    # Guard the division for an empty matrix.
    density = stored / cells if cells > 0 else 0.0
    return (
        f"Docs: {n_docs:,} | Vocab: {n_terms:,} | NNZ: {stored:,} | "
        f"Densidad: {density:.6f} (~{density*100:.4f}%)"
    )


def search(
    query: str,
    vec: TfidfVectorizer,
    X,
    doc_ids: List[str],
    top_k: int = 10,
) -> List[Tuple[str, float]]:
    """Rank documents by cosine similarity to a free-text query.

    Args:
        query: Text to search for.
        vec: Fitted vectorizer used to project the query.
        X: Document-term matrix whose rows line up with ``doc_ids``.
        doc_ids: Identifier of each row of ``X``.
        top_k: Number of highest-scoring rows to consider.

    Returns:
        ``(doc_id, score)`` pairs in descending score order. Rows among the
        ``top_k`` best whose score is not strictly positive are left out, so
        the list may be shorter than ``top_k`` or empty.
    """
    query_vec = vec.transform([query])
    # One similarity per document, flattened to a 1-D array.
    scores = cosine_similarity(X, query_vec, dense_output=False).toarray().ravel()
    # Negating the scores makes the ascending argsort return best-first order.
    best = np.argsort(-scores)[:top_k]

    hits = []
    for idx in best:
        if scores[idx] > 0:
            hits.append((doc_ids[idx], float(scores[idx])))
    return hits

## Configuración y Carga del Corpus

In [4]:
# Corpus selection settings.
USE_NLTK = False  # only consulted when the Reuters plain file below is missing
DATA_DIR = "../data"  # fallback directory containing .txt documents
REUTERS_PLAIN = "../data/reut2-1000.plain"  # preferred source when this file exists
NGRAM_MAX = 1  # upper n-gram size handed to build_vectorizer

# Source priority: Reuters plain file first, then NLTK (if enabled), and
# finally the .txt directory. Note that an existing plain file wins even
# when USE_NLTK is True.
if REUTERS_PLAIN and Path(REUTERS_PLAIN).exists():
    print(f"Leyendo archivo Reuters plain: {REUTERS_PLAIN}")
    doc_ids, docs = read_reuters_plain(REUTERS_PLAIN)
elif USE_NLTK:
    print("Leyendo corpus Reuters desde NLTK...")
    doc_ids, docs = read_nltk_reuters()
else:
    print(f"Leyendo .txt desde: {DATA_DIR}")
    doc_ids, docs = read_txt_directory(DATA_DIR)

# doc_ids and docs are parallel lists, so len(docs) is the corpus size.
print(f"Documentos cargados: {len(docs):,}")

Leyendo archivo Reuters plain: ../data/reut2-1000.plain
Documentos cargados: 1,000


## Construcción de la Matriz TF-IDF

In [5]:
# Fit the vectorizer on the loaded corpus and vectorize every document in
# one pass; rows of X follow the order of `docs` (and therefore `doc_ids`).
vectorizer = build_vectorizer(NGRAM_MAX)
X = vectorizer.fit_transform(docs)

# NOTE(review): the message states the matrix is CSR; that comes from
# scikit-learn's output format and is not checked here.
print("Matriz TF-IDF (CSR) creada.")
print(sparsity_report(X))

Matriz TF-IDF (CSR) creada.
Docs: 1,000 | Vocab: 5,694 | NNZ: 57,512 | Densidad: 0.010100 (~1.0100%)


## Realizar Búsquedas

In [6]:
QUERY = "british jaguar sales"  # free-text query to rank the corpus against
TOP_K = 10  # maximum number of hits requested from search()

print(f'Consulta: "{QUERY}"')
results = search(QUERY, vectorizer, X, doc_ids, top_k=TOP_K)

if not results:
    print("No se encontraron documentos relevantes (score > 0).")
else:
    print(f"\nTop {len(results)} resultados:")
    for rank, (doc_id, score) in enumerate(results, 1):
        cats = ""
        if USE_NLTK:
            # Categories are best-effort decoration: a failed lookup (e.g. the
            # ids did not come from the NLTK corpus) must not hide the
            # ranking. Catch Exception rather than using a bare `except` so
            # that Ctrl-C and SystemExit still propagate.
            try:
                cats = ", ".join(nltk_reuters.categories(doc_id))
            except Exception:
                cats = ""
        cat_str = f"  cats=[{cats}]" if cats else ""
        print(f"{rank:>2}. {doc_id:<50}  score={score:.6f}{cat_str}")

Consulta: "british jaguar sales"

Top 10 resultados:
 1. reut-1218                                           score=0.337039
 2. reut-612                                            score=0.270653
 3. reut-957                                            score=0.267966
 4. reut-888                                            score=0.267966
 5. reut-1179                                           score=0.252651
 6. reut-670                                            score=0.227302
 7. reut-57                                             score=0.205724
 8. reut-932                                            score=0.162981
 9. reut-726                                            score=0.160041
10. reut-617                                            score=0.158207


## Mostrar Snippets de los Resultados

In [7]:
if results:
    try:
        # Map each document id to its raw text, and keep only query words
        # longer than one character as snippet anchors.
        id2text = dict(zip(doc_ids, docs))
        query_terms = [w for w in QUERY.lower().split() if len(w) > 1]

        print("\nSnippets:")
        for rank, (doc_id, score) in enumerate(results, 1):
            text = id2text[doc_id]
            low = text.lower()
            # Offsets of the first occurrence of every query term present.
            hits = [low.find(term) for term in query_terms if term in low]
            if hits:
                # Window around the earliest match: 50 chars before, 100 after.
                pos = min(hits)
                start = max(0, pos - 50)
                end = min(len(text), pos + 100)
                snippet = text[start:end]
            else:
                # No literal match in the text: fall back to the document head.
                pos = -1
                snippet = text[:120]
            snippet = snippet.replace("\n", " ")
            print(f"{rank:>2}. {doc_id:50s}")
            print(f"    ... {snippet} ...")
            print()
    except Exception as exc:
        print(f"Error al generar snippets: {exc}")


Snippets:
 1. reut-1218                                         
    ... JAGUAR JAGRY FEBRUARY U.S. SALES FALL     LEONIA N.J. March 3   Jaguar PLC's Jaguar Cars Inc U.S. su ...

 2. reut-612                                          
    ... SCOTTY'S SHB SALES UP FIVE PCT IN FEBRUARY     WINTER HAVEN Fla March 2   Scotty's Inc said sales for the four we ...

 3. reut-957                                          
    ...     TOKYO March 3   Japanese investor interest in British gilt edged securities is growing rapidly due to expectations sterling will remain stable des ...

 4. reut-888                                          
    ...     TOKYO March 3   Japanese investor interest in British gilt edged securities is growing rapidly due to expectations sterling will remain stable des ...

 5. reut-1179                                         
    ... WALGREEN WAG FEBRUARY SALES RISE     DEERFIELD Ill March 3   Walgreen Co said its sales in February rose 18.8 pct over sal ...

 6. reut-

## Exportar Resultados

In [8]:
output_file = "../results/mini_google.txt"

# Build the lookups here so this cell works even if the snippets cell was
# skipped or its conditional block never ran.
id2text = dict(zip(doc_ids, docs))
query_terms = [w for w in QUERY.lower().split() if len(w) > 1]

# open(..., "w") does not create missing parent directories.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

with open(output_file, "w", encoding="utf-8") as f:
    f.write(f"Consulta: {QUERY}\n")
    # Report the number of hits actually listed (search() drops zero-score
    # rows), matching the header printed by the search cell.
    f.write(f"Top {len(results)} resultados:\n")
    f.write("=" * 80 + "\n\n")

    for rank, (doc_id, score) in enumerate(results, 1):
        f.write(f"{rank}. {doc_id}  (score: {score:.6f})\n")

        # The snippet is optional: write the hit without one if its text is
        # unavailable, instead of hiding every error with a bare `except`.
        text = id2text.get(doc_id)
        if text is not None:
            low = text.lower()
            # Earliest position of any query term present in the document.
            pos = min((low.find(term) for term in query_terms if term in low), default=-1)
            if pos >= 0:
                start = max(0, pos - 50)
                end = min(len(text), pos + 150)
                snippet = text[start:end].replace("\n", " ")
            else:
                # No literal match: fall back to the document head.
                snippet = text[:150].replace("\n", " ")
            f.write(f"   ... {snippet} ...\n")
        f.write("\n")

print(f"✓ Resultados exportados a: {output_file}")

✓ Resultados exportados a: ../results/mini_google.txt


## Prueba con Diferentes Consultas

Puedes probar con diferentes consultas modificando la variable `QUERY` y volviendo a ejecutar las celdas de búsqueda.

### Ejemplos de consultas:
- `"british jaguar sales"`
- `"oil prices market"`
- `"computer technology software"`
- `"bank financial crisis"`

In [9]:
def quick_search(query_text: str, top_k: int = 10, max_snippets: int = 5) -> None:
    """Run a query against the global index and print the ranking and snippets.

    Relies on the module-level ``vectorizer``, ``X``, ``doc_ids`` and ``docs``.

    Args:
        query_text: Free-text query.
        top_k: Maximum number of ranked hits to request and print.
        max_snippets: How many of the best hits get a text snippet
            (defaults to 5; negative values are treated as 0).

    Returns:
        None. All output goes to stdout.
    """
    print(f'Consulta: "{query_text}"')
    print("=" * 80)

    results = search(query_text, vectorizer, X, doc_ids, top_k=top_k)

    if not results:
        print("No se encontraron documentos relevantes.")
        return

    for rank, (doc_id, score) in enumerate(results, 1):
        print(f"{rank:>2}. {doc_id:<50}  score={score:.6f}")

    print("\nSnippets:")
    id2text = dict(zip(doc_ids, docs))
    # Single-character words are not used as snippet anchors.
    query_terms = [w for w in query_text.lower().split() if len(w) > 1]

    # Clamp so a negative limit cannot turn into a "drop from the end" slice.
    shown = results[:max(0, max_snippets)]
    for rank, (doc_id, _score) in enumerate(shown, 1):
        text = id2text[doc_id]
        low = text.lower()
        # Earliest position of any query term present in the document.
        pos = min((low.find(term) for term in query_terms if term in low), default=-1)
        if pos >= 0:
            start = max(0, pos - 50)
            end = min(len(text), pos + 100)
            snippet = text[start:end].replace("\n", " ")
        else:
            # No literal match: fall back to the document head.
            snippet = text[:120].replace("\n", " ")
        print(f"{rank}. ... {snippet} ...")

# Demo run: rank the corpus against a sample query and print the top hits.
quick_search("oil prices market", top_k=10)

Consulta: "oil prices market"
 1. reut-248                                            score=0.439528
 2. reut-127                                            score=0.408124
 3. reut-352                                            score=0.403497
 4. reut-144                                            score=0.346958
 5. reut-543                                            score=0.296122
 6. reut-489                                            score=0.286141
 7. reut-502                                            score=0.281361
 8. reut-349                                            score=0.267038
 9. reut-242                                            score=0.249162
10. reut-668                                            score=0.234668

Snippets:
1. ...  TO OPEC PACT     BAHRAIN March 1   Saudi Arabian Oil Minister Hisham Nazer reiterated the kingdom's commitment to last December's OPEC accord to boos ...
2. ... DIAMOND SHAMROCK  DIA  CUTS CRUDE PRICES     NEW YORK FEB 26   Diamond Shamrock 