In [None]:
import kagglehub
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Download latest version
path = kagglehub.dataset_download("dylanjcastillo/7k-books-with-metadata")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\santy\.cache\kagglehub\datasets\dylanjcastillo\7k-books-with-metadata\versions\3


In [150]:
df = pd.read_csv(f"{path}/books.csv")
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [154]:
# Eliminar filas con información clave faltante
df.dropna(subset=['title', 'authors', 'description', 'categories'], inplace=True)

# Rellenar valores faltantes restantes si es necesario
df['published_year'] = df['published_year'].fillna(0)
df['average_rating'] = df['average_rating'].fillna(df['average_rating'].mean())
df['num_pages'] = df['num_pages'].fillna(df['num_pages'].median())
df = df.drop(columns=['isbn13', 'isbn10','subtitle'])

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6446 entries, 0 to 6809
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6446 non-null   object 
 1   authors         6446 non-null   object 
 2   categories      6446 non-null   object 
 3   thumbnail       6247 non-null   object 
 4   description     6446 non-null   object 
 5   published_year  6446 non-null   float64
 6   average_rating  6446 non-null   float64
 7   num_pages       6446 non-null   float64
 8   ratings_count   6412 non-null   float64
dtypes: float64(4), object(5)
memory usage: 503.6+ KB


In [7]:
# Mantener solo filas con la información necesaria
df.dropna(subset=['title', 'authors', 'description', 'categories', 'published_year'], inplace=True)

In [None]:
# Cargar modelo y tokenizer
flan_model_name = "google/flan-t5-base"
flan_tokenizer = AutoTokenizer.from_pretrained(flan_model_name)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(flan_model_name)
flan_model.eval()

# Dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
flan_model.to(device)

In [None]:
# Función para generar embeddings
def get_embedding(text):
    inputs = flan_tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        encoder_output = flan_model.encoder(**inputs).last_hidden_state
        embedding = encoder_output.mean(dim=1).squeeze()
    return embedding.cpu().numpy()

In [None]:
# Crear representaciones textuales (si no lo has hecho ya)
def build_textual_representations(df):
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Creando representaciones"):
        text = f"""Title: {row.get('title', '')}
Authors: {row.get('authors', '')}
Description: {row.get('description', '')}
Categories: {row.get('categories', '')}
Publishing Year: {row.get('published_year', '')}
Average Rating: {row.get('average_rating', '')}
Number of Pages: {row.get('num_pages', '')}"""
        results.append(text)
    df['textual_representation'] = results
    return df

# Aplicar a tu DataFrame
df = build_textual_representations(df)

In [None]:
# Generar embeddings
print("Generando embeddings...")
embeddings = []
for text in tqdm(df['textual_representation'], desc="Generando embeddings"):
    try:
        embeddings.append(get_embedding(text))
    except Exception as e:
        print(f"❌ Error con el texto: {text[:100]} — {e}")
        embeddings.append(np.zeros(flan_model.config.d_model))

X = np.vstack(embeddings)

In [123]:
from sklearn.metrics.pairwise import cosine_similarity

# Función para obtener índice por nombre
def get_book_index_by_name(book_name, df):
    # Filtramos los libros que contienen el nombre proporcionado
    matched_books = df[df['title'].str.contains(book_name, case=False, na=False)]
    
    # Si no se encuentran libros
    if len(matched_books) == 0:
        print("No se encontró un libro con ese nombre.")
        return None
    
    # Si solo se encuentra un libro
    elif len(matched_books) == 1:
        print(f"Se encontró un solo libro: {matched_books['title'].values[0]}")
        return matched_books.index[0]
    
    # Si hay múltiples coincidencias, mostramos la lista y pedimos al usuario que elija
    else:
        print(f"Se encontraron múltiples libros con nombres similares a '{book_name}':")
        for i, title in enumerate(matched_books['title'].values):
            print(f"{i+1}. {title}")
        
        # Pedimos al usuario que elija un libro
        try:
            choice = int(input("Por favor, ingresa el número del libro que deseas (1, 2, 3, ...): "))
            if choice < 1 or choice > len(matched_books):
                print("Opción no válida.")
                return None
            # Retornamos el índice del libro seleccionado
            return matched_books.index[choice - 1]
        except ValueError:
            print("Opción no válida. Debes ingresar un número.")
            return None


# Función de recomendación
def recommend_books(book_name, df, X, top_n=5, min_similarity=0.75):
    # Obtener índice del libro basado en el nombre
    book_index = get_book_index_by_name(book_name, df)
    if book_index is None:
        return

    # Obtener las características del libro en base al índice
    book_embedding = X[book_index].reshape(1, -1)
    
    # Calcular las similitudes con todos los otros libros
    similarities = cosine_similarity(book_embedding, X)[0]

    # Ordenar los libros por similitud (más altos primero)
    sorted_indices = similarities.argsort()[::-1]

    recommendations = []
    for idx in sorted_indices:
        if idx == book_index:
            continue  # Saltar el libro que es igual al solicitado

        sim_score = similarities[idx]
        book = df.iloc[idx]

        # Si la similitud es mayor que el umbral mínimo, agregamos a las recomendaciones
        if sim_score >= min_similarity:
            recommendations.append({
                "title": book['title'],
                "authors": book['authors'],
                "published_year": book['published_year'],
                "num_pages": book['num_pages'],
                "category": book['categories'],
                "description": book['description'][:300],  # Descripción recortada
                "similarity": sim_score
            })
        
        if len(recommendations) >= top_n:
            break

    # Mostrar las recomendaciones
    if recommendations:
        print(f"\n📚 Recomendaciones para: {book_name}")
        print("="*80)
        for rec in recommendations:
            print(f"📚 Similitud: {rec['similarity']:.4f}")
            print(f"Title: {rec['title']}")
            print(f"Authors: {rec['authors']}")
            print(f"Published Year: {rec['published_year']}")
            print(f"Number of Pages: {rec['num_pages']}")
            print(f"Category: {rec['category']}")
            print(f"Description: {rec['description']}...")
            print("="*80)
    else:
        print("No se encontraron recomendaciones suficientemente similares.")


In [157]:
recommend_books("Treasure", df, X)

Se encontraron múltiples libros con nombres similares a 'Treasure':
1. Treasure Box
2. Five on a Treasure Island
3. Mayflower Treasure Hunt
4. Dragon's Treasure
5. Encyclopedia Brown and the Case of the Treasure Hunt
6. Treasure Island
7. Treasure Island
8. The Treasure Principle

📚 Recomendaciones para: Treasure
📚 Similitud: 0.9893
Title: The Left Hand Dreams of Him
Authors: Satoru Kannagi
Published Year: 2006.0
Number of Pages: 242.0
Category: Comics & Graphic Novels
Description: The growing romance between high school students Yuichi and Wataru is endangered by scholastic stress and well-meant deceptions....
📚 Similitud: 0.9573
Title: The Mouse and His Child
Authors: Russell Hoban;David Small
Published Year: 2001.0
Number of Pages: 244.0
Category: Juvenile Fiction
Description: Two discarded toy mice survive perilous adventures in a hostile world before finding security and happiness with old friends and new....
📚 Similitud: 0.9566
Title: Pippi Goes on Board
Authors: Astrid Lindgren;