# BookLook Data Loader & Enrichment

This notebook:
1. Loads books from the Hugging Face `institutional/institutional-books-1.0` dataset
2. Enriches missing data (covers, authors, genres, descriptions)
3. Generates realistic-looking synthetic reviews
4. Loads everything into a PostgreSQL database

**Requirements:**
```bash
pip install datasets psycopg2-binary requests faker python-dotenv
```

In [None]:
# Install required packages
!pip install datasets psycopg2-binary requests faker python-dotenv -q

In [None]:
import os
import random
import requests
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import psycopg2
from psycopg2.extras import execute_batch
from datasets import load_dataset
from faker import Faker
import time

# Shared Faker instance; the description and review generators below draw
# their filler text from it.
fake = Faker()

print("‚úÖ All imports successful!")

## Configuration

In [None]:
# Database configuration.
# Every setting can be overridden through an environment variable so that real
# credentials never have to be written into the notebook. The fallbacks are
# the local development values this notebook has always used.
DB_CONFIG = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'port': int(os.getenv('DB_PORT', '5432')),  # psycopg2 accepts an int port
    'database': os.getenv('DB_NAME', 'book_library'),
    'user': os.getenv('DB_USER', 'bookuser'),
    'password': os.getenv('DB_PASSWORD', 'bookpass123'),
}

# Dataset configuration
DATASET_NAME = "institutional/institutional-books-1.0"
MAX_BOOKS = 1000  # Limit for testing, set to None for all books

# Enrichment configuration
REVIEWS_PER_BOOK = 3
BATCH_SIZE = 100

print(f"üìö Will load up to {MAX_BOOKS} books from {DATASET_NAME}")

## Helper Functions

In [None]:
import re

# Genre mapping for books: genre name -> keywords that signal that genre.
GENRE_KEYWORDS = {
    'Fiction': ['novel', 'story', 'fiction', 'tale', 'narrative'],
    'Science': ['science', 'physics', 'chemistry', 'biology', 'mathematics', 'astronomy'],
    'History': ['history', 'historical', 'war', 'ancient', 'medieval'],
    'Philosophy': ['philosophy', 'philosophical', 'ethics', 'logic', 'metaphysics'],
    'Poetry': ['poetry', 'poems', 'verse', 'sonnet'],
    'Drama': ['drama', 'play', 'theatre', 'tragedy', 'comedy'],
    'Biography': ['biography', 'autobiography', 'memoir', 'life'],
    'Religion': ['religion', 'religious', 'theology', 'spiritual', 'bible', 'god'],
    'Travel': ['travel', 'journey', 'voyage', 'adventure', 'exploration'],
    'Art': ['art', 'painting', 'sculpture', 'architecture', 'music'],
}

# Genres to pick from when no keyword matches.
_DEFAULT_GENRES = ['Fiction', 'Literature', 'Classic']


def _compile_keyword_pattern(keywords: List[str]) -> "re.Pattern":
    """Build a regex matching any keyword as a whole word (optional plural 's')."""
    alternatives = "|".join(re.escape(keyword) for keyword in keywords)
    return re.compile(rf"\b(?:{alternatives})s?\b")


# One compiled pattern per genre, in the same order as GENRE_KEYWORDS.
_GENRE_PATTERNS = {
    genre: _compile_keyword_pattern(keywords)
    for genre, keywords in GENRE_KEYWORDS.items()
}


def infer_genres(title: str, author: str = "") -> List[str]:
    """Infer genres based on title and author keywords.

    Keywords are matched as whole words (allowing a trailing plural "s")
    rather than as raw substrings, so "history" no longer counts as "story"
    and names such as "Arthur" or "Edward" no longer trigger "art" / "war".

    Args:
        title: Book title; ``None`` is treated as an empty string.
        author: Author name; ``None`` is treated as an empty string.

    Returns:
        At most two genre names, in ``GENRE_KEYWORDS`` order. When nothing
        matches, a single genre chosen at random from a small default list.
    """
    text = f"{title or ''} {author or ''}".lower()

    matched_genres = [
        genre for genre, pattern in _GENRE_PATTERNS.items() if pattern.search(text)
    ]

    # Default genres if none matched
    if not matched_genres:
        matched_genres = [random.choice(_DEFAULT_GENRES)]

    return matched_genres[:2]  # Max 2 genres per book

def get_book_cover_url(title: str, author: str = "") -> Optional[str]:
    """Get a book cover URL from the Open Library search API.

    The lookup is best effort: on any network failure, non-200 response,
    malformed payload, or missing cover, a placeholder image URL is returned
    instead of raising.

    Args:
        title: Book title; ``None`` is treated as an empty string.
        author: Author name, appended to the search query when given.

    Returns:
        The Open Library large-cover URL for the first search hit, or a
        placeholder image URL whose caption is the first 20 characters of
        the title.
    """
    # Local import: the notebook's import cell does not pull in urllib.
    from urllib.parse import quote_plus

    title = title or ""
    # The caption is URL-encoded so titles containing '&', '#', spaces, etc.
    # cannot break the placeholder URL.
    placeholder_url = (
        "https://via.placeholder.com/400x600/4A5568/FFFFFF"
        f"?text={quote_plus(title[:20])}"
    )

    search_query = f"{title} {author or ''}".strip()
    if not search_query:
        # Nothing to search for; skip the network round trip entirely.
        return placeholder_url

    try:
        # Passing the query through `params` lets requests URL-encode it.
        response = requests.get(
            "https://openlibrary.org/search.json",
            params={'q': search_query, 'limit': 1},
            timeout=5,
        )

        if response.status_code == 200:
            data = response.json()
            docs = data.get('docs') if isinstance(data, dict) else None
            if isinstance(docs, list) and docs and isinstance(docs[0], dict):
                cover_id = docs[0].get('cover_i')
                if cover_id is not None:
                    return f"https://covers.openlibrary.org/b/id/{cover_id}-L.jpg"
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        print(f"  Cover lookup failed for '{title[:50]}': {exc}")

    # Fallback to placeholder
    return placeholder_url

def generate_description(title: str, author: str, genres: List[str]) -> str:
    """Build a filler description for a book.

    Three candidate blurbs are composed from the title, the author, the
    lower-cased first genre and Faker-generated text; one of them is then
    picked at random.

    Args:
        title: Book title to mention in the blurb.
        author: Author name to mention in the blurb.
        genres: Genre names; only the first entry is used, so the list must
            not be empty.

    Returns:
        The chosen description string.
    """
    primary_genre = genres[0].lower()
    candidates = []

    # Faker is called in the same order for every candidate (phrase first,
    # then body text) so the random stream is consumed consistently.
    theme = fake.catch_phrase().lower()
    body = fake.text(max_nb_chars=150)
    candidates.append(
        f"A {primary_genre} masterpiece by {author}, '{title}' explores {theme}. {body}"
    )

    subject = fake.bs()
    body = fake.text(max_nb_chars=150)
    candidates.append(
        f"In this compelling {primary_genre} work, {author} presents {title}, a profound examination of {subject}. {body}"
    )

    theme = fake.catch_phrase().lower()
    body = fake.text(max_nb_chars=150)
    candidates.append(
        f"{title} by {author} is a {primary_genre} classic that {theme}. {body}"
    )

    return random.choice(candidates)

def generate_review(book_title: str, rating: int) -> Dict:
    """Build a filler review whose tone follows the rating.

    Ratings of 4 and above get a positive text, a rating of 3 gets a neutral
    one, and anything lower gets a negative one.

    Args:
        book_title: Title that may be quoted inside the review text.
        rating: Numeric rating that selects the tone.

    Returns:
        A dict with ``rating``, ``review_text`` and ``created_at``; the
        timestamp lies a random 1-365 days before the current time.
    """
    def filler(limit: int = 100) -> str:
        # Thin wrapper so every candidate reads the same way.
        return fake.text(max_nb_chars=limit)

    # Every candidate is generated up front, positive -> neutral -> negative.
    candidates_by_tone = {
        'positive': [
            f"Absolutely loved {book_title}! {filler()}",
            f"A masterpiece! {filler(120)}",
            f"Highly recommend this book. {filler()}",
        ],
        'neutral': [
            f"Decent read. {filler()}",
            f"It was okay. {filler()}",
        ],
        'negative': [
            f"Not what I expected. {filler()}",
            f"Could have been better. {filler()}",
        ],
    }

    if rating >= 4:
        tone = 'positive'
    elif rating >= 3:
        tone = 'neutral'
    else:
        tone = 'negative'
    review_text = random.choice(candidates_by_tone[tone])

    age = timedelta(days=random.randint(1, 365))
    return {
        'rating': rating,
        'review_text': review_text,
        'created_at': datetime.now() - age,
    }

print("‚úÖ Helper functions defined")

## Load Dataset from HuggingFace

In [None]:
print(f"üì• Loading dataset: {DATASET_NAME}...")
# Streaming avoids downloading the whole dataset before the first record.
dataset = load_dataset(DATASET_NAME, split='train', streaming=True)

# Materialise at most MAX_BOOKS records (a falsy MAX_BOOKS means no limit).
books_data = []
for index, record in enumerate(dataset):
    limit_reached = bool(MAX_BOOKS) and index >= MAX_BOOKS
    if limit_reached:
        break

    books_data.append(record)

    loaded_so_far = index + 1
    if loaded_so_far % 100 == 0:
        print(f"  Loaded {loaded_so_far} books...")

print(f"‚úÖ Loaded {len(books_data)} books from dataset")

## Enrich Book Data

In [None]:
# Imported here because the notebook's import cell does not pull in urllib.
from urllib.parse import quote_plus

print("üîß Enriching book data...")

enriched_books = []

for i, book in enumerate(books_data):
    # Extract basic info.
    # dict.get(key, default) only falls back when the key is ABSENT; a record
    # carrying an explicit None (or empty string) would otherwise reach the
    # slicing below and raise, so `or` is used to cover those cases as well.
    title = book.get('title') or f'Unknown Title {i}'
    author = book.get('author', 'Unknown Author')

    # Clean author name
    if not author or author.strip() == '':
        author = 'Anonymous'

    # Infer genres
    genres = infer_genres(title, author)

    # Get cover (with rate limiting)
    if i % 10 == 0:  # Only fetch cover for every 10th book to avoid rate limits
        cover_url = get_book_cover_url(title, author)
        time.sleep(0.5)  # Rate limiting
    else:
        # quote_plus turns spaces into '+' and also escapes '&', '#', etc.,
        # which would otherwise truncate or corrupt the caption parameter.
        cover_url = f"https://via.placeholder.com/400x600/4A5568/FFFFFF?text={quote_plus(title[:20])}"

    # Generate description
    description = generate_description(title, author, genres)

    # Generate reviews (ratings are drawn from 3-5 only)
    reviews = [generate_review(title, random.randint(3, 5)) for _ in range(REVIEWS_PER_BOOK)]

    enriched_books.append({
        'title': title,
        'author': author,
        'isbn': book.get('isbn', None),
        'date_publication': book.get('publication_date') or '1900-01-01',
        'description': description,
        'image_url': cover_url,
        'genres': genres,
        'reviews': reviews,
        'content': (book.get('content') or '')[:5000]  # Limit content size
    })

    if (i + 1) % 50 == 0:
        print(f"  Enriched {i + 1}/{len(books_data)} books...")

print(f"‚úÖ Enriched {len(enriched_books)} books")

## Connect to Database

In [None]:
print("üîå Connecting to database...")

# Open one connection and one cursor; the cells below reuse both and the
# cleanup cell at the end closes them.
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()

print("‚úÖ Connected to PostgreSQL database")

## Load Data to Database

In [None]:
print("üíæ Loading data to database...")

# All generated reviews are attributed to a single demo account.
# The insert returns a row only when the account was actually created.
cur.execute("""
    INSERT INTO users (email, password_hash, first_name, last_name, is_admin)
    VALUES ('demo@booklook.com', 'hashed_password', 'Demo', 'User', false)
    ON CONFLICT (email) DO NOTHING
    RETURNING id
""")
user_row = cur.fetchone()
if not user_row:
    # The account already existed, so look its id up instead.
    cur.execute("SELECT id FROM users WHERE email = 'demo@booklook.com'")
    user_row = cur.fetchone()
demo_user_id = user_row[0]

conn.commit()
print(f"  Demo user ID: {demo_user_id}")

# Track genre and author IDs already written in this run (name -> id)
genre_cache = {}
author_cache = {}

loaded_count = 0

for book_data in enriched_books:
    # IDs first obtained while loading THIS book. They are merged into the
    # shared caches only once the book succeeds, so a rolled-back insert can
    # never leave a stale id behind.
    new_authors = {}
    new_genres = {}

    # A savepoint scopes failures to the current book. A plain
    # conn.rollback() would also discard every earlier book since the last
    # commit, even though those were already counted as loaded.
    cur.execute("SAVEPOINT book_load")

    try:
        # Insert book
        cur.execute("""
            INSERT INTO books (titre, isbn, date_publication, description, image_url)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (titre) DO NOTHING
            RETURNING id
        """, (
            book_data['title'],
            book_data['isbn'],
            book_data['date_publication'],
            book_data['description'],
            book_data['image_url']
        ))

        result = cur.fetchone()
        if not result:
            # Book already exists: nothing was written for it.
            cur.execute("RELEASE SAVEPOINT book_load")
            continue

        book_id = result[0]

        # Insert or get author. The no-op DO UPDATE makes RETURNING yield the
        # id even when the author row already exists.
        author_name = book_data['author']
        author_id = author_cache.get(author_name)
        if author_id is None:
            cur.execute("""
                INSERT INTO authors (nom, prenom)
                VALUES (%s, %s)
                ON CONFLICT (nom, prenom) DO UPDATE SET nom = EXCLUDED.nom
                RETURNING id
            """, (author_name, ''))
            author_id = cur.fetchone()[0]
            new_authors[author_name] = author_id

        # Link book to author
        cur.execute("""
            INSERT INTO book_authors (book_id, author_id)
            VALUES (%s, %s)
            ON CONFLICT DO NOTHING
        """, (book_id, author_id))

        # Insert genres and link to book
        for genre_name in book_data['genres']:
            genre_id = genre_cache.get(genre_name, new_genres.get(genre_name))
            if genre_id is None:
                cur.execute("""
                    INSERT INTO genres (nom)
                    VALUES (%s)
                    ON CONFLICT (nom) DO UPDATE SET nom = EXCLUDED.nom
                    RETURNING id
                """, (genre_name,))
                genre_id = cur.fetchone()[0]
                new_genres[genre_name] = genre_id

            cur.execute("""
                INSERT INTO book_genres (book_id, genre_id)
                VALUES (%s, %s)
                ON CONFLICT DO NOTHING
            """, (book_id, genre_id))

        # Insert reviews
        for review in book_data['reviews']:
            cur.execute("""
                INSERT INTO reviews (book_id, user_id, rating, review_text, created_at)
                VALUES (%s, %s, %s, %s, %s)
            """, (
                book_id,
                demo_user_id,
                review['rating'],
                review['review_text'],
                review['created_at']
            ))

        cur.execute("RELEASE SAVEPOINT book_load")

    except Exception as e:
        print(f"  ‚ö†Ô∏è  Error loading book '{book_data['title']}': {e}")
        # Undo only this book; earlier uncommitted books stay in the
        # transaction and the connection becomes usable again.
        cur.execute("ROLLBACK TO SAVEPOINT book_load")
        continue

    # The book is fully written: its new ids are now safe to reuse.
    author_cache.update(new_authors)
    genre_cache.update(new_genres)
    loaded_count += 1

    # Commit in chunks. This sits outside the try block so a commit failure
    # is not mistaken for a per-book error.
    if loaded_count % 50 == 0:
        conn.commit()
        print(f"  Loaded {loaded_count} books...")

conn.commit()
print(f"‚úÖ Successfully loaded {loaded_count} books to database!")

## Verify Data

In [None]:
def _fetch_scalar(query):
    """Run *query* on the shared cursor and return the first column of its first row."""
    cur.execute(query)
    return cur.fetchone()[0]


print("üîç Verifying loaded data...\n")

# Row counts for each table the loader writes to
book_count = _fetch_scalar("SELECT COUNT(*) FROM books")
print(f"üìö Total books: {book_count}")

author_count = _fetch_scalar("SELECT COUNT(*) FROM authors")
print(f"‚úçÔ∏è  Total authors: {author_count}")

genre_count = _fetch_scalar("SELECT COUNT(*) FROM genres")
print(f"üè∑Ô∏è  Total genres: {genre_count}")

review_count = _fetch_scalar("SELECT COUNT(*) FROM reviews")
print(f"‚≠ê Total reviews: {review_count}")

# A handful of books that have an image URL, as a quick eyeball check
cur.execute("""
    SELECT titre, image_url 
    FROM books 
    WHERE image_url IS NOT NULL 
    LIMIT 5
""")
print("\nüì∏ Sample books with covers:")
for sample_title, sample_url in cur.fetchall():
    print(f"  - {sample_title[:50]}...")
    print(f"    {sample_url}")

print("\n‚úÖ Data verification complete!")

## Cleanup

In [None]:
# Release the cursor first, then the connection it belongs to.
cur.close()
conn.close()
print("‚úÖ Database connection closed")
print("\nüéâ All done! Your database is now populated with enriched book data!")