# 📚 Institutional Books Dataset Loader - FIXED VERSION

Load books from HuggingFace **institutional/institutional-books-1.0** dataset into BookLook database.

## ✅ FIXES APPLIED:
1. **Better cover image fetching** - Checks the dataset image URL with a HEAD request and a Content-Type check, then falls back to the Google Books API
2. **Auto-generate descriptions** - Creates descriptions from book content if missing
3. **Author linking** - Creates Author records and links them to books properly
4. **Book content loading** - Loads actual book pages from dataset's `text` field

## Workflow
1. Load data from dataset (streaming)
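2. Extract and normalize the fields of each record (records are processed one at a time; pandas is imported, but no DataFrame is built in this notebook)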
3. Create Author records
4. Create Book records with proper author linking
5. Create BookPage records from actual book content

## Field Mapping
Dataset → Database:
- `title_src` → `titre`
- `author_src` → Author record + `book_author_association` link
- `date1_src` → `date_publication`
- `page_count_src` → `nombre_pages` (`total_pages` is set to the number of BookPage records actually created)
- `language_gen` → `langue`
- `general_note_src` OR generated → `description`
- `text` → BookPage records (actual content!)
- `token_count_o200k_base_gen` → `word_count` (×0.75)
- `genre_or_form_src` + `topic_or_subject_gen` → `genre_names` (array)
- `identifiers_src.isbn` → `isbn` (a generated `INST-<index>` value is used when the dataset has no ISBN)

## Step 1: Install Dependencies

In [None]:
!pip install datasets huggingface-hub psycopg2-binary requests pandas -q

## Step 2: Import Libraries

In [None]:
import os
import json
import time
import re
from typing import List, Dict, Optional
from datetime import datetime

import pandas as pd
import psycopg2
from psycopg2.extras import execute_values
import requests
from datasets import load_dataset
from huggingface_hub import login

# Reaching this line means every import above resolved without raising.
print("✅ All imports successful!")

## Step 3: Configuration (⚠️ UPDATE YOUR TOKEN)

In [None]:
# ⚠️ UPDATE THIS: Your HuggingFace token.
# The HF_TOKEN environment variable takes precedence when it is set, so the
# secret does not have to be stored in the notebook itself.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN_HERE")

# Dataset configuration
DATASET_NAME = "institutional/institutional-books-1.0"
CHUNK_SIZE = 50  # Books per batch (reduced for better memory management)
MAX_CHUNKS = 10   # Set to None for all books
WORDS_PER_PAGE = 500  # Words per page when splitting content

# Database configuration.
# Each value can be overridden through the corresponding libpq-style
# environment variable; the literals are local-development defaults only and
# should not be relied on for a shared or production database.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': int(os.environ.get('PGPORT', 5432)),
    'database': os.environ.get('PGDATABASE', 'book_library'),
    'user': os.environ.get('PGUSER', 'bookuser'),
    'password': os.environ.get('PGPASSWORD', 'bookpass123'),
}

# Progress tracking (JSON file written next to the notebook)
PROGRESS_FILE = 'load_progress.json'

print(f"📚 Dataset: {DATASET_NAME}")
print(f"📦 Chunk size: {CHUNK_SIZE}")
print(f"🔢 Max chunks: {MAX_CHUNKS if MAX_CHUNKS else 'All'}")
print(f"📄 Words per page: {WORDS_PER_PAGE}")

## Step 4: Helper Functions

In [None]:
def fetch_cover_image(dataset_image_url: Optional[str], isbn: Optional[str], timeout: int = 5) -> Optional[str]:
    """Return a cover-image URL for a book, or ``None`` when none can be found.

    The dataset-provided URL is tried first: it is accepted when a HEAD request
    (following redirects) answers 200 with an ``image`` Content-Type. Otherwise
    the Google Books volumes API is queried by ISBN and the largest available
    entry of ``imageLinks`` is returned.

    Args:
        dataset_image_url: Candidate image URL from the dataset, if any.
        isbn: ISBN used for the Google Books lookup; hyphens and spaces are
            removed before querying.
        timeout: Per-request timeout in seconds.

    Returns:
        The image URL, or ``None`` if neither source produced one.
    """

    # Try dataset image URL first
    if dataset_image_url:
        try:
            response = requests.head(dataset_image_url, timeout=timeout, allow_redirects=True)
            if response.status_code == 200:
                content_type = response.headers.get('content-type', '')
                if 'image' in content_type.lower():
                    print("  ✓ Using dataset image")
                    return dataset_image_url
        except Exception as e:
            # Cover lookup is best-effort: report the problem and fall through
            # to the next source instead of failing the whole book.
            print(f"  ⚠ Dataset image failed: {e}")

    # Fallback to Google Books API
    if isbn:
        try:
            # Clean ISBN (remove hyphens and spaces)
            clean_isbn = str(isbn).replace('-', '').replace(' ', '')
            if clean_isbn:
                # Let requests build the query string so the value is always
                # URL-encoded, whatever characters the ISBN field contains.
                response = requests.get(
                    "https://www.googleapis.com/books/v1/volumes",
                    params={'q': f"isbn:{clean_isbn}"},
                    timeout=timeout,
                )
                if response.status_code == 200:
                    data = response.json()
                    # Do not trust totalItems alone: a response without an
                    # "items" list is simply treated as "no match".
                    items = data.get('items') or []
                    if items:
                        volume_info = items[0].get('volumeInfo', {})
                        image_links = volume_info.get('imageLinks', {})

                        # Try different image sizes, largest first
                        for size in ('large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
                            if size in image_links:
                                print(f"  ✓ Using Google Books image ({size})")
                                return image_links[size]
        except Exception as e:
            # Best-effort, as above: a failed lookup must not abort the load.
            print(f"  ⚠ Google Books API failed: {e}")

    print("  ✗ No cover image found")
    return None

print("✅ Cover fetching function defined")

In [None]:
def generate_description(text: Optional[str], max_words: int = 250) -> str:
    """Build a short description from the opening of a book's text.

    Empty or whitespace-only input yields a fixed placeholder. Text of at most
    ``max_words`` words is returned stripped but otherwise unchanged. Longer
    text is cut to its first ``max_words`` words (joined by single spaces) and
    then either trimmed back to the last sentence terminator, when that
    terminator lies beyond 70% of the excerpt, or suffixed with ``...``.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return "No description available."

    tokens = stripped.split()
    if len(tokens) <= max_words:
        return stripped

    excerpt = ' '.join(tokens[:max_words])

    # Position of the last '.', '!' or '?' in the excerpt (-1 when absent).
    cut = max(excerpt.rfind(mark) for mark in ('.', '!', '?'))

    # Only end on a sentence boundary if doing so keeps most of the excerpt.
    if cut > len(excerpt) * 0.7:
        return excerpt[:cut + 1]
    return excerpt + '...'

print("✅ Description generation function defined")

In [None]:
def split_text_into_pages(text: str, words_per_page: int = 500) -> List[str]:
    """Split book text into pages of at most ``words_per_page`` words.

    Words are whitespace-delimited; each page is its words re-joined with
    single spaces, so original line breaks and spacing are not preserved.

    Args:
        text: Full book text. ``None``, empty, or whitespace-only input
            produces no pages.
        words_per_page: Maximum number of words per page; must be >= 1.

    Returns:
        The pages in reading order (the last one may be shorter).

    Raises:
        ValueError: If ``words_per_page`` is less than 1. Without this check a
            negative value would silently yield no pages for non-empty text.
    """
    if words_per_page < 1:
        raise ValueError(f"words_per_page must be a positive integer, got {words_per_page}")

    if not text or not text.strip():
        return []

    words = text.split()
    return [
        ' '.join(words[start:start + words_per_page])
        for start in range(0, len(words), words_per_page)
    ]

print("✅ Page splitting function defined")

In [None]:
def parse_author_name(author_str: str) -> tuple:
    """Split an author string into a ``(first_name, last_name)`` pair.

    * Empty or whitespace-only input gives ``("Unknown", "Author")``.
    * ``"Last, First"`` is split on the first comma only.
    * Otherwise the string is split on whitespace: a single word becomes the
      last name, and with several words the first word is the first name and
      everything after it is the last name.
    """
    cleaned = author_str.strip() if author_str else ""
    if not cleaned:
        return ("Unknown", "Author")

    # "Last, First" format: anything after the first comma is the first name.
    if ',' in cleaned:
        family, _, given = cleaned.partition(',')
        return (given.strip(), family.strip())

    # "First Last" format.
    given, *remainder = cleaned.split()
    if not remainder:
        return ("", given)
    return (given, ' '.join(remainder))

print("✅ Author parsing function defined")

## Step 5: Database Connection

In [None]:
# Connect to database using the settings in DB_CONFIG. `conn` and `cursor`
# are module-level names that the later cells reuse.
try:
    conn = psycopg2.connect(**DB_CONFIG)
except Exception as e:
    print(f"❌ Database connection failed: {e}")
    raise

cursor = conn.cursor()
print("✅ Connected to database")

# Test query: confirms the `books` table is reachable. It is handled
# separately from the connection attempt so that a failure here is not
# reported as a connection problem, and so the connection is released instead
# of being left open when the cell is re-run.
try:
    cursor.execute("SELECT COUNT(*) FROM books")
    book_count = cursor.fetchone()[0]
    print(f"📚 Current books in database: {book_count}")
except Exception as e:
    print(f"❌ Database test query failed: {e}")
    cursor.close()
    conn.close()
    raise

## Step 6: Load Progress Tracking

In [None]:
def load_progress():
    """Load saved progress from PROGRESS_FILE, falling back to fresh defaults.

    Returns:
        A dict that always contains ``processed_books``, ``last_index`` and
        ``errors``. Values found in the file override the defaults, so a file
        written without one of these keys no longer causes a ``KeyError``
        when the key is read later.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON. This is
            deliberately not swallowed, so a damaged progress file is noticed
            rather than silently replaced.
    """
    progress = {'processed_books': 0, 'last_index': 0, 'errors': []}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            saved = json.load(f)
        # Ignore a file whose top-level value is not an object.
        if isinstance(saved, dict):
            progress.update(saved)
    return progress

def save_progress(progress):
    """Write ``progress`` to PROGRESS_FILE as JSON indented by two spaces."""
    with open(PROGRESS_FILE, 'w') as handle:
        handle.write(json.dumps(progress, indent=2))

# Restore the state saved by a previous run (or start from the defaults);
# the processing loop reads `last_index` from it to skip finished records.
progress = load_progress()
print(f"📊 Progress loaded: {progress['processed_books']} books processed")

## Step 7: Login to HuggingFace

In [None]:
# Authenticate against the HuggingFace Hub with the configured token.
# A failure is reported and then re-raised so the notebook stops here.
try:
    login(token=HF_TOKEN)
except Exception as login_error:
    print(f"❌ HuggingFace login failed: {login_error}")
    raise
else:
    print("✅ Logged in to HuggingFace")

## Step 8: Load Dataset and Process Books

In [None]:
# Load dataset in streaming mode
print(f"📥 Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, split='train', streaming=True)

# Counters:
#   books_processed  - books stored in the current chunk (reset every chunk)
#   chunks_processed - number of completed chunks
#   run_processed    - books stored during this run (used for the rate)
# progress['processed_books'] is the cumulative total across runs.
books_processed = 0
chunks_processed = 0
run_processed = 0
start_time = time.time()

# Records before this index were handled by a previous run.
resume_index = progress.get('last_index', 0)

try:
    for i, book_data in enumerate(dataset):
        # Skip already processed books
        if i < resume_index:
            continue

        try:
            print(f"\n{'='*60}")
            print(f"📖 Processing book {i+1}: {book_data.get('title_src', 'Unknown')}")

            # Extract book data. `or` is used instead of a .get() default
            # because a key that is present with a None value would otherwise
            # bypass the default and crash later on .strip()/.split()/.get().
            # NOTE(review): confirm the dataset really exposes a `text` column;
            # if it does not, no pages are created for any book.
            title = book_data.get('title_src') or 'Unknown Title'
            author_src = book_data.get('author_src') or 'Unknown Author'
            # NOTE(review): passed to `date_publication` unchanged - confirm
            # the value format matches the column type.
            date_pub = book_data.get('date1_src')
            page_count = book_data.get('page_count_src')
            language = book_data.get('language_gen') or 'en'
            general_note = book_data.get('general_note_src') or ''
            text_content = book_data.get('text') or ''
            token_count = book_data.get('token_count_o200k_base_gen') or 0
            genre_form = book_data.get('genre_or_form_src') or []
            topic_subject = book_data.get('topic_or_subject_gen') or []
            identifiers = book_data.get('identifiers_src') or {}

            # Extract ISBN: the field may be a single value or a list (possibly
            # empty); take the first non-empty entry.
            raw_isbn = identifiers.get('isbn') if isinstance(identifiers, dict) else None
            if isinstance(raw_isbn, (list, tuple)):
                raw_isbn = next((value for value in raw_isbn if value), None)
            isbn = str(raw_isbn).strip() if raw_isbn else ''
            has_real_isbn = bool(isbn)
            if not has_real_isbn:
                isbn = f"INST-{i:08d}"  # Generate unique ISBN

            # Check if book already exists
            cursor.execute("SELECT id FROM books WHERE isbn = %s", (isbn,))
            if cursor.fetchone():
                print(f"  ⏭️  Book already exists (ISBN: {isbn})")
                # The record is already stored, so a resumed run may skip it.
                progress['last_index'] = i + 1
                continue

            # Generate description if missing
            description = general_note if general_note and general_note.strip() else generate_description(text_content)
            print(f"  📝 Description: {description[:100]}...")

            # Fetch cover image. A synthetic INST- identifier can never match
            # on Google Books, so only a real ISBN is sent to the lookup.
            image_url = fetch_cover_image(None, isbn if has_real_isbn else None)

            # Calculate word count (token count x 0.75, else count the words)
            word_count = int(token_count * 0.75) if token_count else len(text_content.split())

            # Combine genres: all genre/form values plus at most two topics.
            # A bare string is treated as a one-element list.
            genres = []
            for values, limit in ((genre_form, None), (topic_subject, 2)):
                if isinstance(values, str):
                    values = [values]
                if isinstance(values, list):
                    genres.extend(values[:limit])
            # Unique, max 5 - dict.fromkeys keeps first-seen order, so the
            # stored array is deterministic (set() ordering is not).
            genres = list(dict.fromkeys(g for g in genres if g))[:5]

            # Parse author
            first_name, last_name = parse_author_name(author_src)
            author_full_name = f"{first_name} {last_name}".strip()

            # Create or get author
            cursor.execute(
                "SELECT id FROM authors WHERE nom = %s AND prenom = %s",
                (last_name, first_name)
            )
            author_result = cursor.fetchone()

            if author_result:
                author_id = author_result[0]
                print(f"  👤 Found existing author: {author_full_name}")
            else:
                cursor.execute(
                    "INSERT INTO authors (nom, prenom) VALUES (%s, %s) RETURNING id",
                    (last_name, first_name)
                )
                author_id = cursor.fetchone()[0]
                print(f"  👤 Created author: {author_full_name}")

            # Insert book (total_pages starts at 0 and is updated below once
            # the pages have been created)
            cursor.execute("""
                INSERT INTO books (
                    titre, isbn, date_publication, description, image_url,
                    nombre_pages, langue, author_names, genre_names,
                    word_count, total_pages, average_rating, review_count
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                title, isbn, date_pub, description, image_url,
                page_count, language, [author_full_name], genres,
                word_count, 0, 0, 0
            ))
            book_id = cursor.fetchone()[0]
            print(f"  📚 Created book (ID: {book_id})")

            # Link book to author
            cursor.execute(
                "INSERT INTO book_author_association (book_id, author_id) VALUES (%s, %s)",
                (book_id, author_id)
            )

            # Split and insert book pages
            if text_content and text_content.strip():
                pages = split_text_into_pages(text_content, WORDS_PER_PAGE)
                print(f"  📄 Creating {len(pages)} pages...")

                page_data = [
                    (book_id, page_num, page_content, len(page_content.split()))
                    for page_num, page_content in enumerate(pages, start=1)
                ]

                # Bulk insert pages
                execute_values(
                    cursor,
                    "INSERT INTO book_pages (book_id, page_number, content, word_count) VALUES %s",
                    page_data
                )

                # Update book's total_pages
                cursor.execute(
                    "UPDATE books SET total_pages = %s WHERE id = %s",
                    (len(pages), book_id)
                )
                print(f"  ✅ Added {len(pages)} pages")

            # Commit transaction (author, book, link and pages together)
            conn.commit()

            books_processed += 1
            run_processed += 1
            # Accumulate instead of overwriting with the per-chunk counter, so
            # the total survives chunk resets and resumed runs.
            progress['processed_books'] = progress.get('processed_books', 0) + 1
            progress['last_index'] = i + 1

            print("  ✅ Book processed successfully")

        except Exception as e:
            # One bad record must not stop the load: record it, undo the
            # partial transaction and move on to the next book.
            print(f"  ❌ Error processing book: {e}")
            progress.setdefault('errors', []).append(
                {'index': i, 'title': book_data.get('title_src'), 'error': str(e)}
            )
            conn.rollback()
            continue

        # Save progress every 10 books
        if run_processed % 10 == 0:
            save_progress(progress)
            elapsed = time.time() - start_time
            rate = run_processed / elapsed if elapsed > 0 else 0
            print(f"\n📊 Progress: {run_processed} books | {rate:.2f} books/sec")

        # Check if we've reached the chunk limit
        if books_processed >= CHUNK_SIZE:
            chunks_processed += 1
            if MAX_CHUNKS and chunks_processed >= MAX_CHUNKS:
                print(f"\n🛑 Reached maximum chunks ({MAX_CHUNKS})")
                break
            books_processed = 0
finally:
    # Runs on normal completion and on interruption alike, so progress is
    # never lost and the connection is always released.
    # Final save
    save_progress(progress)

    # Close database connection
    cursor.close()
    conn.close()

print(f"\n{'='*60}")
print("✅ Processing complete!")
print(f"📚 Total books processed: {progress.get('processed_books', 0)}")
print(f"❌ Errors: {len(progress.get('errors', []))}")
print(f"⏱️  Total time: {time.time() - start_time:.2f} seconds")