# 📚 Institutional Books Dataset Loader

Load books from the HuggingFace **institutional/institutional-books-1.0** dataset into the BookLook database.

## Features
- ✅ Streaming mode (memory efficient)
- ✅ Chunked loading (100 books per batch)
- ✅ Resume capability
- ✅ ISBN lookup (dataset → Google Books → Open Library → generated)
- ✅ Cover image fetching
- ✅ Transaction rollback on errors

## Instructions
1. Run cells in order
2. Update HuggingFace token in Configuration cell
3. Adjust MAX_CHUNKS for testing (10 = 1000 books)

## Step 1: Install Dependencies

In [None]:
!pip install datasets huggingface-hub psycopg2-binary requests python-dotenv -q

## Step 2: Import Libraries

In [None]:
import os
import json
import time
import re
from typing import List, Dict, Optional
from datetime import datetime

import psycopg2
from psycopg2 import sql
import requests
from datasets import load_dataset
from huggingface_hub import login

# Printed only if every import statement above completed without raising.
print("‚úÖ All imports successful!")

## Step 3: Configuration (⚠️ UPDATE YOUR TOKEN HERE)

In [None]:
# Secrets and connection settings can be supplied through environment
# variables so real credentials never have to be saved inside the notebook.
# Every variable is optional: when one is unset, the literal default written
# here is used, so editing the values in this cell keeps working.
# NOTE: an environment variable, when set, takes precedence over the literal.

# UPDATE THIS: your HuggingFace token (or export HF_TOKEN before starting)
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN_HERE")

# Dataset configuration
DATASET_NAME = "institutional/institutional-books-1.0"
CHUNK_SIZE = 100  # Books per batch
MAX_CHUNKS = 10   # Set to None for all books (10 = 1000 books for testing)

# Database configuration (overridable with BOOKLOOK_DB_* environment variables)
DB_CONFIG = {
    'host': os.environ.get('BOOKLOOK_DB_HOST', 'localhost'),
    # Environment values are strings; the port is converted to int so the
    # mapping has the same value types as the literal defaults.
    'port': int(os.environ.get('BOOKLOOK_DB_PORT', '5432')),
    'database': os.environ.get('BOOKLOOK_DB_NAME', 'book_library'),
    'user': os.environ.get('BOOKLOOK_DB_USER', 'bookuser'),
    'password': os.environ.get('BOOKLOOK_DB_PASSWORD', 'bookpass123'),
}

# API configuration
API_DELAY = 0.5  # Seconds between API calls
GOOGLE_BOOKS_API_KEY = os.environ.get('GOOGLE_BOOKS_API_KEY', '')  # Optional
PROGRESS_FILE = 'load_progress.json'

# Echo the effective settings; credentials are deliberately not printed.
print(f"üìö Dataset: {DATASET_NAME}")
print(f"üì¶ Chunk size: {CHUNK_SIZE}")
print(f"üî¢ Max chunks: {MAX_CHUNKS if MAX_CHUNKS else 'All'}")
print(f"üîå Database: {DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}")

## Step 4: Helper Functions

In [None]:
def extract_isbn_from_identifiers(identifiers: Dict) -> Optional[str]:
    """Return the first well-formed ISBN stored under ``identifiers['isbn']``.

    Args:
        identifiers: Mapping that may hold an ``'isbn'`` entry, either a list
            of candidate values or a single string. ``None`` or an empty
            mapping is accepted.

    Returns:
        The first candidate that, after removing hyphens and spaces, is
        either 13 digits or 9 digits followed by a digit or ``X`` (upper-cased
        in the result); ``None`` when no candidate qualifies.
    """
    if not identifiers or 'isbn' not in identifiers:
        return None
    candidates = identifiers['isbn']
    if isinstance(candidates, str):
        # A lone string is treated like a one-element list.
        candidates = [candidates]
    if not isinstance(candidates, (list, tuple)):
        return None
    # Check every candidate, not just the first: a malformed leading entry
    # must not hide a usable ISBN that follows it.
    for candidate in candidates:
        isbn = str(candidate).strip().replace('-', '').replace(' ', '').upper()
        if re.fullmatch(r'\d{13}|\d{9}[\dX]', isbn):
            return isbn
    return None

def search_isbn_google_books(title: str, author: str) -> Optional[str]:
    """Look up an ISBN for a title/author pair via the Google Books volumes API.

    Args:
        title: Book title; when empty no request is made.
        author: Author name; appended to the query when non-empty.

    Returns:
        The ISBN-13 of the first search hit when present, otherwise its
        ISBN-10, otherwise ``None`` (also on any network, HTTP or parsing
        failure: the lookup is best-effort).
    """
    if not title:
        return None
    query = f"{title} {author}" if author else title
    params = {'q': query, 'maxResults': 1}
    if GOOGLE_BOOKS_API_KEY:
        params['key'] = GOOGLE_BOOKS_API_KEY
    try:
        response = requests.get(
            "https://www.googleapis.com/books/v1/volumes", params=params, timeout=5
        )
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # programming errors are allowed to propagate.
        return None
    # Throttle after every completed request, hit or miss, so successful
    # lookups cannot fire back-to-back without the configured delay.
    time.sleep(API_DELAY)
    if response.status_code != 200:
        return None
    try:
        items = response.json().get('items') or []
        if not items:
            return None
        identifiers = items[0].get('volumeInfo', {}).get('industryIdentifiers', [])
        # Prefer ISBN-13; fall back to ISBN-10 only when no ISBN-13 is listed.
        for wanted in ('ISBN_13', 'ISBN_10'):
            for identifier in identifiers:
                if identifier.get('type') == wanted:
                    return identifier.get('identifier')
    except (ValueError, AttributeError, TypeError, KeyError, IndexError):
        # Malformed or unexpectedly shaped JSON counts as "not found".
        return None
    return None

def search_isbn_open_library(title: str, author: str) -> Optional[str]:
    """Look up an ISBN for a title/author pair via the Open Library search API.

    Args:
        title: Book title; when empty no request is made.
        author: Author name; appended to the query when non-empty.

    Returns:
        From the first search hit: the first ISBN that is 13 characters long
        once hyphens are removed, otherwise the first ISBN listed; ``None``
        when nothing is found or on any network, HTTP or parsing failure
        (the lookup is best-effort).
    """
    if not title:
        return None
    query = f"{title} {author}" if author else title
    try:
        response = requests.get(
            "https://openlibrary.org/search.json",
            params={'q': query, 'limit': 1},
            timeout=5,
        )
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt and
        # programming errors are allowed to propagate.
        return None
    # Throttle after every completed request, hit or miss, so successful
    # lookups cannot fire back-to-back without the configured delay.
    time.sleep(API_DELAY)
    if response.status_code != 200:
        return None
    try:
        docs = response.json().get('docs') or []
        raw_isbns = docs[0].get('isbn') if docs else None
        if not raw_isbns:
            return None
        cleaned = [str(value).strip().replace('-', '') for value in raw_isbns]
    except (ValueError, AttributeError, TypeError, KeyError, IndexError):
        # Malformed or unexpectedly shaped JSON counts as "not found".
        return None
    # Prefer a 13-character ISBN; otherwise settle for the first one listed.
    return next((isbn for isbn in cleaned if len(isbn) == 13), cleaned[0])

def generate_isbn_from_barcode(barcode: str) -> str:
    """Derive a stable 13-digit placeholder ISBN from a barcode.

    The result is ``"999"`` followed by ten digits taken from a SHA-256
    digest of the barcode. A cryptographic digest is used instead of the
    built-in ``hash()`` because string hashes are salted per interpreter
    process: the same barcode would otherwise map to a different placeholder
    on every run, defeating duplicate detection when a load is resumed.

    Args:
        barcode: Source barcode; ``None`` is treated as an empty string.

    Returns:
        A 13-character, all-digit string starting with ``999``.
    """
    import hashlib  # local import: not among the notebook's top-level imports

    digest = hashlib.sha256(str(barcode or '').encode('utf-8')).hexdigest()
    barcode_hash = int(digest, 16) % (10 ** 10)
    return f"999{barcode_hash:010d}"

def get_isbn_for_book(book_data: Dict) -> str:
    """Resolve an ISBN for a dataset record, trying sources in priority order.

    Order: the record's own ``identifiers_src``, then Google Books, then Open
    Library (both queried with ``title_src`` / ``author_src``), and finally a
    placeholder generated from ``barcode_src``.

    Args:
        book_data: One dataset record.

    Returns:
        The first ISBN any source yields; the generated placeholder when none
        of the earlier sources does.
    """
    from_record = extract_isbn_from_identifiers(book_data.get('identifiers_src'))
    if from_record:
        return from_record

    title = book_data.get('title_src', '')
    author = book_data.get('author_src', '')
    # Remote lookups, in the order they should be attempted.
    for lookup in (search_isbn_google_books, search_isbn_open_library):
        found = lookup(title, author)
        if found:
            return found

    return generate_isbn_from_barcode(book_data.get('barcode_src', ''))

def fetch_cover_image(isbn: str) -> Optional[str]:
    """Return the Open Library large-cover URL for ``isbn`` if a cover exists.

    Args:
        isbn: ISBN to look up; when empty no request is made.

    Returns:
        ``https://covers.openlibrary.org/b/isbn/<isbn>-L.jpg`` when the probe
        request ends with HTTP 200, otherwise ``None`` (also on any request
        failure: the check is best-effort).
    """
    if not isbn:
        return None
    url = f"https://covers.openlibrary.org/b/isbn/{isbn}-L.jpg"
    try:
        response = requests.head(
            url,
            # Without default=false the covers service answers 200 with a
            # blank placeholder for unknown ISBNs, which would make every
            # book look like it has a cover.
            params={'default': 'false'},
            timeout=3,
            # requests.head() does not follow redirects unless asked to, and
            # a cover may be served through a redirect to its storage host.
            allow_redirects=True,
        )
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt propagates.
        return None
    return url if response.status_code == 200 else None

def parse_publication_date(date_str: str) -> Optional[str]:
    """Turn free-form date text into a ``YYYY-01-01`` date string.

    The first standalone four-digit year in the range 1000-2099 is used; month
    and day are always set to January 1st.

    Args:
        date_str: Raw date value; non-string values are converted with ``str``.

    Returns:
        ``"<year>-01-01"``, or ``None`` when the input is empty or contains no
        such year.
    """
    if date_str:
        found = re.search(r'\b(1[0-9]{3}|20[0-9]{2})\b', str(date_str))
        if found is not None:
            return found.group(1) + "-01-01"
    return None

def clean_text(text: str, max_length: int = 5000) -> str:
    """Collapse whitespace runs and cap the length of ``text``.

    Args:
        text: Input text; a falsy value yields an empty string.
        max_length: Number of characters kept before ``"..."`` is appended.

    Returns:
        The text with every whitespace run replaced by one space and the ends
        trimmed. When that is longer than ``max_length`` it is cut to
        ``max_length`` characters and ``"..."`` is appended, so the result
        may be up to three characters longer than ``max_length``.
    """
    if not text:
        return ""
    collapsed = re.sub(r'\s+', ' ', text).strip()
    return collapsed if len(collapsed) <= max_length else collapsed[:max_length] + "..."

def extract_language(book_data: Dict) -> str:
    """Return a display language for a record.

    ``language_gen`` is preferred over ``language_src``. The first three
    letters of the value (case-insensitive) are looked up in a small code
    table; unknown values are returned unchanged (trimmed).

    Args:
        book_data: One dataset record.

    Returns:
        The mapped language name, the trimmed raw value when it is not in the
        table, or ``'English'`` when neither field holds a non-empty string.
    """
    raw = book_data.get('language_gen') or book_data.get('language_src')
    # Both fields can be present but None; slicing None used to raise here.
    lang = raw.strip() if isinstance(raw, str) else ''
    if not lang:
        return 'English'
    lang_map = {
        'eng': 'English', 'fra': 'French', 'deu': 'German',
        'spa': 'Spanish', 'ita': 'Italian', 'por': 'Portuguese',
        # MARC / ISO 639-2 bibliographic codes for French and German.
        'fre': 'French', 'ger': 'German',
    }
    return lang_map.get(lang[:3].lower(), lang)

def extract_genres(book_data: Dict) -> List[str]:
    """Build a list of up to three distinct genre names for a record.

    Names come from ``genre_or_form_src`` (split on ``;``, ``,`` and ``|``)
    followed by the topic (``topic_or_subject_gen``, falling back to
    ``topic_or_subject_src``). Every name is trimmed, and empty or repeated
    names are dropped so duplicates cannot use up the three slots.

    Args:
        book_data: One dataset record.

    Returns:
        At most three names in first-seen order, or ``['General']`` when the
        record yields none.
    """
    candidates: List[str] = []
    genre_str = book_data.get('genre_or_form_src') or ''
    if isinstance(genre_str, str):
        candidates.extend(re.split(r'[;,|]', genre_str))
    topic = book_data.get('topic_or_subject_gen') or book_data.get('topic_or_subject_src') or ''
    if isinstance(topic, str):
        candidates.append(topic)

    genres: List[str] = []
    for candidate in candidates:
        name = candidate.strip()
        if name and name not in genres:
            genres.append(name)
    return genres[:3] if genres else ['General']

def extract_description(book_data: Dict) -> str:
    """Produce a description for a record, capped via ``clean_text(..., 2000)``.

    ``general_note_src`` is used when it is non-empty. Otherwise a short
    summary is assembled from the topic (``topic_or_subject_gen``, falling
    back to ``topic_or_subject_src``) and ``page_count_src``; when neither is
    available the text ``"No description available"`` is used.

    Args:
        book_data: One dataset record.

    Returns:
        The cleaned description text.
    """
    note = book_data.get('general_note_src', '')
    if note:
        return clean_text(note, 2000)

    subject = book_data.get('topic_or_subject_gen') or book_data.get('topic_or_subject_src')
    page_total = book_data.get('page_count_src')

    fragments = []
    if subject:
        fragments.append(f"Subject: {subject}")
    if page_total:
        fragments.append(f"{page_total} pages")

    summary = ". ".join(fragments) if fragments else "No description available"
    return clean_text(summary, 2000)

# Printed once all helper function definitions in this cell have executed.
print("‚úÖ Helper functions defined")

## Step 5: Database Functions

In [None]:
def get_or_create_author(cursor, author_name: str) -> int:
    """Return the id of the ``authors`` row for a name, inserting it if absent.

    The name is split on whitespace: the last token becomes ``nom`` and the
    remaining tokens, joined by single spaces, become ``prenom``. A
    single-token name is stored with an empty ``prenom``. An empty or blank
    name is replaced by ``"Unknown Author"`` before splitting.

    Args:
        cursor: Database cursor used for the lookup and the insert.
        author_name: Raw author name from the record.

    Returns:
        The id of the existing or newly inserted row.
    """
    cleaned = author_name.strip() if author_name else ''
    if cleaned == '':
        cleaned = "Unknown Author"

    # Last token is the family name; everything before it is the given name.
    *given, family = cleaned.split()
    identity = (family, ' '.join(given))

    cursor.execute("SELECT id FROM authors WHERE nom = %s AND prenom = %s", identity)
    row = cursor.fetchone()
    if not row:
        cursor.execute(
            "INSERT INTO authors (nom, prenom, created_at) VALUES (%s, %s, NOW()) RETURNING id",
            identity
        )
        row = cursor.fetchone()
    return row[0]

def get_or_create_genre(cursor, genre_name: str) -> int:
    """Return the id of the ``genres`` row for a name, inserting it if absent.

    The name is trimmed; an empty or blank name is replaced by ``"General"``.

    Args:
        cursor: Database cursor used for the lookup and the insert.
        genre_name: Raw genre name.

    Returns:
        The id of the existing or newly inserted row.
    """
    label = genre_name.strip() if genre_name else ''
    if label == '':
        label = "General"

    cursor.execute("SELECT id FROM genres WHERE nom = %s", (label,))
    row = cursor.fetchone()
    if not row:
        cursor.execute(
            "INSERT INTO genres (nom, created_at) VALUES (%s, NOW()) RETURNING id",
            (label,)
        )
        row = cursor.fetchone()
    return row[0]

# Printed once the database helper definitions in this cell have executed.
print("‚úÖ Database functions defined")

## Step 6: Progress Tracking

In [None]:
def load_progress() -> Dict:
    """Read the saved progress record from ``PROGRESS_FILE``.

    Returns:
        The JSON content of the file when it exists; otherwise a fresh record
        with ``last_processed_index`` of -1, ``total_loaded`` of 0 and
        ``last_barcode`` / ``timestamp`` set to ``None``.
    """
    if not os.path.exists(PROGRESS_FILE):
        return {
            'last_processed_index': -1,
            'total_loaded': 0,
            'last_barcode': None,
            'timestamp': None,
        }
    with open(PROGRESS_FILE, 'r') as handle:
        return json.loads(handle.read())

def save_progress(index: int, total: int, barcode: str) -> None:
    """Persist the resume point to ``PROGRESS_FILE`` as JSON.

    The record is written to a sibling ``<PROGRESS_FILE>.tmp`` file first and
    then moved over the real file with ``os.replace``. An interruption during
    the write therefore leaves the previous progress file intact instead of a
    truncated one that could not be parsed on the next run.

    Args:
        index: Index of the last processed dataset record.
        total: Running total of loaded books.
        barcode: Barcode of the last processed record.
    """
    progress = {
        'last_processed_index': index,
        'total_loaded': total,
        'last_barcode': barcode,
        'timestamp': datetime.now().isoformat()
    }
    tmp_path = f"{PROGRESS_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(progress, f, indent=2)
    os.replace(tmp_path, PROGRESS_FILE)

# Printed once the progress-tracking definitions in this cell have executed.
print("‚úÖ Progress tracking functions defined")

## Step 7: Login to HuggingFace

In [None]:
# Authenticate with HuggingFace using the token set in the configuration cell.
# The confirmation below is reached only if login() did not raise.
login(token=HF_TOKEN)
print("‚úÖ Successfully logged in to HuggingFace")

## Step 8: Load Dataset with Streaming

In [None]:
# Read the saved progress record; loading resumes one position past the last
# index it recorded (index 0 when no progress file exists).
progress = load_progress()
start_index = progress['last_processed_index'] + 1

# A positive start index means an earlier run saved progress: report where it stopped.
if start_index > 0:
    print(f"üìç Resuming from index {start_index} ({progress['total_loaded']} books loaded)")
    print(f"   Last: {progress.get('last_barcode', 'N/A')} at {progress.get('timestamp', 'N/A')}")

print(f"\nüì• Loading dataset: {DATASET_NAME}")
# Open the "train" split in streaming mode; the main loop iterates over this object.
dataset_stream = load_dataset(DATASET_NAME, split="train", streaming=True)
print("‚úÖ Dataset loaded successfully")

## Step 9: Connect to Database

In [None]:
# Open the database connection described by DB_CONFIG.
conn = psycopg2.connect(**DB_CONFIG)
# Autocommit is switched off so the loading cell decides when to commit or roll back.
conn.set_session(autocommit=False)
# Cursor shared by the loading cell and closed in the "Close Connection" cell.
cursor = conn.cursor()
print("‚úÖ Database connected")

## Step 10: Load Books (Main Processing Loop)

In [None]:
def _iter_chunks(stream, skip, size):
    """Yield ``(last_index, records)`` batches of up to ``size`` records.

    The first ``skip`` records of ``stream`` are consumed and discarded.
    ``last_index`` is the zero-based stream position of the final record in
    the batch. A trailing batch smaller than ``size`` is yielded too, so the
    end of the stream is not silently dropped.
    """
    batch = []
    index = -1
    for index, record in enumerate(stream):
        if index < skip:
            continue
        batch.append(record)
        if len(batch) >= size:
            yield index, batch
            batch = []
    if batch:
        yield index, batch


def _insert_book(cursor, book):
    """Insert one record with its author and genre links.

    Returns ``True`` when a new ``books`` row was written and ``False`` when a
    row with the same ISBN already exists. Errors propagate to the caller.
    """
    isbn = get_isbn_for_book(book)
    cursor.execute("SELECT id FROM books WHERE isbn = %s", (isbn,))
    if cursor.fetchone():
        return False

    # `or` rather than a .get() default: a key that is present with a None
    # value would otherwise bypass the fallback text.
    title = book.get('title_src') or 'Unknown Title'
    author_name = book.get('author_src') or 'Unknown Author'
    pub_date = parse_publication_date(book.get('date1_src', ''))
    description = extract_description(book)
    page_count = book.get('page_count_src')
    language = extract_language(book)
    genres = extract_genres(book)
    cover_url = fetch_cover_image(isbn)
    token_count = book.get('token_count_o200k_base_gen', 0)
    # Word count is estimated as 0.75 words per token.
    word_count = int(token_count * 0.75) if token_count else None

    cursor.execute(
        """
        INSERT INTO books (
            titre, isbn, date_publication, description, image_url,
            nombre_pages, total_pages, langue, note_moyenne, nombre_reviews,
            average_rating, review_count, word_count, created_at
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
        RETURNING id
        """,
        (title, isbn, pub_date, description, cover_url,
         page_count, page_count, language, 0.0, 0, 0.0, 0, word_count)
    )
    book_id = cursor.fetchone()[0]

    author_id = get_or_create_author(cursor, author_name)
    cursor.execute(
        "INSERT INTO book_authors (book_id, author_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
        (book_id, author_id)
    )

    for genre_name in genres:
        genre_id = get_or_create_genre(cursor, genre_name)
        cursor.execute(
            "INSERT INTO book_genres (book_id, genre_id) VALUES (%s, %s) ON CONFLICT DO NOTHING",
            (book_id, genre_id)
        )

    cursor.execute(
        "UPDATE books SET author_names = ARRAY[%s], genre_names = %s WHERE id = %s",
        (author_name, genres, book_id)
    )
    return True


def _load_chunk(conn, cursor, books):
    """Load one batch inside a single transaction and commit it.

    Each book runs under its own savepoint. When a book fails, only that
    book's statements are undone; a plain ``conn.rollback()`` here would also
    discard every earlier, not-yet-committed book of the batch while those
    books were still being counted as inserted.

    Returns ``(inserted, skipped)`` for the batch.
    """
    inserted = 0
    skipped = 0
    for book in books:
        cursor.execute("SAVEPOINT book_load")
        try:
            was_new = _insert_book(cursor, book)
        except Exception as e:
            print(f"    ‚ö†Ô∏è  Error: {str(e)[:100]}")
            cursor.execute("ROLLBACK TO SAVEPOINT book_load")
            cursor.execute("RELEASE SAVEPOINT book_load")
            skipped += 1
            continue
        cursor.execute("RELEASE SAVEPOINT book_load")
        if was_new:
            inserted += 1
        else:
            skipped += 1
    conn.commit()
    return inserted, skipped


print(f"\nüöÄ Starting data loading (chunk size: {CHUNK_SIZE})")
print("=" * 80)

chunk_buffer = []
current_index = 0
chunk_number = 0
total_inserted = progress['total_loaded']
total_skipped = 0
start_time = time.time()

try:
    for last_index, chunk_buffer in _iter_chunks(dataset_stream, start_index, CHUNK_SIZE):
        chunk_number += 1
        current_index = last_index + 1  # count of stream records consumed so far
        first_book = current_index - len(chunk_buffer) + 1
        print(f"\nüì¶ Chunk {chunk_number} (books {first_book}-{current_index})")
        print("-" * 80)

        inserted, skipped = _load_chunk(conn, cursor, chunk_buffer)

        total_inserted += inserted
        total_skipped += skipped

        # Progress is saved only after the batch has been committed.
        last_barcode = chunk_buffer[-1].get('barcode_src', 'unknown')
        save_progress(current_index - 1, total_inserted, last_barcode)

        elapsed = time.time() - start_time
        # total_inserted also counts books from earlier sessions; the rate is
        # measured over this session's inserts only.
        session_inserted = total_inserted - progress['total_loaded']
        rate = session_inserted / elapsed if elapsed > 0 else 0

        print(f"   ‚úÖ Inserted: {inserted}")
        print(f"   ‚è≠Ô∏è  Skipped: {skipped}")
        print(f"   üìä Total: {total_inserted} loaded, {total_skipped} skipped")
        print(f"   ‚è±Ô∏è  Rate: {rate:.1f} books/sec")
        print(f"   üíæ Progress saved")

        if MAX_CHUNKS and chunk_number >= MAX_CHUNKS:
            print(f"\nüèÅ Reached max chunks ({MAX_CHUNKS})")
            break

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è  Interrupted by user")
    # The interrupted batch may contain a book whose author/genre links were
    # not written yet. It is discarded: the progress file only covers
    # committed batches, so a resumed run reloads this batch from its start.
    conn.rollback()
except Exception as e:
    print(f"\n‚ùå Error: {e}")
    conn.rollback()

elapsed = time.time() - start_time
session_inserted = total_inserted - progress['total_loaded']
final_rate = session_inserted / elapsed if elapsed > 0 else 0
print("\n" + "=" * 80)
print("‚úÖ LOADING COMPLETE")
print("=" * 80)
print(f"üìä Total inserted: {total_inserted}")
print(f"‚è≠Ô∏è  Total skipped: {total_skipped}")
print(f"üì¶ Chunks processed: {chunk_number}")
print(f"‚è±Ô∏è  Time: {elapsed:.1f}s")
print(f"üìà Rate: {final_rate:.1f} books/sec")
print("=" * 80)

## Step 11: Close Connection

In [None]:
# Release the cursor and connection opened in the "Connect to Database" cell.
cursor.close()
conn.close()
print("üîå Database connection closed")

## Step 12: View Summary

In [None]:
# Open a fresh connection for read-only reporting; the loading connection was
# closed in the previous cell.
conn = psycopg2.connect(**DB_CONFIG)
cursor = conn.cursor()

# try/finally guarantees the cursor and connection are released even when one
# of the summary queries fails (for example because a table is missing).
try:
    print("\nüìä Database Summary:")
    print("=" * 80)

    cursor.execute("SELECT COUNT(*) FROM books")
    print(f"üìö Total Books: {cursor.fetchone()[0]}")

    cursor.execute("SELECT COUNT(*) FROM authors")
    print(f"‚úçÔ∏è  Total Authors: {cursor.fetchone()[0]}")

    cursor.execute("SELECT COUNT(*) FROM genres")
    print(f"üè∑Ô∏è  Total Genres: {cursor.fetchone()[0]}")

    cursor.execute("SELECT COUNT(*) FROM books WHERE image_url IS NOT NULL")
    print(f"üñºÔ∏è  Books with covers: {cursor.fetchone()[0]}")

    # Five most frequent values of the `langue` column with their book counts.
    cursor.execute("SELECT langue, COUNT(*) FROM books GROUP BY langue ORDER BY COUNT(*) DESC LIMIT 5")
    print("\nüåç Top 5 Languages:")
    for lang, count in cursor.fetchall():
        print(f"   {lang}: {count} books")
finally:
    cursor.close()
    conn.close()

# Reached only when every query above succeeded.
print("\n‚úÖ Summary complete!")