In [None]:
# --- STEP 1: Install dependencies ---
!pip install pillow imagehash requests tqdm

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [None]:

import os
import csv
import time
import requests
import imagehash
from tqdm import tqdm
from PIL import Image

In [None]:

# --- STEP 2: Config ---
# Open Library search queries; each one is also stored as the genre label
# of the rows it produces.
SEARCH_TERMS = [
    "fantasy",
    "romance",
    "thriller",
    "science fiction",
    "mystery",
]
# Number of search results requested per term (adjust to control dataset size).
MAX_RESULTS_PER_TERM = 50
# Size every downloaded cover is resized to (portrait book cover format).
IMAGE_SIZE = 512, 768
# Folder for the cover images and file name of the metadata CSV.
OUTDIR, META_FILE = "book_covers", "metadata.csv"

# The image folder must exist before any cover is written into it.
os.makedirs(OUTDIR, exist_ok=True)

In [None]:
# --- STEP 3: Helper - Resize image ---
def resize_cover(infile, outfile, size=IMAGE_SIZE):
    """Resize the image at ``infile`` to ``size`` and save it to ``outfile``.

    The image is converted to RGB, resampled with the LANCZOS filter and saved
    with ``quality=95``. ``infile`` and ``outfile`` may be the same path.

    Failures are reported with ``print`` and never raised (best effort), so the
    function always returns ``None``.

    Args:
        infile: Path of the image to read.
        outfile: Path the resized image is written to.
        size: Target size passed to ``Image.resize``; defaults to ``IMAGE_SIZE``.
    """
    try:
        # The context manager releases the source file handle even if decoding
        # fails part-way. convert() returns a fully loaded copy, so the source
        # is closed before outfile (possibly the same path) is written.
        with Image.open(infile) as src:
            img = src.convert("RGB")
        img = img.resize(size, Image.LANCZOS)
        img.save(outfile, quality=95)
    except Exception as e:
        print(f"Resize failed for {infile}: {e}")

In [None]:
# --- STEP 4: Fetch from Open Library ---
def fetch_openlibrary(query, limit=MAX_RESULTS_PER_TERM):
    """Search Open Library for ``query`` and download the cover of each result.

    Every result that has a cover id is downloaded into ``OUTDIR``, resized in
    place with ``resize_cover`` and described by one metadata row.

    Args:
        query: Search string; also stored in each row as the genre.
        limit: Maximum number of search results to request.

    Returns:
        A list of rows ``[key, title, first author, first publish year, query,
        cover_url, local filename, license, source]``. The list is empty when
        the search request itself fails.
    """
    search_url = 'https://openlibrary.org/search.json'
    params = {'q': query, 'limit': limit}
    try:
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status turns HTTP error pages into a reported failure.
        search_resp = requests.get(search_url, params=params, timeout=30)
        search_resp.raise_for_status()
        r = search_resp.json()
    except Exception as e:
        print(f"Failed to fetch {query}: {e}")
        return []

    docs = r.get('docs', []) if isinstance(r, dict) else []

    rows = []
    for doc in docs:
        cover_id = doc.get('cover_i')
        if not cover_id:
            continue  # no cover image for this result
        cover_url = f'https://covers.openlibrary.org/b/id/{cover_id}-L.jpg'
        # Work keys look like '/works/OL123W'; slashes are replaced so the key
        # can be part of a file name. A null key is treated as empty.
        work_key = doc.get('key') or ''
        fname = os.path.join(OUTDIR, f"{work_key.replace('/', '_')}_{cover_id}.jpg")
        # author_name can be missing or an empty list; fall back to ''.
        authors = doc.get('author_name') or ['']

        try:
            img_resp = requests.get(cover_url, timeout=10)
            if img_resp.status_code == 200:
                with open(fname, 'wb') as f:
                    f.write(img_resp.content)
                resize_cover(fname, fname)  # resize immediately
                rows.append([
                    doc.get('key'),
                    doc.get('title'),
                    authors[0],
                    doc.get('first_publish_year'),
                    query,
                    cover_url,
                    fname,
                    'public-domain/unknown',
                    'openlibrary'
                ])
        except Exception as e:
            print("Skip image due to error:", e)

        time.sleep(0.2)  # be nice to API

    return rows


In [None]:
# --- STEP 5: Collect dataset ---
# Gather the metadata rows of every configured search term into one flat list.
all_rows = []
for term in SEARCH_TERMS:
    print(f"Fetching covers for genre: {term}")
    all_rows += fetch_openlibrary(term)


Fetching covers for genre: fantasy
Fetching covers for genre: romance
Fetching covers for genre: thriller
Fetching covers for genre: science fiction
Fetching covers for genre: mystery


In [None]:

# --- STEP 6: Remove duplicates by perceptual hash ---
# The same work can be returned for several search terms, in which case
# all_rows holds more than one row pointing at the *same* file. Each path is
# therefore handled only once: treating the repeat as a duplicate would delete
# the file that the row kept earlier still refers to.
hashes = {}         # perceptual hash (as string) -> path of the cover kept for it
unique_rows = []
seen_paths = set()  # every path that was already kept, deleted or failed
for row in all_rows:
    img_path = row[6]  # local file name of the cover
    if img_path in seen_paths:
        continue
    seen_paths.add(img_path)

    try:
        # Context manager so the file handle is closed before a possible delete.
        with Image.open(img_path) as img:
            h = str(imagehash.phash(img))
    except Exception as e:
        print(f"Hash failed for {img_path}: {e}")
        continue

    if h not in hashes:
        hashes[h] = img_path
        unique_rows.append(row)
    else:
        # A different file with the same perceptual hash: drop the extra copy.
        try:
            os.remove(img_path)
        except OSError as e:
            print(f"Could not remove duplicate {img_path}: {e}")

Hash failed for book_covers/_works_OL52267W_9009316.jpg: [Errno 2] No such file or directory: 'book_covers/_works_OL52267W_9009316.jpg'
Hash failed for book_covers/_works_OL52266W_6419199.jpg: [Errno 2] No such file or directory: 'book_covers/_works_OL52266W_6419199.jpg'
Hash failed for book_covers/_works_OL52114W_36314.jpg: [Errno 2] No such file or directory: 'book_covers/_works_OL52114W_36314.jpg'
Hash failed for book_covers/_works_OL8193416W_14314858.jpg: [Errno 2] No such file or directory: 'book_covers/_works_OL8193416W_14314858.jpg'
Hash failed for book_covers/_works_OL472715W_13699667.jpg: [Errno 2] No such file or directory: 'book_covers/_works_OL472715W_13699667.jpg'


In [None]:
# --- STEP 7: Save metadata ---
# Header row followed by the deduplicated rows, written in a single call.
with open(META_FILE, 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(
        [['id','title','author','year','genre','cover_url','cover_filename','license','source']]
        + unique_rows
    )

# Summary of what was produced and where it was stored.
print(
    f"\n✅ Done! Downloaded {len(unique_rows)} unique covers.",
    f"Images in: {OUTDIR}",
    f"Metadata saved to: {META_FILE}",
    sep="\n",
)


✅ Done! Downloaded 212 unique covers.
Images in: book_covers
Metadata saved to: metadata.csv


In [None]:
# Offer the metadata CSV for download (Colab only). META_FILE is the path the
# metadata was written to, so the two cannot drift apart.
from google.colab import files
files.download(META_FILE)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r book_covers_dataset.zip book_covers metadata.csv
from google.colab import files
files.download("book_covers_dataset.zip")


  adding: book_covers/ (stored 0%)
  adding: book_covers/_works_OL98491W_11196262.jpg (deflated 0%)
  adding: book_covers/_works_OL25595002W_7267770.jpg (deflated 35%)
  adding: book_covers/_works_OL472572W_12855104.jpg (deflated 0%)
  adding: book_covers/_works_OL471576W_11100465.jpg (deflated 0%)
  adding: book_covers/_works_OL81180W_12983362.jpg (deflated 0%)
  adding: book_covers/_works_OL66562W_9278292.jpg (deflated 0%)
  adding: book_covers/_works_OL471702W_14586349.jpg (deflated 0%)
  adding: book_covers/_works_OL77775W_14640067.jpg (deflated 0%)
  adding: book_covers/_works_OL1168007W_11261770.jpg (deflated 0%)
  adding: book_covers/_works_OL39360W_3240273.jpg (deflated 1%)
  adding: book_covers/_works_OL472536W_14577913.jpg (deflated 0%)
  adding: book_covers/_works_OL80609W_6581958.jpg (deflated 0%)
  adding: book_covers/_works_OL24034W_13859660.jpg (deflated 1%)
  adding: book_covers/_works_OL54158W_9173884.jpg (deflated 0%)
  adding: book_covers/_works_OL262438W_9247987.jpg

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## One-line plot: summarize each book's Open Library description into a single sentence

In [None]:
from transformers import pipeline
import pandas as pd
import requests
import json
from tqdm import tqdm

In [None]:
# --- Load existing metadata ---
# Relative path, matching how the file is written and read elsewhere in this
# notebook, so the cell does not depend on a /content working directory.
df = pd.read_csv("metadata.csv")

In [None]:
# --- Summarizer ---
# Use the first GPU when one is available and fall back to CPU (device=-1)
# otherwise, so the cell also runs on CPU-only runtimes.
import torch

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1,
)

def fetch_description(olid):
    """Fetch description text from Open Library given a work OLID like '/works/OL12345W'.

    Args:
        olid: Work key such as '/works/OL12345W'; only the last path segment
            is used, so a bare 'OL12345W' works as well.

    Returns:
        The description string, or "" when ``olid`` is not a usable string,
        the request fails, the response is not valid JSON, or the work has no
        description text.
    """
    # Missing ids show up as None/NaN when the metadata was loaded from CSV.
    if not isinstance(olid, str):
        return ""
    olid_clean = olid.strip().rstrip("/").split("/")[-1]
    if not olid_clean:
        return ""

    url = f"https://openlibrary.org/works/{olid_clean}.json"
    try:
        # Timeout so one stalled request cannot hang the whole loop.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        r = resp.json()
    except Exception:
        # Best effort: any request/parse failure means "no description".
        # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
        return ""

    if not isinstance(r, dict):
        return ""
    desc = r.get("description", "")
    # The description is either a plain string or a dict with a "value" entry.
    if isinstance(desc, dict):
        desc = desc.get("value", "")
    return desc if isinstance(desc, str) else ""

def make_one_line(desc):
    """Convert a long description into a 1-line plot.

    Args:
        desc: Description text. Anything that is not a string (None, NaN, ...)
            is treated as "no description".

    Returns:
        "" when there is no description or it has fewer than 8 words;
        otherwise the summarizer's ``summary_text``; if the summarizer call
        fails, the first 120 characters of ``desc``.
    """
    if not isinstance(desc, str) or len(desc.split()) < 8:
        return ""
    try:
        # truncation=True clips inputs that exceed the model's input limit
        # instead of letting the call fail on long descriptions.
        summary = summarizer(
            desc,
            max_length=25,
            min_length=15,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed.
        return desc[:120]  # fallback: truncate


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:

# --- Process each row ---
# One summary per work id, in DataFrame row order, with a progress bar.
one_line_plots = [
    make_one_line(fetch_description(work_id))
    for work_id in tqdm(df['id'], total=len(df))
]

df['one_line_plot'] = one_line_plots

  5%|▌         | 11/212 [00:05<01:27,  2.30it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 18%|█▊        | 39/212 [00:18<01:29,  1.94it/s]Your max_length is set to 25, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
 61%|██████▏   | 130/212 [01:07<00:40,  2.03it/s]Your max_length is set to 25, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
 64%|██████▍   | 136/212 [01:10<00:37,  2.02it/s]Your max_length is set to 25, but your input_length is only 22. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length ma

In [None]:
# --- Save enriched metadata ---
# Write the frame, including the new one_line_plot column, without the index.
df.to_csv(path_or_buf="metadata_with_plots.csv", index=False)
print("\n✅ Metadata updated with one_line_plot column.")


✅ Metadata updated with one_line_plot column.


In [None]:
# Offer the enriched metadata CSV for download through Colab's files helper.
# This import only resolves inside a Google Colab runtime.
from google.colab import files
files.download("metadata_with_plots.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm

# Paths
metadata_path = "metadata_with_plots.csv"
images_path = "book_covers"
output_images_path = "lora_dataset/images"
output_captions_path = "lora_dataset/captions"

# Create output folders
os.makedirs(output_images_path, exist_ok=True)
os.makedirs(output_captions_path, exist_ok=True)

# Load metadata
df = pd.read_csv(metadata_path)


def _caption_text(value):
    """Return ``value`` as stripped text; missing CSV cells (None/NaN) become ''."""
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


def _build_caption(title, plot):
    """Build the caption text, omitting the plot clause when there is no plot."""
    title_text = _caption_text(title)
    # Drop a trailing period so the caption does not end in "..".
    plot_text = _caption_text(plot).rstrip(".").rstrip()
    if plot_text:
        return f"Title: {title_text}. Plot: {plot_text}."
    return f"Title: {title_text}."


# Preprocess each image: one resized JPEG and one caption file per row, both
# named after the row index.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    img_filename = row['cover_filename']
    if not isinstance(img_filename, str):
        continue  # no file name recorded for this row
    img_path = os.path.join(images_path, os.path.basename(img_filename))

    if not os.path.exists(img_path):
        continue  # skip missing images

    try:
        # Open and resize image; the context manager closes the source file.
        with Image.open(img_path) as src:
            img = src.convert("RGB")
        # NOTE(review): a square target changes the aspect ratio of covers
        # that are not square -- confirm this is intended for training.
        img = img.resize((512, 512), Image.Resampling.LANCZOS)

        # Save to new dataset folder
        img_out_path = os.path.join(output_images_path, f"{idx:04d}.jpg")
        img.save(img_out_path, "JPEG", quality=95)

        # Create caption file. Empty plots are read back from the CSV as NaN,
        # so they are normalized instead of being written out as "nan".
        prompt = _build_caption(row['title'], row['one_line_plot'])
        caption_out_path = os.path.join(output_captions_path, f"{idx:04d}.txt")
        with open(caption_out_path, "w", encoding="utf-8") as f:
            f.write(prompt)

    except Exception as e:
        print(f"Error processing {img_path}: {e}")

print("✅ Dataset ready for LoRA training in 'lora_dataset/'")


100%|██████████| 212/212 [00:01<00:00, 113.55it/s]

✅ Dataset ready for LoRA training in 'lora_dataset/'





In [None]:
# Zip your dataset and offer it for download
!zip -r book_cover_dataset.zip book_covers metadata_with_plots.csv metadata.csv lora_dataset 2>/dev/null || true

from google.colab import files
files.download("book_cover_dataset.zip")


  adding: book_covers/ (stored 0%)
  adding: book_covers/_works_OL98491W_11196262.jpg (deflated 0%)
  adding: book_covers/_works_OL25595002W_7267770.jpg (deflated 35%)
  adding: book_covers/_works_OL472572W_12855104.jpg (deflated 0%)
  adding: book_covers/_works_OL471576W_11100465.jpg (deflated 0%)
  adding: book_covers/_works_OL81180W_12983362.jpg (deflated 0%)
  adding: book_covers/_works_OL66562W_9278292.jpg (deflated 0%)
  adding: book_covers/_works_OL471702W_14586349.jpg (deflated 0%)
  adding: book_covers/_works_OL77775W_14640067.jpg (deflated 0%)
  adding: book_covers/_works_OL1168007W_11261770.jpg (deflated 0%)
  adding: book_covers/_works_OL39360W_3240273.jpg (deflated 1%)
  adding: book_covers/_works_OL472536W_14577913.jpg (deflated 0%)
  adding: book_covers/_works_OL80609W_6581958.jpg (deflated 0%)
  adding: book_covers/_works_OL24034W_13859660.jpg (deflated 1%)
  adding: book_covers/_works_OL54158W_9173884.jpg (deflated 0%)
  adding: book_covers/_works_OL262438W_9247987.jpg

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

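[REDACTED: a Hugging Face access token was pasted here. Revoke it at https://huggingface.co/settings/tokens and load credentials from the Colab secret `HF_TOKEN` instead of storing them in the notebook.]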