In [1]:
import csv
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import pandas as pd

In [2]:
# === SETTINGS ===
INPUT_CSV = "books.csv"             # source file; must contain an isbn13 column
OUTPUT_CSV = "books_with_covers.csv"
COVERS_DIR = "covers"
MAX_WORKERS = 10                    # number of worker threads
SLEEP_BETWEEN = 0.05                # pause between requests, in seconds (passed to time.sleep)
GOOGLE_API_KEY = ""                 # may be left empty (the first pass then queries without a key)
TIMEOUT = 10                        # timeout for HTTP requests (passed to requests.get)

In [3]:
def is_image_response(resp):
    """Return True when ``resp`` is a successful response that carries an image.

    Used to tell real cover images apart from HTML error pages.

    Args:
        resp: Response-like object exposing ``status_code`` and a ``headers``
            mapping (for example a ``requests.Response``).

    Returns:
        bool: True only for status 200 with a Content-Type starting with
        ``image/``; the media type is compared case-insensitively.
    """
    if resp.status_code != 200:
        return False
    # Media types are case-insensitive, and the header may be missing or None.
    content_type = resp.headers.get("Content-Type") or ""
    return content_type.strip().lower().startswith("image/")

In [4]:
def get_openlibrary_cover(isbn13):
    """Try to download the large cover for ``isbn13`` from Open Library.

    Args:
        isbn13: ISBN string used to build the cover URL.

    Returns:
        tuple: ``(url, image_bytes)`` on success, ``(None, None)`` when the
        cover is missing or the request fails. ``url`` is the plain cover URL
        without the ``default`` query parameter.
    """
    url = f"https://covers.openlibrary.org/b/isbn/{isbn13}-L.jpg"
    try:
        # Without ``default=false`` Open Library answers unknown ISBNs with a
        # blank placeholder image, which would be stored as if it were a cover.
        r = requests.get(url, params={"default": "false"}, timeout=TIMEOUT)
        if is_image_response(r):
            return url, r.content
    except Exception:
        # Best effort: any failure simply means "no cover from this source".
        pass
    return None, None

In [5]:
def get_googlebooks_cover(isbn13):
    """Look up ``isbn13`` in Google Books and download the first usable cover.

    The API key is appended to the query only when ``GOOGLE_API_KEY`` is set.
    Image links of the first returned volume are tried from largest to
    smallest. Any exception raised along the way is swallowed.

    Returns:
        tuple: ``(link, image_bytes)`` on success, otherwise ``(None, None)``.
    """
    query_url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn13}"
    if GOOGLE_API_KEY:
        query_url = f"{query_url}&key={GOOGLE_API_KEY}"

    nothing = (None, None)
    try:
        lookup = requests.get(query_url, timeout=TIMEOUT)
        if lookup.status_code != 200:
            return nothing

        volumes = lookup.json().get("items")
        if not volumes:
            return nothing

        links = volumes[0].get("volumeInfo", {}).get("imageLinks", {})
        sizes = ("extraLarge", "large", "medium", "thumbnail", "smallThumbnail")
        # Keep only the sizes that are actually present, largest first.
        for candidate in filter(None, (links.get(size) for size in sizes)):
            image = requests.get(candidate, timeout=TIMEOUT)
            if is_image_response(image):
                return candidate, image.content
    except Exception:
        pass
    return nothing

In [6]:
def clean_isbn13(value):
    """Normalise an ISBN that may have been mangled by numeric conversion.

    Handles values written in scientific notation
    (``9.78042517693e+12`` -> ``'9780425176930'``), values with a float-style
    ``.0`` tail, and values containing dots, hyphens or spaces as separators.

    Args:
        value: Raw ISBN as a string or number; falsy values yield ``None``.

    Returns:
        str | None: A 10-13 character string of digits (a 10-character value
        may end in the ``X`` check character), or ``None`` when the input is
        empty, too short, or contains non-digit characters.
    """
    if not value:
        return None
    s = str(value).strip()

    if "e" in s or "E" in s:
        # Scientific notation: go through float -> int to recover the digits.
        try:
            s = str(int(float(s)))
        except (ValueError, OverflowError):
            pass
    else:
        # Drop a float-style tail such as "9780425176930.0" textually, so the
        # zero is not glued onto the digits and leading zeros are preserved.
        whole, dot, fraction = s.partition(".")
        if dot and fraction.strip("0") == "":
            s = whole

    # Remove separators: dots, hyphens, spaces.
    s = s.replace(".", "").replace("-", "").replace(" ", "")
    # Truncate anything longer than 13 characters (just in case).
    if len(s) > 13:
        s = s[:13]
    if len(s) < 10:
        return None

    digits = "0123456789"
    body, check = s[:-1], s[-1]
    # Reject free text such as "not available" that merely happens to be long.
    if any(ch not in digits for ch in body):
        return None
    if check in digits:
        return s
    # Only a 10-character ISBN may use "X" as its check character.
    if len(s) == 10 and check in "Xx":
        return body + "X"
    return None

In [7]:
def fetch_cover(isbn13):
    """Fetch a cover for one ISBN: Open Library first, then Google Books.

    The image is written to ``COVERS_DIR/<isbn13>.jpg``.

    Returns:
        dict: keys ``isbn13`` (cleaned value), ``cover_url``, ``saved_path``
        and ``status`` (``no_isbn``, ``not_found``, ``ok_openlibrary`` or
        ``ok_googlebooks``).
    """
    isbn13 = clean_isbn13(isbn13)
    result = dict(isbn13=isbn13, cover_url="", saved_path="", status="not_found")

    if not isbn13:
        result["status"] = "no_isbn"
        return result

    def _store(link, payload, status):
        # Persist the image bytes and record where they came from.
        target = os.path.join(COVERS_DIR, f"{isbn13}.jpg")
        with open(target, "wb") as out:
            out.write(payload)
        result.update({"cover_url": link, "saved_path": target, "status": status})
        return result

    # 1) Open Library
    link, payload = get_openlibrary_cover(isbn13)
    if link and payload:
        return _store(link, payload, "ok_openlibrary")

    # Short pause before falling back to the second source.
    time.sleep(SLEEP_BETWEEN)

    # 2) Google Books
    link, payload = get_googlebooks_cover(isbn13)
    if link and payload:
        return _store(link, payload, "ok_googlebooks")

    return result

In [8]:
def first_search():
    """Run the first pass: fetch a cover for every row of ``INPUT_CSV``.

    Reads the input CSV, calls ``fetch_cover(row["isbn13"])`` for each row on
    a thread pool, and writes the original columns plus ``cover_url``,
    ``saved_path`` and ``status`` to ``OUTPUT_CSV``. Output rows keep the
    order of the input file.

    NOTE: a later cell of this notebook redefines ``fetch_cover`` to accept a
    row dict; this function needs the ISBN-based version to be the one in
    scope when it runs.
    """
    os.makedirs(COVERS_DIR, exist_ok=True)

    with open(INPUT_CSV, newline='', encoding='utf-8') as fin:
        rows = list(csv.DictReader(fin))

    if not rows:
        print("⚠️ Входной CSV пуст.")
        return

    if "isbn13" not in rows[0]:
        print("❌ В CSV нет колонки 'isbn13'.")
        return

    total = len(rows)
    # Pre-sized so that results land at their input position, not in
    # thread-completion order.
    results = [None] * total
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {
            ex.submit(fetch_cover, row["isbn13"]): idx
            for idx, row in enumerate(rows)
        }
        for done, fut in enumerate(as_completed(futures), start=1):
            idx = futures[fut]
            row = rows[idx]
            try:
                res = fut.result()
            except Exception as exc:
                # One failed row (e.g. a disk write error) must not discard
                # the results of every other row.
                print(f"⚠️ Ошибка для {row['isbn13']}: {exc!r}")
                res = {
                    "isbn13": row["isbn13"],
                    "cover_url": "",
                    "saved_path": "",
                    "status": "error",
                }
            new_row = row.copy()
            new_row.update(res)
            results[idx] = new_row
            print(f"[{done}/{total}] {res['isbn13']} → {res['status']}")

    # Save the result.
    fieldnames = list(results[0].keys())
    with open(OUTPUT_CSV, "w", newline='', encoding='utf-8') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print("✅ Готово! Результаты сохранены в", OUTPUT_CSV)

In [9]:
# Run the first pass (Open Library, then Google Books by ISBN) over INPUT_CSV.
first_search()

Streaming output truncated to the last 5000 lines.
[5002/10000] 9780380810340 → not_found
[5003/10000] 9780515141400 → not_found
[5004/10000] 9780425210280 → not_found
[5005/10000] 9780894864020 → not_found
[5006/10000] 9781400069620 → not_found
[5007/10000] 9781933480090 → not_found
[5008/10000] 9780446579670 → not_found
[5009/10000] 9780152046910 → ok_openlibrary
[5010/10000] 9780743497300 → not_found
[5011/10000] 9780743426800 → ok_googlebooks
[5012/10000] None → no_isbn
[5013/10000] 9780553272120 → not_found
[5014/10000] 9780446504130 → ok_googlebooks
[5015/10000] 9780689872370 → not_found
[5016/10000] 9781439136800 → not_found
[5017/10000] 9780142428700 → not_found
[5018/10000] 9780224063980 → not_found
[5019/10000] 9781421514820 → not_found
[5020/10000] 9780142300700 → not_found
[5021/10000] 9781594487580 → not_found
[5022/10000] 9780007203120 → not_found
[5023/10000] 9781450595990 → not_found
[5024/10000] 9780679744470 → not_found
[5025/10000] 9780671449020 → not_f

In [10]:
# Re-point the module-level paths for the second pass: read the first pass's
# output and write to a new file.
# NOTE: this overwrites the values from the settings cell, so calling
# first_search() after this cell would read books_with_covers.csv instead of
# books.csv.
INPUT_CSV = "books_with_covers.csv"
OUTPUT_CSV = "books_with_more_covers.csv"

In [11]:
def get_googlebooks_cover_2(isbn=None, title=None, author=None):
    """Search Google Books by ISBN, or by title/author, and download a cover.

    Args:
        isbn: ISBN to search for; takes precedence over ``title``/``author``.
        title: Book title, used when ``isbn`` is empty.
        author: Author name, used when ``isbn`` is empty.

    Returns:
        tuple: ``(link, image_bytes)`` for the first image link of the first
        result that downloads as an image, otherwise ``(None, None)``.
        Always ``(None, None)`` when ``GOOGLE_API_KEY`` is empty or there is
        nothing to search for.
    """
    if not GOOGLE_API_KEY:
        return None, None

    if isbn:
        q = f"isbn:{isbn}"
    else:
        title = title.replace('"', '').strip() if title else ""
        author = author.replace('"', '').strip() if author else ""
        # Only emit the terms that are actually available; an empty
        # intitle:"" / inauthor:"" term would only narrow the search.
        terms = []
        if title:
            terms.append(f'intitle:"{title}"')
        if author:
            terms.append(f'inauthor:"{author}"')
        q = " ".join(terms)

    if not q:
        # Nothing to search for; do not spend a request on an empty query.
        return None, None

    # Pass the query through ``params`` so it is URL-encoded. Splicing a title
    # such as "Inca Gold (Dirk Pitt, #12)" into the URL by hand would turn
    # everything after "#" (including the API key) into a URL fragment.
    params = {"q": q, "maxResults": 1, "key": GOOGLE_API_KEY}
    try:
        r = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params=params,
            timeout=TIMEOUT,
        )
        if r.status_code == 200:
            data = r.json()
            items = data.get("items")
            if items:
                info = items[0].get("volumeInfo", {})
                image_links = info.get("imageLinks", {})
                # Try the largest available image first.
                for key in ("extraLarge", "large", "medium", "thumbnail", "smallThumbnail"):
                    link = image_links.get(key)
                    if link:
                        rr = requests.get(link, timeout=TIMEOUT)
                        if is_image_response(rr):
                            return link, rr.content
    except Exception:
        # Best effort: any failure simply means "no cover from this source".
        pass
    return None, None

In [12]:
def fetch_cover(row):
    """Second-pass lookup for one CSV row; rows already marked ``ok_*`` are kept.

    Tries Open Library by ISBN, then Google Books by ISBN, then Google Books
    by title/author. Covers are stored as ``COVERS_DIR/<isbn>.jpg`` when an
    ISBN is known, otherwise under a sanitised name built from title and
    author.

    NOTE: this shares its name with the earlier ISBN-based ``fetch_cover`` in
    this notebook and replaces it once this cell has run.

    Args:
        row: Mapping with ``isbn13``, ``title``, ``authors`` (or ``author``)
            and, optionally, ``cover_url``, ``saved_path`` and ``status``.

    Returns:
        dict: keys ``isbn13``, ``title``, ``author``, ``cover_url``,
        ``saved_path`` and ``status``.
    """
    isbn = clean_isbn13(row.get("isbn13"))
    # ``or ""`` also covers fields that are present but None (short CSV rows).
    title = row.get("title") or ""
    author = row.get("authors") or row.get("author") or ""

    result = {
        "isbn13": isbn,
        "title": title,
        "author": author,
        "cover_url": row.get("cover_url") or "",
        "saved_path": row.get("saved_path") or "",
        "status": row.get("status") or "",
    }

    # Already found in an earlier pass: nothing to do.
    if result["status"].startswith("ok_"):
        return result

    if isbn:
        # 1. Open Library by ISBN.
        url, img = get_openlibrary_cover(isbn)
        if url and img:
            return _store_cover(result, isbn, url, img, "ok_openlibrary")

        # 2. Google Books by ISBN.
        url, img = get_googlebooks_cover_2(isbn=isbn)
        if url and img:
            return _store_cover(result, isbn, url, img, "ok_google_isbn")

    # 3. Google Books by title and author (only if there is something to search).
    if title or author:
        url, img = get_googlebooks_cover_2(title=title, author=author)
        if url and img:
            # Prefer the ISBN as file name. Naming by the first three title
            # words made different books overwrite each other and broke on
            # titles containing path separators.
            stem = isbn or _cover_file_stem(f"{title} {author}")
            return _store_cover(result, stem, url, img, "ok_google_text")

    result["status"] = "not_found"
    return result


def _cover_file_stem(text, max_len=80):
    """Turn free text into a filesystem-safe file stem (alphanumerics and '_')."""
    cleaned = "".join(ch if ch.isalnum() else " " for ch in text)
    return "_".join(cleaned.split())[:max_len] or "cover"


def _store_cover(result, stem, url, img, status):
    """Write ``img`` to ``COVERS_DIR/<stem>.jpg`` and record it in ``result``."""
    path = os.path.join(COVERS_DIR, f"{stem}.jpg")
    with open(path, "wb") as f:
        f.write(img)
    result.update({"cover_url": url, "saved_path": path, "status": status})
    return result

In [13]:
def second_search():
    """Run the second pass over ``INPUT_CSV`` and write ``OUTPUT_CSV``.

    Every row is handed to ``fetch_cover(row)`` on a thread pool. The output
    contains only the keys that ``fetch_cover`` returns, in input order.
    """
    os.makedirs(COVERS_DIR, exist_ok=True)

    with open(INPUT_CSV, newline='', encoding='utf-8') as fin:
        rows = list(csv.DictReader(fin))

    if not rows:
        # Same guard as the first pass; without it results[0] below raises.
        print("⚠️ Входной CSV пуст.")
        return

    total = len(rows)
    # Pre-sized so that results land at their input position, not in
    # thread-completion order.
    results = [None] * total
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(fetch_cover, row): idx for idx, row in enumerate(rows)}
        for done, fut in enumerate(as_completed(futures), start=1):
            idx = futures[fut]
            row = rows[idx]
            try:
                res = fut.result()
            except Exception as exc:
                # Keep the row, marked as failed, instead of aborting the run
                # and losing every result collected so far.
                print(f"⚠️ Ошибка для {row.get('isbn13')}: {exc!r}")
                res = {
                    "isbn13": row.get("isbn13"),
                    "title": row.get("title") or "",
                    "author": row.get("authors") or row.get("author") or "",
                    "cover_url": row.get("cover_url") or "",
                    "saved_path": row.get("saved_path") or "",
                    "status": "error",
                }
            results[idx] = res
            print(f"[{done}/{total}] {(res['title'] or '')[:40]} → {res['status']}")

    with open(OUTPUT_CSV, "w", newline='', encoding='utf-8') as fout:
        writer = csv.DictWriter(fout, fieldnames=list(results[0].keys()))
        writer.writeheader()
        writer.writerows(results)

    print(f"✅ Готово! Результаты сохранены в {OUTPUT_CSV}")

In [14]:
# Run the second pass over the first pass's output (INPUT_CSV and OUTPUT_CSV
# were re-pointed in an earlier cell).
second_search()

Streaming output truncated to the last 5000 lines.
[5002/10000] Inca Gold (Dirk Pitt, #12) → ok_googlebooks
[5003/10000] Own the Wind (Chaos, #1) → not_found
[5004/10000] Feeling Good: The New Mood Therapy → not_found
[5005/10000] NurtureShock: New Thinking About Childre → ok_googlebooks
[5006/10000] Predator (Kay Scarpetta, #14) → not_found
[5007/10000] Blue Smoke → not_found
[5008/10000] Delicious! → not_found
[5009/10000] Codependent No More: How to Stop Control → not_found
[5010/10000] Oblomov → not_found
[5011/10000] Wild Fire (John Corey, #4) → not_found
[5012/10000] Two Little Girls in Blue → not_found
[5013/10000] Daughter of the Empire (The Empire Trilo → not_found
[5014/10000] Everlost (Skinjacker, #1) → not_found
[5015/10000] Dreams of a Dark Warrior (Immortals Afte → not_found
[5016/10000] Theodore Boone: Kid Lawyer (Theodore Boo → not_found
[5017/10000] A Year Down Yonder (A Long Way from Chic → not_found
[5018/10000] High School Debut, Vol. 01 (High School  