In [21]:
import csv
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import pandas as pd

In [10]:
# === НАСТРОЙКИ ===
INPUT_CSV = "books.csv"             # исходный файл, должен содержать колонку isbn13
OUTPUT_CSV = "books_with_covers.csv"
COVERS_DIR = "covers"
MAX_WORKERS = 10                    # количество потоков
SLEEP_BETWEEN = 0.05                # пауза между запросами
GOOGLE_API_KEY = ""                 # можно оставить пустым (тогда без ключа)
TIMEOUT = 10                        # таймаут для запросов


In [11]:
def is_image_response(resp):
    """Проверяем, что это картинка, а не HTML"""
    return resp.status_code == 200 and resp.headers.get("Content-Type", "").startswith("image/")

In [12]:
def get_openlibrary_cover(isbn13):
    """Попытка получить обложку из Open Library"""
    url = f"https://covers.openlibrary.org/b/isbn/{isbn13}-L.jpg"
    try:
        r = requests.get(url, timeout=TIMEOUT)
        if is_image_response(r):
            return url, r.content
    except Exception:
        pass
    return None, None

In [13]:
def get_googlebooks_cover(isbn13):
    """Попытка получить обложку из Google Books"""
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn13}"
    if GOOGLE_API_KEY:
        url += f"&key={GOOGLE_API_KEY}"

    try:
        r = requests.get(url, timeout=TIMEOUT)
        if r.status_code == 200:
            data = r.json()
            items = data.get("items")
            if items:
                info = items[0].get("volumeInfo", {})
                image_links = info.get("imageLinks", {})
                for key in ("extraLarge", "large", "medium", "thumbnail", "smallThumbnail"):
                    link = image_links.get(key)
                    if link:
                        rr = requests.get(link, timeout=TIMEOUT)
                        if is_image_response(rr):
                            return link, rr.content
    except Exception:
        pass
    return None, None

In [17]:
def clean_isbn13(value):
    """
    Исправляет ISBN, записанный как float (9.78042517693e+12) → '9780425176930'
    """
    if not value:
        return None
    s = str(value).strip()
    # если значение типа 9.78042517693e+12 — конвертируем в int, потом в строку
    if "e" in s or "E" in s:
        try:
            s = f"{int(float(s))}"
        except Exception:
            pass
    # убираем точки, пробелы, дефисы
    s = s.replace(".", "").replace("-", "").replace(" ", "")
    # если длина больше 13 — обрезаем (на всякий случай)
    if len(s) > 13:
        s = s[:13]
    return s if len(s) >= 10 else None

In [18]:
def fetch_cover(isbn13):
    """Пробуем получить обложку по ISBN13"""
    isbn13 = clean_isbn13(isbn13)

    result = {
        "isbn13": isbn13,
        "cover_url": "",
        "saved_path": "",
        "status": "not_found"
    }

    if not isbn13:
        result["status"] = "no_isbn"
        return result

    # 1) OpenLibrary
    url, img = get_openlibrary_cover(isbn13)
    if url and img:
        path = os.path.join(COVERS_DIR, f"{isbn13}.jpg")
        with open(path, "wb") as f:
            f.write(img)
        result.update({"cover_url": url, "saved_path": path, "status": "ok_openlibrary"})
        return result

    time.sleep(SLEEP_BETWEEN)

    # 2) Google Books
    url, img = get_googlebooks_cover(isbn13)
    if url and img:
        path = os.path.join(COVERS_DIR, f"{isbn13}.jpg")
        with open(path, "wb") as f:
            f.write(img)
        result.update({"cover_url": url, "saved_path": path, "status": "ok_googlebooks"})
        return result

    return result

In [19]:
def main():
    os.makedirs(COVERS_DIR, exist_ok=True)

    with open(INPUT_CSV, newline='', encoding='utf-8') as fin:
        reader = csv.DictReader(fin)
        rows = list(reader)

    if not rows:
        print("⚠️ Входной CSV пуст.")
        return

    if "isbn13" not in rows[0]:
        print("❌ В CSV нет колонки 'isbn13'.")
        return

    results = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(fetch_cover, row["isbn13"]): row for row in rows}
        for i, fut in enumerate(as_completed(futures), start=1):
            res = fut.result()
            row = futures[fut]
            new_row = row.copy()
            new_row.update(res)
            results.append(new_row)
            print(f"[{i}/{len(rows)}] {res['isbn13']} → {res['status']}")

    # сохраняем результат
    fieldnames = list(results[0].keys())
    with open(OUTPUT_CSV, "w", newline='', encoding='utf-8') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print("✅ Готово! Результаты сохранены в", OUTPUT_CSV)

In [20]:
if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[5002/10000] 9780307118390 → not_found
[5003/10000] 9780060002500 → not_found
[5004/10000] 9789953716890 → not_found
[5005/10000] 9780380810340 → not_found
[5006/10000] 9780894864020 → not_found
[5007/10000] 9780743426800 → not_found
[5008/10000] 9780425210280 → not_found
[5009/10000] 9780515141400 → not_found
[5010/10000] 9781933480090 → not_found
[5011/10000] 9780446504130 → not_found
[5012/10000] None → no_isbn
[5013/10000] 9780743497300 → not_found
[5014/10000] 9780553272120 → not_found
[5015/10000] 9780446579670 → not_found
[5016/10000] 9781400069620 → ok_openlibrary
[5017/10000] 9780689872370 → not_found
[5018/10000] 9780142428700 → not_found
[5019/10000] 9781439136800 → not_found
[5020/10000] 9780224063980 → not_found
[5021/10000] 9780142300700 → not_found
[5022/10000] 9788423648990 → not_found
[5023/10000] 9781594487580 → not_found
[5024/10000] 9780679744470 → not_found
[5025/10000] 9780060894080 → not_found
[5026

In [32]:
books = pd.read_csv("books_with_covers.csv")
books

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,cover_url,saved_path,status
0,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,,,not_found
1,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,,,not_found
2,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,,,not_found
3,8,5107,5107,3036731,360,316769177,9.780317e+12,J.D. Salinger,1951.0,The Catcher in the Rye,...,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...,,,not_found
4,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,,,not_found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,15613,15613,2764239,199,1416523723,9.781417e+12,Herman Melville,1924.0,"Billy Budd, Sailor",...,1478,2225,3805,2985,1617,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,,,not_found
9996,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,,,not_found
9997,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...,,,not_found
9998,9994,4769651,4769651,4834466,2,810983559,9.780811e+12,"Michael Buckley, Peter Ferguson",2009.0,"The Everafter War (The Sisters Grimm, #7)",...,110,289,1745,3989,6832,https://images.gr-assets.com/books/1388278230m...,https://images.gr-assets.com/books/1388278230s...,,,not_found


In [33]:
cond = books["status"] == 'ok_googlebooks'

In [35]:
books['saved_path'].count()

np.int64(629)