In [None]:
import pdfplumber
import os
from tqdm import tqdm
import re
import time
import pandas as pd
from datetime import datetime, timedelta

In [None]:
from dotenv import load_dotenv

load_dotenv("../var.env")

In [None]:
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("pdfplumber").setLevel(logging.ERROR)

In [None]:
# Compiled once: parse_single_entry is called for every entry of every PDF.
_ENTRY_RANK_RE = re.compile(r"(\d+)")
_ENTRY_TITLE_RE = re.compile(r"\d+\s+(.*?), by")
# Author immediately followed by the parenthesised publisher. Anchoring on the
# "(" lets periods inside the name (initials such as "J. K.") be kept.
_ENTRY_AUTHOR_RE = re.compile(r"by (.+?)\.\s*\(")
# Fallback for entries without a publisher parenthesis: stop at the first period.
_ENTRY_AUTHOR_FALLBACK_RE = re.compile(r"by (.*?)\.")
_ENTRY_PUBLISHER_RE = re.compile(r"\((.*?)\)")
# "<last week rank or --> <weeks on list>" pair.
_ENTRY_RANK_PAIR_RE = re.compile(r"(?:(\d{1,2})|--)\s+(\d{1,2})")


def parse_single_entry(text, category, date):
    """
    Extract the details of one book from the text of a single list entry.

    Args:
        text: The entry text; expected to start with the book's rank,
            followed by "<title>, by <author>. (<publisher>) ...".
        category: Category label stored unchanged in the result.
        date: Date value stored unchanged in the result.

    Returns:
        A dict with the keys "date", "category", "rank", "title", "author",
        "publisher", "last_week_rank" and "weeks_on_list". A field that
        cannot be matched is None. Returns None (after printing a message)
        if parsing raises, e.g. when ``text`` is not a string.
    """
    try:
        rank_match = _ENTRY_RANK_RE.match(text)
        rank = int(rank_match.group(1)) if rank_match else None

        title_match = _ENTRY_TITLE_RE.search(text)
        title = title_match.group(1).title() if title_match else None

        # Prefer the publisher-anchored pattern so "by J. K. Rowling. (..."
        # yields the full name instead of being cut at the first initial.
        author_match = (
            _ENTRY_AUTHOR_RE.search(text)
            or _ENTRY_AUTHOR_FALLBACK_RE.search(text)
        )
        author = author_match.group(1).strip() if author_match else None

        pub_match = _ENTRY_PUBLISHER_RE.search(text)
        publisher = pub_match.group(1).strip() if pub_match else None

        # Only the first pair found in the text is used. Group 1 is None when
        # the "--" alternative matched (no rank last week).
        pair_match = _ENTRY_RANK_PAIR_RE.search(text)
        if pair_match:
            last_week = pair_match.group(1)
            last_week_rank = int(last_week) if last_week else None
            weeks_on_list = int(pair_match.group(2))
        else:
            last_week_rank = None
            weeks_on_list = None

        return {
            "date": date,
            "category": category,
            "rank": rank,
            "title": title,
            "author": author,
            "publisher": publisher,
            "last_week_rank": last_week_rank,
            "weeks_on_list": weeks_on_list
        }

    except Exception as e:
        # Best-effort parsing: callers treat None as "skip this entry".
        print(f"Failed to parse entry: {e}")
        return None

In [None]:
def parse_bestseller_pdf(filepath):
    """
    Parse every list entry contained in one PDF file.

    The text of each page is read line by line (the first two lines of every
    page are skipped). A line matching "Week <name> Week On List" switches
    the current category; a line starting with a one- or two-digit number
    starts a new entry; any other line is appended to the entry being
    collected. Each collected entry is handed to ``parse_single_entry``.

    Args:
        filepath: Path of the PDF. The text between "s_" and the next "."
            in the path is stored as each entry's date (None if absent).

    Returns:
        A list of the dicts produced by ``parse_single_entry``; entries that
        failed to parse are left out.
    """
    header_re = re.compile(r"Week (.*?) Week On List$", re.IGNORECASE)
    entry_start_re = re.compile(r"^\d{1,2}\s")

    buffer = []
    entries = []

    date_match = re.search(r"s_(.*?)\.", filepath)
    date = date_match.group(1) if date_match else None
    category = None

    def flush_buffer():
        # Parse whatever has been collected under the category that was
        # active while it was collected, then start a fresh buffer.
        if buffer:
            parsed = parse_single_entry(" ".join(buffer), category, date)
            if parsed:
                entries.append(parsed)
            buffer.clear()

    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:

            try:
                # extract_text() may yield None for a page without text.
                text = page.extract_text() or ""
                lines = text.split("\n")[2:]

                for line in lines:
                    line = line.strip()

                    # Keep the match in its own variable: assigning it to
                    # `category` would reset the category to None on every
                    # line that is not a header.
                    header_match = header_re.match(line)
                    if header_match:
                        # Finish the previous section's last entry before
                        # the category changes.
                        flush_buffer()
                        category = header_match.group(1).strip().title()
                        continue

                    if entry_start_re.match(line):
                        flush_buffer()

                    buffer.append(line)
            except Exception as e:
                print(f"Skipping bad page in {filepath}: {e}")
                continue

        # The last entry of the file has no following rank line to trigger it.
        flush_buffer()

    return entries

In [None]:
def parse_pdf(filepath):
    """
    Parse one PDF via ``parse_bestseller_pdf`` without ever raising.

    Any exception is reported with a printed "Failed: ..." message and an
    empty list is returned, so one bad file does not stop a bulk run.
    """
    try:
        parsed_entries = parse_bestseller_pdf(filepath)
    except Exception as error:
        print(f"Failed: {error}")
        parsed_entries = []
    return parsed_entries



from concurrent.futures import ThreadPoolExecutor

pdf_folder = "../pdfs"
# os.listdir returns names in arbitrary order; sorting makes the order of
# all_entries (and of the CSV rows written from it) reproducible between runs.
filepaths = [
    os.path.join(pdf_folder, f)
    for f in sorted(os.listdir(pdf_folder))
    if f.endswith(".pdf")
]

all_entries = []

# executor.map yields results in the same order as filepaths, so entries are
# accumulated file by file even though parsing runs on four worker threads.
with ThreadPoolExecutor(max_workers=4) as executor:
    for result in tqdm(executor.map(parse_pdf, filepaths), total=len(filepaths)):
        if result:
            all_entries.extend(result)

In [None]:
# Build one DataFrame row per parsed entry (None placeholders are skipped)
# and save an unprocessed snapshot. to_csv is called with its defaults, so
# the DataFrame index is written as the first CSV column.
books_data = pd.DataFrame([e for e in all_entries if e is not None])
books_data.to_csv("../data/raw/nyt_bestsellers_data.csv")

In [None]:
def get_season(date):
    """
    Map the month of ``date`` to a season name.

    December-February is "Winter", March-May "Spring", June-August "Summer"
    and September-November "Autumn". Returns None when ``date.month`` is not
    one of the twelve month numbers.
    """
    season_by_month = {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Autumn", 10: "Autumn", 11: "Autumn",
    }
    return season_by_month.get(date.month)

In [None]:
# Remove, in place, rows in which every column is missing.
books_data.dropna(how="all", inplace=True)

In [None]:
# Convert the "date" column to pandas datetimes; get_season reads .month from these values.
books_data["date"] = pd.to_datetime(books_data["date"])

In [None]:
# Add a "season" column by applying get_season to every value of "date".
books_data["season"] = books_data["date"].apply(get_season)

In [None]:
# Save the frame, now including the "season" column, to a second CSV file
# (default to_csv options, so the index is written as well).
books_data.to_csv("../data/raw/nyt_bestsellers.csv")

In [None]:
def book_detail_generator(batch):
    """
    Query the Google Books API for public information about each book and
    store it on the entry.

    Every entry dict in ``batch`` is updated in place with the keys
    "maturityRating", "description" and "categories". They hold the values
    of the first search result, or None when the lookup fails, returns no
    result, or the entry has no title to search for.

    Relies on the module-level ``requests`` import and ``books_key`` value.

    Args:
        batch: Iterable of entry dicts with at least a "title" key and
            optionally an "author" key. Falsy items (e.g. None) are skipped.

    Returns:
        None. The entries are modified in place.
    """
    base_url = "https://www.googleapis.com/books/v1/volumes"
    detail_fields = ("maturityRating", "description", "categories")

    for entry in tqdm(batch):
        if not entry:
            continue

        # Give every entry the same keys so that all batches produce the same
        # columns; values from an earlier run are kept.
        for field in detail_fields:
            entry.setdefault(field, None)

        title = entry.get("title")
        if not title:
            # Without a title the query would search for the text "None".
            continue

        query = f"intitle:{title}"
        author = entry.get("author")
        if author:
            # A space here is sent as "+" once requests encodes the query.
            query += f" inauthor:{author}"

        # Passing params lets requests URL-encode titles containing spaces,
        # "&", "#" and similar characters instead of corrupting the URL.
        params = {"q": query, "maxResults": 1, "key": books_key}

        # Reset per entry so a failed request can never reuse the previous
        # book's response.
        data = None
        try:
            response = requests.get(base_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as error:
            print(f"Failed to get book info: {error}")

        if isinstance(data, dict) and data.get("items"):
            try:
                volume_info = data["items"][0]["volumeInfo"]
                for field in detail_fields:
                    entry[field] = volume_info.get(field)
            except (AttributeError, IndexError, KeyError, TypeError) as error:
                print(f"Unexpected book info format: {error}")

        # Pause after every request, successful or not, to stay within the
        # API's rate limits.
        time.sleep(2)
        

In [None]:
import requests
from more_itertools import chunked
books_key = os.getenv("BOOKS_KEY")

# Single output path for both the initial write and the appends.
detailed_csv_path = "../data/raw/nyt_bestsellers_detailed.csv"

# Fixed column order: the keys returned by parse_single_entry followed by the
# keys added by book_detail_generator. Appended batches are written without a
# header, so every batch must have exactly these columns in this order.
detailed_columns = [
    "date",
    "category",
    "rank",
    "title",
    "author",
    "publisher",
    "last_week_rank",
    "weeks_on_list",
    "maturityRating",
    "description",
    "categories",
]

# Must start as True so the first batch overwrites the file and writes the
# header; later batches are appended below it.
first_batch = True
batches = list(chunked(all_entries, 1000))
for batch in batches:
    book_detail_generator(batch)
    batch_data = pd.DataFrame([e for e in batch if e is not None])
    batch_data = batch_data.reindex(columns=detailed_columns)
    if first_batch:
        batch_data.to_csv(detailed_csv_path, mode="w", index=False)
        first_batch = False
    else:
        batch_data.to_csv(detailed_csv_path, mode="a", index=False, header=False)