# Imports

In [None]:
from neo4j import GraphDatabase
import os
import random
import pymupdf
import pymupdf4llm
from tqdm import tqdm
import weaviate
from helpers.md_processing import create_directories, extract_header_info_wev, process_page_images
from helpers.download_utils import advanced_pdf_download
import shutil

# Credentials

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to live in the notebook; the
# defaults reproduce the original local development setup.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password1234")

# Clients shared by every cell below.
weaviate_client = weaviate.connect_to_local(skip_init_checks=True)
driver = GraphDatabase.driver(uri, auth=(username, password))

# Get the unchunked datasheets

In [None]:
def get_datasheet_manufacturer_dict(driver):
    """Return the datasheets whose HAS_DATASHEET relationship is "Unprocessed".

    Despite the name, the result is a list, not a dict.

    Args:
        driver: Neo4j driver used to open a read session.

    Returns:
        A list of ``(datasheet_name, manufacturers, products, category_name)``
        tuples, one per record returned by the query. ``manufacturers`` and
        ``products`` are the distinct names collected per datasheet/category.
    """
    query = """
    MATCH (c:Category)<-[:BELONGS_TO]-(p:Product)
    MATCH (p:Product)-[:MANUFACTURED_BY]->(m:Manufacturer)
    MATCH (p)-[r:HAS_DATASHEET]->(d:Datasheet)
    WHERE r.state = "Unprocessed"
    WITH d, c, collect(distinct m.name) as manufacturers, collect(distinct p.name) as products
    RETURN d.name AS datasheet, c.name as category, manufacturers, products
    """

    def _fetch_rows(tx):
        # Materialise the records inside the transaction.
        return tx.run(query).data()

    with driver.session() as session:
        rows = session.execute_read(_fetch_rows)

    # Reorder each record's fields into the tuple layout callers unpack.
    return [
        (row["datasheet"], row["manufacturers"], row["products"], row["category"])
        for row in rows
    ]

def mark_datasheet_as_(driver, datasheet_url, status):
    """Set the state of a datasheet's HAS_DATASHEET relationships.

    Relationships already in the "Undownloadable" state are excluded by the
    query and therefore keep that state.

    Args:
        driver: Neo4j driver used to open a write session.
        datasheet_url: Value matched against ``Datasheet.name``.
        status: New value written to ``r.state``.
    """
    query = """
    MATCH (p:Product)-[r:HAS_DATASHEET]->(d:Datasheet)
    WHERE d.name = $datasheet_url AND r.state<>"Undownloadable"
    SET r.state = $status
    """
    params = {"datasheet_url": datasheet_url, "status": status}

    def _apply(tx):
        return tx.run(query, **params)

    with driver.session() as session:
        session.execute_write(_apply)
    print(f"Marked datasheet {datasheet_url} as {status}.")

# Load every unprocessed datasheet and preview the first five entries.
datasheet_manufacturer = get_datasheet_manufacturer_dict(driver)
print(datasheet_manufacturer[:5])

### You can generate a small test sample

In [None]:
n = 20
# Sample over every index, including 0 (the original range started at 1 and
# could never pick the first entry), and never request more items than exist,
# which would make random.sample raise ValueError.
li = random.sample(
    range(len(datasheet_manufacturer)),
    min(n, len(datasheet_manufacturer)),
)

print(li)

# Keep only the sampled entries that actually carry a datasheet URL.
simple_list = [
    datasheet_manufacturer[ind]
    for ind in li
    if datasheet_manufacturer[ind][0] is not None
]
datasheet_manufacturer = simple_list

datasheet_manufacturer

In [None]:
# Shared list passed into process_datasheets_in_batches, which forwards it to
# extract_header_info_wev and process_page_images; starts empty.
items = []

def _cleanup_artifacts(pdf_filename, base_dir=None):
    """Delete the temporary PDF and, if given, its working directory; never raises."""
    try:
        if os.path.exists(pdf_filename):
            os.remove(pdf_filename)
    except OSError as e:
        print(f"Could not remove {pdf_filename}: {e}")
    if base_dir is not None:
        shutil.rmtree(base_dir, ignore_errors=True)


def _chunk_downloaded_pdf(ind, data, pdf_filename, driver, weaviate_client, items):
    """Process one downloaded PDF and record "Chunked" or "ProcessingError" in Neo4j."""
    datasheet_url, manufacturers, products, _category = data
    base_dir = f"./data_{ind}"
    doc = None
    try:
        doc = pymupdf.open(pdf_filename)
        md_text = pymupdf4llm.to_markdown(pdf_filename)
        num_pages = len(doc)
        create_directories(base_dir)

        extract_header_info_wev(
            md_text,
            base_dir,
            pdf_filename,
            items,
            manufacturers,
            products,
            weaviate_client,
            datasheet_url=datasheet_url,
        )

        for page_num in tqdm(range(num_pages), desc=f"Processing PDF {ind} pages"):
            process_page_images(
                doc[page_num], page_num, base_dir, items,
                save_to_storage=True,
                data=data,
            )

        mark_datasheet_as_(driver, datasheet_url, "Chunked")
    except Exception as e:
        # Per-datasheet boundary: one bad PDF must not abort the whole run.
        print(f"Error processing PDF {ind}: {str(e)}")
        mark_datasheet_as_(driver, datasheet_url, "ProcessingError")
    finally:
        # Cleanup runs on success and failure alike, and outside the try body,
        # so a cleanup problem can no longer overwrite a "Chunked" state with
        # "ProcessingError" or leave a stale PDF behind for a later run.
        if doc is not None:
            doc.close()
        _cleanup_artifacts(pdf_filename, base_dir)


def process_datasheets_in_batches(datasheet_manufacturer, driver, weaviate_client, items, batch_size=10):
    """Download and process datasheets in fixed-size batches.

    For each batch, every datasheet is first downloaded to ``<index>.pdf``;
    the successfully downloaded files are then processed one by one.
    Outcomes are written to Neo4j through ``mark_datasheet_as_`` as
    "Undownloadable", "Chunked" or "ProcessingError".

    Args:
        datasheet_manufacturer: Sequence of
            ``(datasheet_url, manufacturers, products, category)`` tuples.
        driver: Neo4j driver used to update datasheet states.
        weaviate_client: Client forwarded to ``extract_header_info_wev``.
        items: List forwarded to the extraction helpers.
        batch_size: Number of datasheets per batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently process nothing.
        raise ValueError("batch_size must be a positive integer")

    total_datasheets = len(datasheet_manufacturer)

    for batch_start in range(0, total_datasheets, batch_size):
        batch_end = min(batch_start + batch_size, total_datasheets)
        batch = datasheet_manufacturer[batch_start:batch_end]
        batch_number = batch_start // batch_size + 1

        print(f"Processing batch {batch_number} ({batch_start}-{batch_end-1} of {total_datasheets})")

        # Download PDFs for this batch, remembering which ones succeeded so
        # that a leftover file from an earlier run is never mistaken for a
        # fresh download.
        downloaded = []
        for ind, data in enumerate(batch, start=batch_start):
            datasheet_url = data[0]
            pdf_filename = f"{ind}.pdf"

            if datasheet_url is None:
                # Datasheet nodes without a name cannot be downloaded or
                # matched by mark_datasheet_as_, so just skip them.
                print(f"Skipping entry {ind}: datasheet has no URL")
                continue

            initial_url = datasheet_url
            if initial_url.startswith('//'):
                # Protocol-relative URL: assume HTTPS.
                initial_url = "https:" + initial_url

            success = advanced_pdf_download(initial_url, pdf_filename)

            if not success:
                print(f"Unable to download datasheet from URL: {initial_url}")
                mark_datasheet_as_(driver, datasheet_url, "Undownloadable")
                # Drop any partial or stale file so it is not processed later.
                _cleanup_artifacts(pdf_filename)
                continue

            downloaded.append((ind, data, pdf_filename))

        # Process the PDFs that were successfully downloaded
        for ind, data, pdf_filename in downloaded:
            if not os.path.exists(pdf_filename):
                continue
            _chunk_downloaded_pdf(ind, data, pdf_filename, driver, weaviate_client, items)

        print(f"Completed batch {batch_number}")

# Release both database connections even if processing fails midway; the
# Weaviate client was previously never closed.
try:
    process_datasheets_in_batches(datasheet_manufacturer, driver, weaviate_client, items, batch_size=2)
finally:
    driver.close()
    weaviate_client.close()