# Imports

In [1]:
from neo4j import GraphDatabase
import os
import random
import pymupdf
import pymupdf4llm
from tqdm import tqdm
import weaviate
from helpers.md_processing import create_directories, extract_header_info_wev, process_page_images
from helpers.download_utils import advanced_pdf_download
import shutil

# Credentials

In [2]:
# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be committed with the notebook;
# the fallbacks are the local development defaults this notebook used before.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password1234")

# skip_init_checks=True is forwarded to the Weaviate client as-is
# (presumably it skips the client's startup checks -- confirm against the client docs).
weaviate_client = weaviate.connect_to_local(skip_init_checks=True)
driver = GraphDatabase.driver(uri, auth=(username, password))

# Get the unchunked datasheets

In [3]:
def get_datasheet_manufacturer_dict(driver):
    """Return the datasheets whose HAS_DATASHEET relationship is still "Unprocessed".

    The Cypher query groups by datasheet and category, so the same datasheet
    URL can appear more than once when its products belong to several
    categories.

    Args:
        driver: Object exposing ``session()``; the session must support
            ``execute_read`` (a Neo4j driver in this notebook).

    Returns:
        A list (despite the function name) of
        ``(datasheet_url, manufacturers, products, category_name)`` tuples,
        one per row returned by the query.
    """
    query = """
    MATCH (c:Category)<-[:BELONGS_TO]-(p:Product)
    MATCH (p:Product)-[:MANUFACTURED_BY]->(m:Manufacturer)
    MATCH (p)-[r:HAS_DATASHEET]->(d:Datasheet)
    WHERE r.state = "Unprocessed"
    WITH d, c, collect(distinct m.name) as manufacturers, collect(distinct p.name) as products
    RETURN d.name AS datasheet, c.name as category, manufacturers, products
    """

    def _fetch_rows(tx):
        # .data() materialises every record as a dict inside the transaction.
        return tx.run(query).data()

    with driver.session() as session:
        rows = session.execute_read(_fetch_rows)
        entries = [
            (row["datasheet"], row["manufacturers"], row["products"], row["category"])
            for row in rows
        ]
    return entries

def mark_datasheet_as_(driver, datasheet_url, status):
    """Set the ``state`` of every HAS_DATASHEET relationship pointing at a datasheet.

    Relationships whose state is already "Undownloadable" are excluded by the
    query's WHERE clause and therefore keep that state.

    Args:
        driver: Object exposing ``session()``; the session must support
            ``execute_write`` (a Neo4j driver in this notebook).
        datasheet_url: Value matched against ``Datasheet.name``.
        status: New value written to ``r.state``.
    """
    query = """
    MATCH (p:Product)-[r:HAS_DATASHEET]->(d:Datasheet)
    WHERE d.name = $datasheet_url AND r.state<>"Undownloadable"
    SET r.state = $status
    """
    params = {"datasheet_url": datasheet_url, "status": status}

    def _apply_state(tx):
        return tx.run(query, **params)

    with driver.session() as session:
        session.execute_write(_apply_state)
    print(f"Marked datasheet {datasheet_url} as {status}.")

# Load the full work list once, then preview the first five entries as a sanity check.
datasheet_manufacturer = get_datasheet_manufacturer_dict(driver)
print(datasheet_manufacturer[:5])

[('https://rocelec.widen.net/view/pdf/0e4apuiify/INTLS03195-1.pdf?t.download=true&u=5oefqw', ['Altera'], ['N28F256A150'], 'Memory'), ('https://www.gigadevice.com.cn/Public/Uploads/uploadfile/files/20220714/DS-00353-GD25D05C-Rev1.6.pdf', ['GigaDevice Semiconductor (HK) Limited'], ['GD25D10CTIG', 'GD25D10CKIGR', 'GD25D10CEIGR', 'GD25D10CTIGR', 'GD25D05CTIG', 'GD25D05CEIGR', 'GD25D05CKIGR', 'GD25D05CTIGR'], 'Memory'), ('https://www.byte-semi.com/download/SPI_NOR_Flash/BY25D05AS.pdf', ['BYTe Semiconductor'], ['BY25D05ASOIG(R)', 'BY25D05ASTIG(R)', 'BY25D05ASOIG(T)', 'BY25D05ASMIG(R)', 'BY25D05ASTIG(T)'], 'Memory'), ('https://mm.digikey.com/Volume0/opasdata/d220001/medias/docus/5050/2023__Gde.pdf', ['GigaDevice Semiconductor (HK) Limited'], ['GD25VE20CTIG', 'GD25VE20CSIG', 'GD25VQ20CTIG', 'GD25VQ20CSIG', 'GD25LQ20CTIG', 'GD25Q20ETIGR', 'GD25Q20ESIGR', 'GD25WD10CEIGR', 'GD25WD10CTIGR', 'GD25LQ10CEIGR', 'GD25LQ10CTIGR', 'GD25LD10CEIGR', 'GD25LD10CTIGR', 'GD25WD05CTIG', 'GD25WD05CTIGR', 'GD25WD

### Optional: generate a small random test sample (this replaces `datasheet_manufacturer` with the sample)

In [4]:
n = 20
# Sample indices without replacement. The range starts at 0 so the first entry
# can be picked too, and the sample size is capped at the list length because
# random.sample raises ValueError when asked for more items than exist
# (which also happens when this cell is re-run on the already-reduced list).
li = random.sample(range(len(datasheet_manufacturer)), min(n, len(datasheet_manufacturer)))

print(li)

# Keep only the sampled entries that actually carry a datasheet URL.
simple_list = list()
for ind in li:
    data = datasheet_manufacturer[ind]
    if data[0] is not None:
        simple_list.append(data)
# NOTE: this overwrites the full work list; re-run the query cell to restore it.
datasheet_manufacturer = simple_list

datasheet_manufacturer

[9, 17, 18, 1, 24, 14, 10, 2, 6, 8, 4, 21, 23, 20, 12, 7, 25, 15, 5, 13]


[('https://www.analog.com/media/en/technical-documentation/data-sheets/LTC2875.pdf',
  ['Analog Devices Inc.'],
  ['LTC2875MPDD#TRPBF',
   'LTC2875MPDD#PBF',
   'LTC2875HDD#PBF',
   'LTC2875HDD#TRPBF',
   'LTC2875IDD#TRPBF',
   'LTC2875IDD#PBF'],
  'Transceivers'),
 ('https://static.3peak.com/res/doc/ds/Datasheet_TPT1042-TPT1042V.pdf',
  ['3PEAK'],
  ['TPT1042V-DF6R-S', 'TPT1042-DF6R-S'],
  'Transceivers'),
 ('https://static.3peak.com/res/doc/ds/Datasheet_TPT1042-TPT1042V.pdf',
  ['3PEAK'],
  ['TPT1042-DF6R-S'],
  'Drivers, Receivers, Transceivers'),
 ('https://www.gigadevice.com.cn/Public/Uploads/uploadfile/files/20230926/DS-00803-GD25D20E-Rev1.0.pdf',
  ['GigaDevice Semiconductor (HK) Limited'],
  ['GD25D20EKIGR', 'GD25D20ETIGR', 'GD25D20EEIGR'],
  'Memory'),
 ('https://static.3peak.com/res/doc/ds/Datasheet_TPT1029Q.pdf',
  ['3PEAK'],
  ['TPT1029Q-DFCR-S'],
  'Transceivers'),
 ('https://www.analog.com/media/en/technical-documentation/data-sheets/28767fa.pdf',
  ['Analog Devices Inc.'

In [4]:
items = []


def _download_batch(batch, batch_start, driver):
    """Download every datasheet of one batch to ``<index>.pdf`` in the working directory.

    Returns:
        A list of ``(index, entry, pdf_filename)`` triples for the downloads
        that ``advanced_pdf_download`` reported as successful. Only these are
        processed afterwards, so a leftover ``<index>.pdf`` from an earlier,
        interrupted run can never be ingested under the wrong datasheet URL.
    """
    downloaded = []
    for ind, data in enumerate(batch, start=batch_start):
        datasheet_url = data[0]
        pdf_filename = f"{str(ind)}.pdf"

        if not datasheet_url:
            # Nothing to fetch, and no URL to match a relationship on either.
            print(f"Skipping entry {ind}: no datasheet URL.")
            continue

        # Protocol-relative URLs ("//host/path") need an explicit scheme.
        initial_url = datasheet_url
        if initial_url.startswith('//'):
            initial_url = "https:" + initial_url

        success = advanced_pdf_download(initial_url, pdf_filename)

        if not success:
            print(f"Unable to download datasheet from URL: {initial_url}")
            # Drop any stale or partial file so it cannot be mistaken for a
            # valid download later on.
            if os.path.exists(pdf_filename):
                os.remove(pdf_filename)
            mark_datasheet_as_(driver, datasheet_url, "Undownloadable")
            continue

        downloaded.append((ind, data, pdf_filename))
    return downloaded


def _chunk_pdf(ind, data, pdf_filename, driver, weaviate_client, items):
    """Run the header and per-page image steps on one PDF, then mark it "Chunked".

    The document handle, the downloaded PDF and the scratch directory are
    released in ``finally`` so that a failure or an interrupt does not leave
    them behind.
    """
    datasheet_url, manufacturers, products, _category = data
    base_dir = f"./data_{str(ind)}"
    doc = None
    try:
        doc = pymupdf.open(pdf_filename)
        md_text = pymupdf4llm.to_markdown(pdf_filename)
        num_pages = len(doc)
        create_directories(base_dir)

        extract_header_info_wev(
            md_text,
            base_dir,
            pdf_filename,
            items,
            manufacturers,
            products,
            weaviate_client,
            datasheet_url=datasheet_url,
        )

        for page_num in tqdm(range(num_pages), desc=f"Processing PDF {ind} pages"):
            page = doc[page_num]
            process_page_images(
                page, page_num, base_dir, items,
                save_to_storage=True,
                data=data
            )

        mark_datasheet_as_(driver, datasheet_url, "Chunked")
    finally:
        # Close before deleting: an open handle can block removal on some platforms.
        if doc is not None:
            doc.close()
        if os.path.exists(pdf_filename):
            os.remove(pdf_filename)
        if os.path.isdir(base_dir):
            shutil.rmtree(base_dir)


def process_datasheets_in_batches(datasheet_manufacturer, driver, weaviate_client, items, batch_size=10):
    """Download and chunk datasheets in consecutive batches of ``batch_size``.

    For each batch, all PDFs are downloaded first and then each successfully
    downloaded PDF is processed. The relationship state is updated through
    ``mark_datasheet_as_``: "Undownloadable" when the download fails,
    "Chunked" after successful processing, "ProcessingError" when processing
    raises an ``Exception``.

    Args:
        datasheet_manufacturer: Sequence of
            ``(datasheet_url, manufacturers, products, category)`` tuples.
        driver: Passed to ``mark_datasheet_as_``.
        weaviate_client: Passed to ``extract_header_info_wev``.
        items: List handed to ``extract_header_info_wev`` and
            ``process_page_images`` (presumably an accumulator they append
            to -- confirm in helpers.md_processing).
        batch_size: Number of datasheets downloaded before processing starts.

    Returns:
        None. Progress is reported with ``print`` and ``tqdm``.
    """
    total_datasheets = len(datasheet_manufacturer)

    for batch_start in range(0, total_datasheets, batch_size):
        batch_end = min(batch_start + batch_size, total_datasheets)
        batch = datasheet_manufacturer[batch_start:batch_end]
        batch_number = batch_start // batch_size + 1

        print(f"Processing batch {batch_number} ({batch_start}-{batch_end-1} of {total_datasheets})")

        # Download PDFs for this batch
        downloaded = _download_batch(batch, batch_start, driver)

        # Process the PDFs that were successfully downloaded
        for ind, data, pdf_filename in downloaded:
            if not os.path.exists(pdf_filename):
                continue

            try:
                _chunk_pdf(ind, data, pdf_filename, driver, weaviate_client, items)
            except Exception as e:
                print(f"Error processing PDF {ind}: {str(e)}")
                mark_datasheet_as_(driver, data[0], "ProcessingError")

        print(f"Completed batch {batch_number}")

try:
    process_datasheets_in_batches(datasheet_manufacturer, driver, weaviate_client, items, batch_size=2)
finally:
    # Release both connections even when the run fails or is interrupted
    # (e.g. by KeyboardInterrupt); previously the Weaviate client was never closed.
    driver.close()
    weaviate_client.close()

Processing batch 1 (0-1 of 57)
Downloaded file as 0.pdf
0.pdf is invalid or too small. Marking download as failed.
Found PDF link: https://rocelec.widen.net/content/0e4apuiify/original/INTLS03195-1.pdf?u=5oefqw&download=true
[Viewer] Downloaded file as 0.pdf
Downloaded file as 1.pdf


Processing PDF 0 pages: 100%|██████████| 21/21 [00:12<00:00,  1.73it/s]


Marked datasheet https://rocelec.widen.net/view/pdf/0e4apuiify/INTLS03195-1.pdf?t.download=true&u=5oefqw as Chunked.


Processing PDF 1 pages: 100%|██████████| 42/42 [00:28<00:00,  1.50it/s]


Marked datasheet https://www.gigadevice.com.cn/Public/Uploads/uploadfile/files/20220714/DS-00353-GD25D05C-Rev1.6.pdf as Chunked.
Completed batch 1
Processing batch 2 (2-3 of 57)
Downloaded file as 2.pdf
Downloaded file as 3.pdf


KeyboardInterrupt: 