In [22]:
import asyncio
import json
import random
import re

import aiohttp
import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup

# Upper bound on how many patent pages may be downloaded at the same time.
MAX_CONCURRENT_REQUESTS = 30
# Shared by every parse_google_patent() call to enforce that bound.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
# Number of download attempts per patent before giving up.
MAX_RETRIES = 3
# Pause between failed attempts; passed to asyncio.sleep(), i.e. seconds.
RETRY_DELAY = 10

async def parse_google_patent(patent_id):
    """Download one Google Patents page and return its main text as a string.

    The request runs under the module-level ``semaphore`` and is preceded by
    a random 1-3 second pause. A failed download is retried up to
    ``MAX_RETRIES`` times with ``RETRY_DELAY`` seconds between attempts.

    Args:
        patent_id: Patent identifier inserted into the Google Patents URL.

    Returns:
        Title, abstract, claims and description joined with single spaces
        (sections missing from the page contribute an empty string), or
        ``""`` when the page could not be downloaded.
    """
    url = f"https://patents.google.com/patent/{patent_id}/en"
    headers = {"User-Agent": "Mozilla/5.0"}
    # Statuses for which repeating the same request cannot succeed; retrying
    # them would only hold a semaphore slot through the whole back-off.
    permanent_statuses = (404, 410)

    text = None
    async with semaphore:
        # Random pause so the concurrent workers do not fire in lockstep.
        await asyncio.sleep(random.uniform(1, 3))

        # One session serves all attempts for this patent instead of opening
        # (and tearing down) a new session on every retry.
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(1, MAX_RETRIES + 1):
                try:
                    async with session.get(url) as response:
                        if response.status in permanent_statuses:
                            return ""
                        if response.status != 200:
                            raise Exception(f"Error with open url: {response.status}")
                        text = await response.text()
                    break
                except Exception:
                    # Best effort by design: after the last attempt the patent
                    # is reported as empty text rather than failing the batch.
                    if attempt == MAX_RETRIES:
                        return ""
                    await asyncio.sleep(RETRY_DELAY)

    if text is None:
        # Only reachable when MAX_RETRIES < 1 and the loop never ran.
        return ""

    soup = BeautifulSoup(text, "html.parser")

    title_tag = soup.find("span", {"itemprop": "title"})
    title = title_tag.text.strip() if title_tag else ""

    abstract_tag = soup.find("section", {"itemprop": "abstract"})
    abstract = abstract_tag.get_text(strip=True) if abstract_tag else ""

    claims_tag = soup.find("section", {"itemprop": "claims"})
    claims = claims_tag.get_text(separator="\n").strip() if claims_tag else ""

    description_tag = soup.find("section", {"itemprop": "description"})
    description = description_tag.get_text(separator="\n").strip() if description_tag else ""

    # Glue all text parts together, separated by a single space.
    joined_text = " ".join([title, abstract, claims, description]).strip()
    return joined_text

async def parse_google_patent_main(patent_ids):
    """Fetch every patent in ``patent_ids`` concurrently.

    Returns the list produced by ``asyncio.gather``: one
    ``parse_google_patent`` result per id, in the order of ``patent_ids``.

    NOTE: a later cell defines a batched function under the same name, which
    replaces this one once that cell has been executed.
    """
    pending = []
    for pid in patent_ids:
        # Schedule each download right away so they run concurrently.
        pending.append(asyncio.create_task(parse_google_patent(pid)))
    fetched = await asyncio.gather(*pending)
    return fetched


In [23]:
async def process_batch(batch, pattern, batch_num):
    """Keep the patents of one batch whose text matches ``pattern`` and save them.

    Args:
        batch: Iterable of dicts with ``"id"`` and ``"text"`` keys. Falsy
            entries and entries with empty text are skipped.
        pattern: Regular expression, either as a string or as an already
            compiled pattern. Matching is always case-insensitive.
        batch_num: Batch number used in the output file name and log line.

    Side effects:
        When at least one patent matches, writes ``output_batch_NNNNN.json``
        into the current working directory and prints a one-line summary.
        Nothing is written for a batch without matches.
    """
    # re.search() refuses a flags argument together with a compiled pattern
    # (ValueError), so build one case-insensitive matcher up front that works
    # for both string and compiled input.
    if isinstance(pattern, (str, bytes)):
        matcher = re.compile(pattern, re.IGNORECASE)
    else:
        matcher = re.compile(pattern.pattern, pattern.flags | re.IGNORECASE)

    result = [
        {"id": patent["id"], "text": patent["text"]}
        for patent in batch
        if patent and patent["text"] and matcher.search(patent["text"])
    ]

    if result:
        output_filename = f"output_batch_{batch_num:05d}.json"
        with open(output_filename, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"Batch {batch_num}: {len(result)} matching patents saved ({output_filename})")

async def parse_google_patent_main(patent_ids, pattern, batch_size=100):
    """Download patents batch by batch and save the ones matching ``pattern``.

    Each batch of ``batch_size`` ids is fetched concurrently with
    ``parse_google_patent`` and then handed to ``process_batch``, so results
    are persisted after every batch instead of only at the very end.

    Args:
        patent_ids: Sequence of patent identifiers (must support slicing).
        pattern: Regular expression forwarded to ``process_batch``.
        batch_size: Number of patents fetched and filtered per batch.
    """
    batch_starts = range(0, len(patent_ids), batch_size)
    for batch_num, start in enumerate(batch_starts, start=1):
        batch_patent_ids = patent_ids[start:start + batch_size]
        tasks = [asyncio.create_task(parse_google_patent(pid)) for pid in batch_patent_ids]
        texts = await asyncio.gather(*tasks)
        # parse_google_patent() returns plain text, while process_batch()
        # reads patent["id"] / patent["text"]; gather() keeps the input
        # order, so each text can be paired with its id here.
        batch_results = [
            {"id": pid, "text": text}
            for pid, text in zip(batch_patent_ids, texts)
        ]
        await process_batch(batch_results, pattern, batch_num)

In [24]:
# Patent numbers to scrape: the 'patent_number' column as a plain Python list.
patent_ids = pd.read_csv('filtered_patents.csv').loc[:, 'patent_number'].to_list()

In [25]:
# Parser v3
# Allow the notebook's already-running event loop to be re-entered.
nest_asyncio.apply()

# Unicode subscript digits, as in "IC₅₀".
sub_5 = "₅"
sub_0 = "₀"
# "50" written either with ASCII digits or with subscript digits.
digits = r'(?:50|₅₀)'

# Column headers of activity tables: IC50 (nM), EC50 (nM), Ki (nM), Kd (nM),
# with optional whitespace around and inside the parentheses.
pattern = "".join([
    r'(?:',
    r'\bIC', digits, r'\s*\(\s*nM\s*\)',
    r'|\bEC', digits, r'\s*\(\s*nM\s*\)',
    r'|\bKi\s*\(\s*nM\s*\)',
    r'|\bKd\s*\(\s*nM\s*\)',
    r')',
])
regex = re.compile(pattern)

In [None]:
# Scrape the patents at positions 15000-24999 of patent_ids, 100 per batch;
# matching patents are written to disk batch by batch via process_batch().
await parse_google_patent_main(patent_ids[15000:25000], regex, batch_size=100)

  raise KeyError("{!r} is not registered".format(fileobj)) from None
