In [6]:
import requests
import json
import pandas as pd
from typing import Optional, Dict, List
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

class SureChEMBLClient:
    """
    Client for the SureChEMBL HTTP API.

    All requests go through a single ``requests.Session`` created in
    ``__init__``.

    NOTE(review): ``batch_download_patents`` uses that one session from
    several worker threads; confirm this is acceptable for the ``requests``
    version in use.
    """

    # Text fields copied from the API payload into every result row.
    _TEXT_FIELDS = ('title', 'abstract', 'claims', 'description')

    # Column order of the DataFrame returned by batch_download_patents. It is
    # passed explicitly so that an empty batch still has the expected columns.
    _RESULT_COLUMNS = ('patent_id',) + _TEXT_FIELDS + ('full_text', 'success')

    def __init__(self, base_url: str = "https://www.surechembl.org/api"):
        """
        Create the client and its HTTP session.

        Args:
            base_url (str): Root URL of the API, without a trailing slash.
        """
        self.base_url = base_url
        self.session = requests.Session()
        # Send a browser-like User-Agent with every request.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    @staticmethod
    def _failed_record(patent_id: str) -> Dict:
        """Return the result row used for a patent that could not be downloaded."""
        return {
            'patent_id': patent_id,
            'title': '',
            'abstract': '',
            'claims': '',
            'description': '',
            'full_text': '',
            'success': False,
        }

    def get_patent_document(self, patent_id: str, annotated: bool = True) -> Optional[Dict]:
        """
        Fetch the full patent document for the given ID.

        Args:
            patent_id (str): Patent ID in SCPN format.
            annotated (bool): Ask the API to include chemical annotations.

        Returns:
            Dict: The ``data`` member of the API response, or None when the
            request fails, the body is not valid JSON, or the API does not
            report ``status == 'success'``. Every failure is logged.
        """
        url = f"{self.base_url}/documents/{patent_id}"
        params = {'annotated': 'true'} if annotated else {}

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.error("Network error for %s: %s", patent_id, e)
            return None

        # Parse separately from the request: recent versions of requests raise
        # a JSON error that is also a RequestException, which would otherwise
        # be reported as a network error. Every JSON decode error raised here
        # is a ValueError, so that is what we catch.
        try:
            data = response.json()
        except ValueError as e:
            logging.error("JSON parsing error for %s: %s", patent_id, e)
            return None

        # A valid JSON body is not necessarily an object (could be a list,
        # a string, null, ...); guard before calling .get on it.
        if not isinstance(data, dict):
            logging.error(
                "API error for %s: unexpected response type %s",
                patent_id, type(data).__name__,
            )
            return None

        if data.get('status') != 'success':
            logging.error("API error for %s: %s", patent_id, data.get('errorMessage'))
            return None

        return data.get('data')

    def extract_full_text(self, patent_data: dict) -> str:
        """
        Build one text blob from the textual sections of a patent.

        Args:
            patent_data (Dict): Patent data as returned by the API.

        Returns:
            str: The non-empty sections (title, abstract, claims, description,
            in that order), each preceded by a ``=== NAME ===`` header line and
            followed by a blank line. Empty string when there is no data.
        """
        if not patent_data:
            return ""

        sections = (
            ('TITLE', 'title'),
            ('ABSTRACT', 'abstract'),
            ('CLAIMS', 'claims'),
            ('DESCRIPTION', 'description'),
        )

        parts = []
        for heading, key in sections:
            value = patent_data.get(key, '')
            if not value:
                continue
            # A non-string field must not abort the whole document; fall back
            # to its string form instead of raising on .strip().
            text = (value if isinstance(value, str) else str(value)).strip()
            if text:
                parts.append(f"=== {heading} ===\n{text}\n\n")

        return "".join(parts)

    def batch_download_patents(self, patent_ids: list[str], max_workers: int = 5) -> pd.DataFrame:
        """
        Download many patents in parallel.

        Args:
            patent_ids (List[str]): Patent IDs to download.
            max_workers (int): Maximum number of worker threads.

        Returns:
            pd.DataFrame: One row per requested ID, in the same order as
            ``patent_ids``, with columns patent_id, title, abstract, claims,
            description, full_text and success. Rows for patents that could
            not be downloaded have empty text fields and ``success == False``.
            An empty input yields an empty frame with the same columns.
        """
        ids = list(patent_ids)

        def download_single_patent(patent_id):
            patent_data = self.get_patent_document(patent_id)
            if not patent_data:
                return self._failed_record(patent_id)
            record = {'patent_id': patent_id}
            for field in self._TEXT_FIELDS:
                record[field] = patent_data.get(field, '')
            record['full_text'] = self.extract_full_text(patent_data)
            record['success'] = True
            return record

        # Results are stored by input position so the row order does not
        # depend on which download happens to finish first.
        results = [None] * len(ids)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {
                executor.submit(download_single_patent, patent_id): index
                for index, patent_id in enumerate(ids)
            }

            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    results[index] = future.result()
                except Exception as exc:
                    # One bad patent must not abort the batch: log it and
                    # record a failed row in its place.
                    logging.error(
                        "Patent %s generated an exception: %s", ids[index], exc
                    )
                    results[index] = self._failed_record(ids[index])

        return pd.DataFrame(results, columns=list(self._RESULT_COLUMNS))


In [7]:
# Load the patent list from filtered_patents.csv; the 'patent_number' column
# is the one read further down.
df = pd.read_csv('filtered_patents.csv')
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

Unnamed: 0,patent_number
0,US-5153197-A
1,US-5360800-A
2,US-4650884-A
3,US-5250534-A
4,US-4681893-A


In [8]:
# Plain Python list of the values in the 'patent_number' column.
patent_ids = df['patent_number'].tolist()

In [None]:
# Download the last ten IDs of the list, taken in reverse order, using five
# worker threads.
client = SureChEMBLClient()
df2 = client.batch_download_patents(patent_ids[-10:][::-1], max_workers=5)