In [10]:
import json
import os
import re
from typing import List, Optional, Any, Dict

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.runnables import Runnable, RunnableLambda
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
from langchain_core.outputs import ChatResult, ChatGeneration
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser

from pydantic import BaseModel, Field
from langchain_mistralai import ChatMistralAI

In [4]:
# Location of the patent dump. Kept in one named constant so the path can be
# changed in a single place when the notebook is run on another machine.
PATENTS_JSON_PATH = '/Users/amrasov/Downloads/all_patents_v2.json'

# JSON interchange files are UTF-8; passing the encoding explicitly avoids
# depending on the platform's default locale encoding.
with open(PATENTS_JSON_PATH, 'r', encoding='utf-8') as patents_file:
    # `data` is later iterated as a sequence of records that carry a 'text' entry.
    data = json.load(patents_file)

In [60]:
# Title fragment used to locate the patent of interest inside the loaded dump.
TARGET_PATENT_TITLE = 'Phenyl-sulfamates as aromatase inhibitors'

# Select the text of the first record that mentions the title.
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output.
for patent_record in data:
    # Records without a usable 'text' entry are skipped instead of raising KeyError.
    record_text = patent_record.get('text') or ''
    if TARGET_PATENT_TITLE in record_text:
        hui = record_text
        break
else:
    # Without this guard a miss would leave `hui` undefined (or stale from an
    # earlier kernel run) and only fail later in an unrelated cell.
    raise LookupError(f"No patent text contains {TARGET_PATENT_TITLE!r}")

import re

def clean_patent_text(text: str) -> str:
    """Normalise whitespace artefacts in raw patent text.

    Steps, in order:
      1. Re-join words that were hyphenated across a line break (``-\\n``).
      2. Strip leading/trailing spaces and tabs from every line.
      3. Collapse runs of two or more spaces into a single space.
      4. Collapse runs of three or more newlines into exactly two.

    Per-line stripping happens *before* the blank-line collapse so that lines
    containing only spaces/tabs become truly empty and are collapsed as well.

    Args:
        text: Raw patent text.

    Returns:
        The cleaned text.
    """
    # 1. "inhibi-\ntion" -> "inhibition"
    dehyphenated = re.sub(r'-\n', '', text)
    # 2. Trim each line; whitespace-only lines turn into empty lines here.
    trimmed_lines = re.sub(r'^[ \t]+|[ \t]+$', '', dehyphenated, flags=re.MULTILINE)
    # 3. Squeeze repeated spaces inside lines.
    single_spaced = re.sub(r' {2,}', ' ', trimmed_lines)
    # 4. Keep at most one blank line between paragraphs.
    return re.sub(r'\n{3,}', '\n\n', single_spaced)

# Replace the raw patent text with its whitespace-normalised version; the
# cleaned string under the same name is what is later passed to the pipeline.
hui = clean_patent_text(hui)

In [11]:
# One extracted measurement linking a compound to a biological target.
# `#` comments are used instead of a class docstring on purpose: the model's
# JSON schema is embedded into the LLM prompt via the parser's format
# instructions, and a docstring would become part of that schema.
# NOTE(review): the first five fields have no default, so under pydantic v2
# they presumably must be present in the payload (null is accepted) — confirm
# this is intended, since only the last two fields may be omitted entirely.
class BiologicalActivity(BaseModel):
    # Kind of measurement reported for the compound/target pair.
    activity_type: Optional[str] = Field(description="e.g., 'Ki', 'IC50'")
    # Numeric magnitude of the measurement.
    value: Optional[float] = Field(description="The numerical value.")
    # Unit abbreviation that belongs to `value`.
    units: Optional[str] = Field(description="e.g., 'nM', 'µM'")
    # Biological entity the compound acts on.
    target: Optional[str] = Field(description="Name of the protein target.")
    # Substance that was tested.
    compound: Optional[str] = Field(description="Name of the compound.")
    # Optional literal sequence of the target; defaults to None when absent.
    target_sequence: Optional[str] = Field(default=None, description="Amino acid sequence, NOT accession number.")
    # Optional literal structure string of the compound; defaults to None when absent.
    compound_smiles: Optional[str] = Field(default=None, description="SMILES string.")

# Top-level container the extractor output is parsed into: a single
# `activities` key holding the list of extracted measurements.
class ActivityCollection(BaseModel):
    # Required list; an empty list means nothing was extracted.
    activities: List[BiologicalActivity]

# Verdict produced by the validator step for one extracted measurement.
class ValidationResult(BaseModel):
    # Whether the measurement is accepted.
    is_valid: bool = Field(description="True if the core fields are correct.")
    # Short justification, printed when a measurement is rejected.
    reason: str = Field(description="Brief explanation of the decision.")

In [34]:
def _repair_and_reserialize_json_string(llm_output: str) -> str:

    fallback_json_string = '{"activities": []}'

    try:
        if llm_output.startswith("```json"):
            llm_output = llm_output.split("```json\n", 1)[1].split("\n```")[0]
        llm_output = llm_output.strip()
    except (IndexError, AttributeError):
        pass

    if not llm_output:
        return fallback_json_string

    try:
        data = json.loads(llm_output)
        if isinstance(data, list):
            fixed_data = {"activities": [item for item in data if isinstance(item, dict) and item]}
        elif isinstance(data, dict) and isinstance(data.get("activities"), list):
            data["activities"] = [item for item in data["activities"] if isinstance(item, dict) and item]
            fixed_data = data
        else:
            return fallback_json_string
        return json.dumps(fixed_data)
        
    except json.JSONDecodeError:
        # недогенерированные JSON'ы идут сюда
        print(f"  [!] INFO: JSON is invalid, attempting to repair a partial list...")
        start_key = '"activities":'
        start_pos_key = llm_output.find(start_key)
        start_pos_list = llm_output.find('[', start_pos_key)

        if start_pos_list == -1:
            print("  [!] ERROR: Could not find the start of the activities list '['. Giving up.")
            return fallback_json_string

        partial_list_str = llm_output[start_pos_list:]
        last_good_bracket_pos = partial_list_str.rfind('}')
        
        while last_good_bracket_pos != -1:
            potential_list_str = partial_list_str[:last_good_bracket_pos + 1]
            test_json_str = potential_list_str + ']'
            
            try:
                repaired_list = json.loads(test_json_str)
                if isinstance(repaired_list, list):
                    print(f"  [+] SUCCESS: Repaired partial JSON, saved {len(repaired_list)} items.")
                    final_data = {"activities": repaired_list}
                    return json.dumps(final_data)
            except json.JSONDecodeError:
                last_good_bracket_pos = partial_list_str.rfind('}', 0, last_good_bracket_pos)
        
        # если не спасли JSON
        print("  [!] ERROR: Could not repair the partial JSON. Giving up.", llm_output)
        return fallback_json_string

In [20]:
def build_pipeline(llm: BaseChatModel):
    """Assemble the patent-extraction pipeline around the given chat model.

    The returned runnable takes the full patent text and runs these stages:
      1. split the text into overlapping chunks,
      2. keep only chunks that mention activity-related keywords,
      3. per chunk: extract candidate activities with the LLM, then
      4. validate every candidate with a second LLM call,
      5. de-duplicate the validated activities.

    Args:
        llm: Chat model used by both the extractor and the validator chain.

    Returns:
        A runnable whose ``invoke(text)`` yields a list of dicts, one per
        unique validated activity (the ``model_dump()`` of each entry).
    """
    # Stage 1: preprocessor
    def run_preprocessor(text: str) -> List[str]:
        """Split the patent text into chunks of up to 20000 characters with 500 overlap."""
        print("--- 1. Running Preprocessor Agent ---")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=20000, chunk_overlap=500, length_function=len)
        chunks = text_splitter.split_text(text)
        print(f"Text split into {len(chunks)} chunks.")
        return chunks

    # Stage 2: keyword filter
    # Longer keywords are specific enough to be matched as plain substrings
    # ('ic50' also covers 'pic50').
    substring_keywords = ('ic50', 'ec50', 'activity', 'inhibition', 'binding', 'micromolar')
    # Two-letter tokens are matched only when not embedded in a longer word;
    # as raw substrings they hit ordinary words such as "making" or
    # "environment" and made the filter pass every chunk. Digits may precede
    # the token ("10nM"), an optional 'p' prefix covers pKi/pKd, and both the
    # micro sign and the Greek letter mu are accepted for micromolar.
    short_token_pattern = re.compile(r'(?<![a-z])(?:p?ki|p?kd|nm|[\u00b5\u03bc]m)(?![a-z])')

    def _mentions_activity(chunk: str) -> bool:
        """Return True when the chunk contains at least one activity keyword."""
        lowered = chunk.lower()
        if any(keyword in lowered for keyword in substring_keywords):
            return True
        return short_token_pattern.search(lowered) is not None

    def run_filter(chunks: List[str]) -> List[str]:
        """Drop chunks that do not mention any activity-related keyword."""
        print("\n--- 2. Running Filter Agent ---")
        relevant_chunks = [chunk for chunk in chunks if _mentions_activity(chunk)]
        print(f"Found {len(relevant_chunks)} potentially relevant chunks.")
        return relevant_chunks

    # Stage 3: extractor
    extractor_parser = PydanticOutputParser(
        pydantic_object=ActivityCollection,
        retry_on_error=False,
        llm=None
    )

    extractor_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a meticulous and highly specialized biochemist expert. Your mission is to parse pharmaceutical and biochemical patents to extract biological activity data with extreme precision. You must understand the scientific context to correctly identify the roles of molecules.
    
    ### Core Scientific Concepts for Extraction:
    This section provides the fundamental knowledge you must use to interpret the text.
    
    **1. Defining `compound` vs. `target` - THE MOST IMPORTANT RULE:**
    
    *   **`compound` (The "Key"):** This is the substance being tested, usually a smaller molecule. It's the "actor" or "drug".
        *   **Types:** Small organic molecules, peptides, drug candidates, natural products, tool compounds, fragments.
        *   **Examples:** "imatinib", "aspirin", "atorvastatin", "cyclosporine A", "benzonitrile derivative", "quinazoline analog", "peptide P-123", "Example 4b", "the compound of formula (I)".
    
    *   **`target` (The "Lock"):** This is the biological entity that the `compound` interacts with. It's usually a large biomolecule.
        *   **Types:** Proteins, enzymes, receptors, ion channels, antibodies, nucleic acids.
        *   **Examples:** "EGFR kinase", "ABL1", "p38 MAP kinase", "human GlyT1", "the 5-HT2A receptor", "voltage-gated sodium channel Nav1.7", "a monoclonal antibody against TNF-alpha", "human topoisomerase II".
    
    *   **CRITICAL LOGIC & COMMON PATTERNS (Study these carefully):**
        *   **"Inhibition":** "Compound X inhibited **Enzyme Y** with a Ki of 50 nM."
            *   `compound`: "Compound X"
            *   `target`: "Enzyme Y"
        *   **"Binding":** "The binding affinity (Kd) of **ligand A** for the **B-Receptor** was 1.2 µM."
            *   `compound`: "ligand A"
            *   `target`: "B-Receptor"
        *   **"Antagonism/Agonism":** "Molecule Z acts as an antagonist at the **dopamine D2 receptor** (EC50 = 30 nM)."
            *   `compound`: "Molecule Z"
            *   `target`: "dopamine D2 receptor"
        *   **Antibody-Antigen Interaction:** "The antibody, clone 3F4, bound to **human VEGF** with a Kd of 5 pM."
            *   `compound`: "human VEGF" (The antigen, which is the smaller molecule *relative to the antibody's specific binding site*, is treated as the compound in this context).
            *   `target`: "antibody, clone 3F4"
        *   **Inverse Case (Antibody as the Drug):** "The therapeutic antibody **Trastuzumab** targets the **HER2 receptor**."
            *   `compound`: "Trastuzumab" (Here the antibody IS the drug/compound being tested).
            *   `target`: "HER2 receptor"
        *   **Your primary task is to identify the drug-like molecule (`compound`) and the biological system it affects (`target`).**
    
    **2. Handling Activity Values and Units:**
    
    *   You MUST extract a **specific numerical `value`**.
    *   If the text says "in the nanomolar range", "active", or "potent" without a specific number, **you MUST IGNORE and DISCARD that data point.**
    *   The `units` field MUST be a standard abbreviation (e.g., 'nM', 'µM', 'M', 'pM'). Do NOT invent units or use descriptive text. If the unit is "micromolar", normalize it to "µM".
    
    **3. Handling Compound Lists and Tables:**
    
    *   **Lists:** "Compounds A, B, and C inhibited the target with IC50s of 10, 25, and 50 nM, respectively." You MUST create **three separate entries**.
    *   **Tables:** Process each row of a table as a separate data point.
    *   **Ranges:** "IC50 values for the series ranged from 10-100 nM." **DISCARD** this, as it does not link a specific compound to a specific value.
    
    ### Final Output Rules:
    *   **LINKED DATA ONLY**: All fields in an entry must come from the same sentence or a clear, unambiguous context (like a table row).
    *   **SMILES/SEQUENCES**: Only extract literal `compound_smiles` strings or `target_sequence` amino acid chains. Do not guess or infer them.
    *   **FORMAT**: Your entire response MUST be a single, valid JSON object. This object must have a single key, `activities`, which holds a list of the extracted data points. If no valid activities are found according to ALL the rules above, return an empty list: `{{"activities": []}}`.
    
    {format_instructions}"""),
        ("user", "Here is the text to analyze:\n---\n{text_chunk}\n---")
    ]).partial(format_instructions=extractor_parser.get_format_instructions())

    extractor_chain = (
        extractor_prompt
        | llm
        | StrOutputParser()
        | RunnableLambda(_repair_and_reserialize_json_string)  # repair the JSON if generation was cut off
        | extractor_parser  # parse into the Pydantic model
    )

    # Stage 4: validator
    validator_parser = PydanticOutputParser(
        pydantic_object=ValidationResult,
        retry_on_error=False,
        llm=None
    )

    validator_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a highly precise and critical Quality Assurance Specialist for biochemical data. Your task is to validate extracted data points against two criteria: 1) Accuracy based on the text, and 2) Informational Value.
    
    ### Validation Criteria:
    
    **1. Accuracy Check (Is it in the text?):**
    *   Verify that the core fields (`compound`, `target`, `activity_type`, `value`, `units`) are fully and accurately supported by the original text.
    *   Ensure the relationship between them is correct as stated in the text.
    
    **2. Informational Value Check (Is the data useful?):**
    *   This is a critical quality control step. A data point is considered **LOW-VALUE and INVALID** if the `compound` identifier is a generic, patent-specific placeholder that cannot be used for external database lookups.
    *   **Generic Placeholders Include:** "Example 1", "Example 25a", "Compound 42", "the compound of Example 10", "Preparation 5".
    *   **The Rule:** If the `compound` field contains such a generic placeholder **AND** the `compound_smiles` field is empty or null, you **MUST** mark the data point as invalid.
    *   **Example Scenario:**
        *   **Input:** `{{"compound": "Example 8", "value": 18.0, "compound_smiles": null}}`
        *   **Your Decision:** `{{"is_valid": false, "reason": "Compound identifier is a generic placeholder ('Example 8') without an associated SMILES string."}}`
    *   **Exception:** If a SMILES string **IS** present (`compound_smiles` is not null), the data point is considered VALID even if the `compound` name is generic, because the structure is known.
    
    ### Final Instructions:
    *   Apply both checks. A data point must pass both to be `is_valid: true`.
    *   Respond ONLY with a valid JSON object with two fields: "is_valid" (boolean) and "reason" (a brief explanation for your decision, especially for invalid points).
    
    {format_instructions}"""),
        ("user", "Original Text:\n---\n{text_chunk}\n---\nExtracted Data Point to Verify:\n{extracted_data}")
    ]).partial(format_instructions=validator_parser.get_format_instructions())

    validator_chain = validator_prompt | llm | validator_parser

    # Stages 3/4: extraction-validation loop
    def run_extraction_validation_loop(chunks: List[str]) -> List[BiologicalActivity]:
        """Extract activities from every chunk and keep those the validator accepts.

        Errors are reported and skipped: a failing chunk does not stop the
        remaining chunks, and a failing validation call does not discard the
        remaining activities of the same chunk.
        """
        validated_activities = []
        for i, chunk in enumerate(chunks):
            print(f"\n--- 3/4. Processing Chunk {i+1}/{len(chunks)} with Extractor & Validator ---")
            try:
                extracted_collection = extractor_chain.invoke({"text_chunk": chunk})

                if not extracted_collection.activities:
                    print("  Extractor found 0 potential data points.")
                    continue

                print(f"  Extractor found {len(extracted_collection.activities)} potential data points.")
                for activity in extracted_collection.activities:
                    try:
                        validation_result = validator_chain.invoke({"text_chunk": chunk, "extracted_data": activity.model_dump_json(indent=2)})
                    except Exception as e:
                        # One unparsable validator reply must not throw away
                        # the other activities extracted from this chunk.
                        print(f"  [!] ERROR: Validation failed for {activity.compound}: {e}")
                        continue
                    if validation_result.is_valid:
                        print(f"  [+] VALIDATED: Activity for {activity.compound}")
                        validated_activities.append(activity)
                    else:
                        print(f"  [-] INVALID: Activity for {activity.compound}. Reason: {validation_result.reason}")
            except Exception as e:
                # Reached only when the extraction step itself fails for this chunk.
                print(f"  [!] FATAL ERROR during processing chunk {i+1}: {e}")
        return validated_activities

    # Stage 5: aggregator
    def run_aggregator(validated_results: List[BiologicalActivity]) -> List[Dict]:
        """De-duplicate validated activities, keeping the first occurrence of each."""
        print("\n--- 5. Running Aggregator Agent ---")
        final_list = []
        seen = set()
        for activity in validated_results:
            value_rounded = round(activity.value, 4) if activity.value is not None else None
            units_norm = activity.units if activity.units else None
            target_norm = activity.target if activity.target else None
            # The activity type is part of the identity: a Ki and an IC50 that
            # share compound, target, value and units are different measurements.
            deduplication_key = (activity.compound, activity.activity_type, target_norm, value_rounded, units_norm)

            if deduplication_key not in seen:
                final_list.append(activity.model_dump())
                seen.add(deduplication_key)
                print(f"  -> Added entry for {activity.compound}")
            else:
                print(f"  -> Skipped duplicate entry for {activity.compound}")
        return final_list

    # Full pipeline
    pipeline = (
        RunnableLambda(run_preprocessor)
        | RunnableLambda(run_filter)
        | RunnableLambda(run_extraction_validation_loop)
        | RunnableLambda(run_aggregator)
    )
    return pipeline

In [62]:
# Credentials must never be stored in the notebook: the key is read from the
# environment instead. NOTE(review): the key that used to be hard-coded here
# should be treated as leaked and rotated.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    raise RuntimeError("Set the MISTRAL_API_KEY environment variable before running this cell.")
model_name = "mistral-large-latest"

# temperature=0 is requested for the extraction and validation calls.
llm = ChatMistralAI(
    api_key=api_key,
    model=model_name,
    temperature=0
)

patent_extraction_pipeline = build_pipeline(llm)

print("🚀🚀🚀 Starting Patent Extraction Pipeline (with Mistral AI)... 🚀🚀🚀\n")
# Run the whole pipeline on the cleaned patent text selected earlier.
final_json_output = patent_extraction_pipeline.invoke(hui)

print("\n======================================")
print("   ✅ FINAL EXTRACTION RESULT (JSON)")
print("======================================")
print(json.dumps(final_json_output, indent=2))

🚀🚀🚀 Starting Patent Extraction Pipeline (with Mistral AI)... 🚀🚀🚀

--- 1. Running Preprocessor Agent ---
Text split into 13 chunks.

--- 2. Running Filter Agent ---
Found 13 potentially relevant chunks.

--- 3/4. Processing Chunk 1/13 with Extractor & Validator ---
  [!] INFO: JSON is invalid, attempting to repair a partial list...
  [!] ERROR: Could not repair the partial JSON. Giving up. Based on the provided text, there is no specific biological activity data that meets the criteria outlined in the instructions. The text discusses various compounds and their potential uses as inhibitors of steroid sulphatase and aromatase, but it does not provide specific numerical values for activities such as IC50, Ki, or Kd with corresponding units. Therefore, the output should be an empty list.

```json
{"activities": []}
```
  Extractor found 0 potential data points.

--- 3/4. Processing Chunk 2/13 with Extractor & Validator ---
  [!] INFO: JSON is invalid, attempting to repair a partial list...