In [1]:
import os
from anthropic import Anthropic
from dotenv import load_dotenv
import re
import json

# Load variables from a local .env file (if one is present) into the process
# environment, so the ANTHROPIC_API_KEY lookup in the client cell can find it.
load_dotenv()

True

In [None]:
# API client used by every request below; the key is looked up in the
# ANTHROPIC_API_KEY environment variable (None is passed if it is unset).
client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

In [None]:
template = "You are tasked with extracting biological terminology from a conference transcript. Your goal is to identify and catalog scientific terms specifically related to biology, genetics, molecular biology, and related fields.\n\nHere is the conference transcript to analyze:\n\n<transcript>\n{{TRANSCRIPT}}\n</transcript>\n\n## What Qualifies as a Biological Term\n\nInclude terms that fall into these categories:\n- Gene names (e.g., p53, BRCA1, myc)\n- Protein names (e.g., hemoglobin, insulin, collagen)\n- Biological processes (e.g., transcription, translation, mitosis)\n- Cellular components (e.g., mitochondria, ribosome, nucleus)\n- Organisms and species names (e.g., E. coli, Drosophila, Homo sapiens)\n- Biological molecules (e.g., DNA, RNA, ATP)\n- Medical/biological conditions (e.g., diabetes, cancer, mutation)\n- Biological techniques and methods (e.g., PCR, CRISPR, sequencing)\n- Anatomical terms (e.g., liver, neuron, tissue)\n\n## What to Exclude\n\nDo not include:\n- General scientific terms that aren't specifically biological (like \"data\", \"analysis\", \"significant\")\n- Common words that aren't technical terminology\n- General laboratory equipment or basic scientific concepts\n\n## Extraction Requirements\n\n1. **Preserve exact formatting**: Capture each term exactly as it appears in the transcript - maintain original spelling, capitalization, and formatting\n2. **Include variants**: If both abbreviated forms (like \"PCR\") and full forms (like \"polymerase chain reaction\") appear, include both\n3. **Include all nomenclature**: Capture species names in both common and scientific formats\n4. **No duplicates**: List each unique term only once, even if it appears multiple times\n5. **Focus on technical terms**: Prioritize specialized biological terminology over general words\n\n## Analysis Process\n\nBefore providing your final answer, work through the transcript systematically in <term_extraction> tags inside your thinking block:\n\n1. 
Scan through the entire transcript and quote all potential biological terms exactly as they appear in the text. It's OK for this section to be quite long.\n2. For each quoted term, evaluate whether it qualifies based on the criteria above, noting your reasoning for including or excluding each term\n3. Create a running list of accepted terms, preserving their exact formatting and capitalization\n4. Check for and eliminate any duplicates from your accepted terms list\n5. Verify that your final list is sorted alphabetically while maintaining exact formatting\n\n## Output Format\n\nProvide your final answer as a JSON object with this exact structure:\n\n```json\n{\n  \"biological_terms\": [\n    \"first_term_exactly_as_appears\",\n    \"second_term_exactly_as_appears\",\n    \"third_term_exactly_as_appears\"\n  ],\n  \"total_count\": number_of_unique_terms\n}\n```\n\nThe biological_terms array should:\n- Be sorted alphabetically\n- Preserve exact capitalization, spelling, and formatting from the original transcript\n- Contain only unique terms\n\nYour final output should contain only the JSON object and should not duplicate or rehash any of the extraction work you did in the thinking block."

In [None]:
# Sample input text that gets substituted into the {{TRANSCRIPT}} placeholder
# of the prompt template.
TRANSCRIPT = (
    "Generates synthetic spatial data to augment sparse Xenium and MERFISH data "
    "to iteratively improve feature learning"
)

In [20]:
# Substitute the transcript into the prompt template and wrap it as a single
# user turn containing one text block.
prompt_text = template.replace('{{TRANSCRIPT}}', TRANSCRIPT)
user_turn = {"role": "user", "content": [{"type": "text", "text": prompt_text}]}

# Send the request and keep the full response object for the cells below.
message = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=20000,
    temperature=1,
    messages=[user_turn],
)
print(message.content)

[TextBlock(citations=None, text='<thinking>\nLet me carefully analyze the transcript for biological terminology.\n\nThe transcript reads:\n"Generates synthetic spatial data to augment sparse Xenium and MERFISH data to iteratively improve feature learning"\n\nLet me go through this systematically:\n\n<term_extraction>\n\n1. "synthetic" - This is a general scientific term, not specifically biological. Exclude.\n\n2. "spatial data" - "Spatial" is a general term, and "data" is not biological. Exclude.\n\n3. "augment" - General term, not biological. Exclude.\n\n4. "sparse" - General term, not biological. Exclude.\n\n5. "Xenium" - This is a spatial genomics platform/technology developed by 10x Genomics for high-resolution spatial transcriptomics. It\'s a biological technique/method for analyzing gene expression in tissue samples. This qualifies as a biological term (biological technique). Include.\n\n6. "MERFISH" - This stands for "Multiplexed Error-Robust Fluorescence In Situ Hybridization"

In [22]:
# Show the raw text of the first content block of the reply (the cell's last
# expression is displayed as its output).
message.content[0].text

'<thinking>\nLet me carefully analyze the transcript for biological terminology.\n\nThe transcript reads:\n"Generates synthetic spatial data to augment sparse Xenium and MERFISH data to iteratively improve feature learning"\n\nLet me go through this systematically:\n\n<term_extraction>\n\n1. "synthetic" - This is a general scientific term, not specifically biological. Exclude.\n\n2. "spatial data" - "Spatial" is a general term, and "data" is not biological. Exclude.\n\n3. "augment" - General term, not biological. Exclude.\n\n4. "sparse" - General term, not biological. Exclude.\n\n5. "Xenium" - This is a spatial genomics platform/technology developed by 10x Genomics for high-resolution spatial transcriptomics. It\'s a biological technique/method for analyzing gene expression in tissue samples. This qualifies as a biological term (biological technique). Include.\n\n6. "MERFISH" - This stands for "Multiplexed Error-Robust Fluorescence In Situ Hybridization" and is a spatial transcriptomic

In [25]:
import re
import json

# Pull the fenced ```json ... ``` block out of the model reply and parse it.
#
# `data` is reset up front: without this, a reply with no JSON fence (or with
# malformed JSON) would leave `data` unbound on a fresh kernel, or -- worse --
# silently keep the value from an earlier run of this cell, which later cells
# would then treat as the current result.
data = None

# Non-greedy capture of everything between the opening ```json fence and the
# next closing ``` fence, with surrounding whitespace stripped by the pattern.
match = re.search(r"```json\s*([\s\S]+?)\s*```", message.content[0].text)

if match:
    # The capture group holds the JSON text between the fences.
    json_string = match.group(1)

    try:
        # Parse the JSON text into a Python object (a dict for this prompt).
        data = json.loads(json_string)
    except json.JSONDecodeError as e:
        # Report the parse failure; `data` stays None.
        print(f"解析JSON时出错: {e}")
    else:
        # Parsed successfully: `data` is now available to later cells,
        # e.g. data['total_count'].
        print("成功解析JSON：")
        print(data)
else:
    # No fenced JSON block in the reply; `data` stays None.
    print("在文本中没有找到JSON块。")

成功解析JSON：
{'biological_terms': ['MERFISH', 'Xenium'], 'total_count': 2}


In [27]:
# Persist the parsed result as JSON.
# - The output directory is created first so the cell also works on a fresh
#   checkout where "keyword/" does not exist yet.
# - The file is opened in a `with` block so the handle is flushed and closed
#   deterministically instead of being left to garbage collection.
output_path = "keyword/test2.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(data, output_file)