# Example data

In [2]:
import pandas as pd
import os


# Source table read with pd.read_csv in main(); it must contain a 'PatientID'
# column, every other column is treated as a feature.
INPUT_FILE = '../mRMR/Final_Forced_MultiOmics_Signature_Renamed.csv'
# Destination CSV: one row per selected patient with its serialized features.
OUTPUT_FILE = 'example_0_10.csv'

# Positional row range used with DataFrame.iloc in main().
START_INDEX = 0   
END_INDEX = 10    # exclusive upper bound; None means "through the last row"


def format_row_for_prompt(row):
    """Serialize one patient's feature values into a prompt-ready string.

    Args:
        row: A pandas Series for a single patient. It must contain a
            'PatientID' entry, which is excluded from the output.

    Returns:
        The remaining entries rendered as "name: value" pairs joined by
        ", ", in the row's original order. Values for which ``pd.isna`` is
        true are rendered as "N/A". A row with no features yields "".

    Raises:
        KeyError: If 'PatientID' is not present in ``row``.
    """
    # The ID is written to its own column by the caller, so only the feature
    # values belong in the serialized string.
    features = row.drop('PatientID')
    return ", ".join(
        f"{name}: {'N/A' if pd.isna(value) else str(value)}"
        for name, value in features.items()
    )

def main():
    """Slice the signature table and write one prompt string per patient.

    Reads INPUT_FILE, keeps the rows from START_INDEX (inclusive) up to
    END_INDEX (exclusive; None means through the last row), and writes a
    two-column CSV (PatientID, Input_Data_String) to OUTPUT_FILE.
    """
    df = pd.read_csv(INPUT_FILE)

    # A None stop bound already means "to the end" in a slice, so a single
    # expression covers both the bounded and the open-ended case.
    target_df = df.iloc[START_INDEX:END_INDEX]

    prepared_df = pd.DataFrame({'PatientID': target_df['PatientID']})
    # Build the column row by row instead of DataFrame.apply(axis=1): on a
    # zero-row slice apply does not hand back a Series, which makes the
    # single-column assignment fail. A list of matching length always works.
    prepared_df['Input_Data_String'] = [
        format_row_for_prompt(row) for _, row in target_df.iterrows()
    ]
    # utf-8-sig adds a BOM to the written file.
    prepared_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    main()

# qwen3-max


https://bailian.console.aliyun.com/?spm=a2c4g.11186623.0.0.58117dc5ppShUA&tab=model#/model-market/detail/qwen3-max
![qwen_price](qwen.png)

In [None]:
PROMPT_TEMPLATE = """
You are Dr. Elena Voss, MD, PhD—a board-certified Molecular Tumor Pathologist at a leading NCI-designated cancer center. 
Analyze the provided multi-omics CSV row with clinical precision. 
Apply strict diagnostic thresholds to classify genomic (CNV), epigenetic (methylation), transcriptomic (mRNA), and regulatory (miRNA) aberrations. 
Strip prefixes (e.g., “mRNA_”), standardize miRNA nomenclature to “miR-xxx”, and parse methylation loci as “chrN:Position” where applicable. 
Report only significant findings in a single authoritative paragraph (80–120 words), ordered by biological layer, with bolded key alterations. 
Omit normal calls and raw values.
"""

PROMPT_TEMPLATE = """
Think step-by-step. First, preprocess the input string: remove prefixes (“mRNA_”, “CNV_”, etc.), 
convert miRNA names to “miR-xxx”, and extract methylation coordinates as “chrN:Position” if needed. 
Then, for each feature, apply the exact diagnostic thresholds from the Core Logic—no calculations, 
just mapping. In a hidden scratchpad, list all significant findings per layer: Genomic (CNV), Epigenetic, 
Transcriptomic, Regulatory. Finally, synthesize these into one clinical paragraph (80–120 words), 
bolding all significant alterations, in strict biological order. Do not include normal or non-significant data.
"""

# Did not work very well
PROMPT_TEMPLATE = """
Example Input:
"mRNA_TP53: -2.10, CNV_EGFR: 2.30, Methylation_chr7:140453136: 1.60, miRNA_hsa-mir-21: 2.05"
Example Output:
Genomic analysis reveals high-level amplification of EGFR. Epigenetically, 
there is hyper-methylation at chr7:140453136. Transcriptomically, TP53 expression is low. 
Regulatory assessment shows upregulation of miR-21, consistent with oncogenic silencing of tumor suppressors.
Now analyze the user’s input row using identical logic: strip prefixes, standardize miRNA to “miR-xxx”, 
parse methylation loci, apply fixed thresholds, and output a single 80–120 word paragraph in the same style—bolding 
only significant findings in genomic → epigenetic → transcriptomic → regulatory order.
"""

PROMPT_TEMPLATE = """
DO NOT output raw Z-values, p-values, or confidence scores. DO NOT mention features with normal/neutral status 
(e.g., CNV in [-1.10, -0.10)). DO NOT reorder the biological layers—must 
be: Genomic → Epigenetic → Transcriptomic → Regulatory. DO NOT use absolute values for CNV; respect signed thresholds.
DO NOT include gene names with prefixes—clean to base symbol (e.g., “EGFR”, not “CNV_EGFR”). DO NOT exceed 120 words 
or fall below 80. Only bold biologically significant alterations per the strict diagnostic thresholds. 
Generate one precise academic paragraph.
"""

PROMPT_TEMPLATE = """
Follow these steps exactly:
Preprocess: Remove all prefixes (e.g., “mRNA_” → gene name). Convert “miRNA_hsa-mir-xxx” or “hsa-miR-xxx” → “miR-xxx”. 
For methylation, if name contains “chr”, extract “chrN:Position”; else keep gene name.
Classify CNV: Map signed value to: <−1.10=Deletion; [−1.10,−0.10)=Normal (ignore); [−0.10,0.90)=Gain; 
[0.90,1.90)=Amplification; ≥1.90=High-level Amplification.
Apply other thresholds: mRNA (High>1.30, Low<−1.92); Methylation (Hyper>1.50, Hypo<−1.47); 
miRNA (Up>1.99, Down<−1.25).
Compose: Write one 80–120 word paragraph in layer order. Bold only significant findings. Omit normals.
"""

PROMPT_TEMPLATE = """
Your primary task is flawless string normalization before interpretation. From the input CSV row:
Strip all assay prefixes (“mRNA_”, “CNV_”, etc.) to isolate canonical gene symbols.
Standardize any miRNA identifier (e.g., “miRNA_hsa-mir-155” or “hsa-miR-155”) → “miR-155”.
For methylation features: if the label contains a chromosome coordinate (e.g., “Methylation_chr17 41276045…”), 
extract only “chr17:41276045”; otherwise retain the gene name.
Then, apply the exact diagnostic thresholds (no deviations). Output a single clinical paragraph (80–120 words) 
in genomic→epigenetic→transcriptomic→regulatory sequence, with bolded significant alterations only.
"""

PROMPT_TEMPLATE = """
First, parse the input and output a JSON object with four keys: genomic, epigenetic, transcriptomic, regulatory. 
Each value is a list of strings like "amplification of EGFR" or "hyper-methylation at chr7:140453136", 
derived strictly from the Core Logic thresholds and preprocessing rules. 
Then, convert this JSON into a single cohesive academic paragraph (80–120 words), 
maintaining layer order and bolding each finding. Do not include empty categories or normal calls. 
Final output must be prose only—no JSON visible.
"""

PROMPT_TEMPLATE = """
Interpret the multi-omics profile as an integrated molecular signature driving oncogenesis. 
After cleaning gene names (remove prefixes, standardize miRNA to “miR-xxx”, parse methylation loci), 
identify driver events per layer using the fixed diagnostic thresholds. 
Emphasize concordant or compensatory interactions across genomic instability (CNV), 
epigenetic silencing, transcriptional dysregulation, and post-transcriptional control. 
Synthesize findings into one 80–120 word narrative, ordered by biological hierarchy, 
with bolded key alterations that define the tumor’s molecular phenotype. Exclude neutral findings.
"""

PROMPT_TEMPLATE = """
Input: one CSV row of multi-omics data. Preprocess: strip prefixes (e.g., “mRNA_”→gene), miRNA→“miR-xxx”, 
methylation→“chrN:Position” if coordinate given. 
Apply thresholds: CNV (signed: <−1.10=Del; [−0.10,0.90)=Gain; [0.90,1.90)=Amp; ≥1.90=High-Amp); 
mRNA (High>1.30, Low<−1.92); Methylation (Hyper>1.50, Hypo<−1.47); miRNA (Up>1.99, Down<−1.25). 
Output: one 80–120 word paragraph, order: Genomic→Epigenetic→Transcriptomic→Regulatory. 
Bold only significant findings. No normals, no numbers, no fluff.
"""

PROMPT_TEMPLATE = """
Draft an initial molecular pathology report from the input row, applying all preprocessing and threshold rules. 
Then, critically review your draft: remove any mention of normal/neutral findings, 
verify gene names are cleaned (no prefixes, miRNA standardized, methylation parsed), 
ensure strict layer order, and confirm bolding applies only to significant alterations per thresholds.
Delete raw values and redundant phrasing. Output only the final polished paragraph (80–120 words)—clinical, 
precise, and authoritative—with zero trace of the editing process.
"""

In [12]:
import os
import time
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv  
load_dotenv()

# Chat-completions client pointed at the DashScope OpenAI-compatible endpoint.
# NOTE(review): os.getenv returns None when QWEN_API_KEY is unset — confirm the
# key is present in the environment/.env before running this cell.
client = OpenAI(
    api_key = os.getenv("QWEN_API_KEY"), 
    base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1",
)

PROMPT_TEMPLATE = """
Draft an initial molecular pathology report from the input row, applying all preprocessing and threshold rules. 
Then, critically review your draft: remove any mention of normal/neutral findings, 
verify gene names are cleaned (no prefixes, miRNA standardized, methylation parsed), 
ensure strict layer order, and confirm bolding applies only to significant alterations per thresholds.
 Delete raw values and redundant phrasing. Output only the final polished paragraph (80–120 words)—clinical, 
 precise, and authoritative—with zero trace of the editing process.
"""


def main():
    """Generate a qwen3-max report for every row of the prepared input CSV.

    Reads 'example_0_10.csv' (columns PatientID and Input_Data_String), sends
    each row's data string to the chat-completions endpoint with
    PROMPT_TEMPLATE as the system message, and writes the collected results
    to 'QWEN_10.csv' with columns PatientID, Text_Generate and Raw_Data.

    A failed request does not abort the batch: the error is printed and
    stored as "Error: ..." in the Text_Generate column for that patient.
    """
    input_file = 'example_0_10.csv'
    output_filename = "QWEN_10.csv"
    # Fixed column order so the file keeps its header even when there are no
    # rows. "Text_Generate" replaces the misspelled "Text_Genetate" so this
    # output shares one schema with the other model runs in this notebook.
    output_columns = ["PatientID", "Text_Generate", "Raw_Data"]

    df = pd.read_csv(input_file)
    print(f"Successfully read file with {len(df)} rows. Starting analysis...")

    analysis_results = []

    # Number rows with enumerate rather than the DataFrame index label, which
    # is only 0..n-1 for a default index.
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        patient_id = row['PatientID']
        omics_data = row['Input_Data_String']
        print(f"Processing row {row_number}: {patient_id} ...")

        try:
            # === Core API Call ===
            completion = client.chat.completions.create(
                model="qwen3-max",
                messages=[
                    {"role": "system", "content": PROMPT_TEMPLATE},
                    {"role": "user", "content": f"Here is the omics data row for analysis:\n{omics_data}"},
                ],
            )
            analysis_text = completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is the per-row boundary of a
            # best-effort batch, so any failure is logged and recorded
            # instead of losing the rows already processed.
            print(f"Error processing {patient_id}: {e}")
            analysis_text = f"Error: {e}"

        # Error rows keep Raw_Data too, so a failed patient can be retried
        # straight from the output file.
        analysis_results.append({
            "PatientID": patient_id,
            "Text_Generate": analysis_text,
            "Raw_Data": omics_data,
        })

        # Short pause between requests.
        time.sleep(0.5)

    output_df = pd.DataFrame(analysis_results, columns=output_columns)
    output_df.to_csv(output_filename, index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    main()

Successfully read file with 10 rows. Starting analysis...
Processing row 1: TCGA-EW-A2FS ...
Processing row 2: TCGA-OL-A6VR ...
Processing row 3: TCGA-E9-A226 ...
Processing row 4: TCGA-D8-A27H ...
Processing row 5: TCGA-D8-A3Z6 ...
Processing row 6: TCGA-B6-A1KN ...
Processing row 7: TCGA-BH-A2L8 ...
Processing row 8: TCGA-A8-A075 ...
Processing row 9: TCGA-JL-A3YX ...
Processing row 10: TCGA-D8-A3Z5 ...


# ChatGPT API


In [None]:
PROMPT_TEMPLATE = """
You are a board-certified Molecular Tumor Pathologist reporting for a multidisciplinary tumor board. 
The user supplies a single CSV row string representing multi-omics values (e.g., "mRNA_GENE1: 2.5, CNV_GENE2: -1.6, miRNA_hsa-mir-21: 2.1, 
Methylation_chr3_123456: -2.0").
Preprocess exactly: remove prefixes mRNA_, CNV_; convert miRNA_hsa-mir-xxx or hsa-miR-xxx → miR-xxx; 
for methylation features extract chrN:Position if the name is a long genomic coordinate, otherwise keep gene name.
Apply DIAGNOSTIC THRESHOLDS (no math, do not output numbers): mRNA high if Z>1.30, low if Z<-1.92; Methylation hyper if Z>1.50, 
hypo if Z<-1.47; miRNA up if Z>1.99, down if Z<-1.25; 
CNV map SIGNED value to statuses: < -1.10 => Deletion (Loss), -1.10 <= v < -0.10 => Normal (Ignore), -0.10 <= v < 0.90 => Gain, 
0.90 <= v < 1.90 => Amplification, >=1.90 => High-level Amplification. Do NOT output raw numbers or perform arithmetic.
Report structure (strict order): (1) Genomic (CNV), (2) Epigenetic (Methylation), (3) Transcriptomic (mRNA), (4) Regulatory (miRNA).
Output: a single clinical academic paragraph, 80–120 words, clinical and authoritative. Bold all significant biological findings 
(e.g., **amplification of EGFR**). End.
"""

PROMPT_TEMPLATE = """
You will: 
(A) receive one CSV row string (e.g., "mRNA_TP53: -2.3, CNV_ERBB2: 1.05, hsa-miR-155: 2.2, Methylation_chr17_7579472: 1.8"). 
(B) Follow preprocessing: strip mRNA_ and CNV_; standardize miRNA_hsa-mir-xxx / hsa-miR-xxx → miR-xxx; 
extract methylation as chrN:Position for long names. 
(C) Apply thresholds strictly—mRNA: High Z>1.30, Low Z<-1.92; Methylation: Hyper Z>1.50, Hypo Z<-1.47; miRNA: Up Z>1.99, Down Z<-1.25; 
CNV mapping by SIGNED value to Deletion/Normal/Gain/Amplification/High-level Amplification as specified. DO NOT print numeric values or calculations.
Instruction: First print a compact scratchpad list of detected formatted features and their categorical statuses (one-line each). 
Then produce a final single-paragraph diagnostic report that follows the fixed order: Genomic → Epigenetic → Transcriptomic → Regulatory. 
Final paragraph must be 80–120 words, clinical tone, and bold all significant findings. Do not include the scratchpad in the final paragraph. End.
"""

PROMPT_TEMPLATE = """
You are a molecular pathologist LLM. Input: a single CSV row string. Preprocess and apply thresholds 
as described (strip mRNA_/CNV_, convert miRNA_hsa-mir-xxx or hsa-miR-xxx → miR-xxx, parse long methylation names to chrN:Position). 
Use thresholds exactly (mRNA: Z>1.30 high, Z<-1.92 low; Methylation: Z>1.50 hyper, Z<-1.47 hypo; miRNA: Z>1.99 up, Z<-1.25 down; 
CNV mapping by SIGNED value to five bins as given). No numeric output. Follow order: Genomic, Epigenetic, Transcriptomic, Regulatory. 
Output one paragraph, 80–120 words, clinical, bold all significant findings.
Example (input → desired output):
Input row: "mRNA_EGFR: 1.8, CNV_EGFR: 1.05, hsa-miR-21: -1.5, Methylation_EGFR: -1.6"
Desired Output Paragraph (example):
**Amplification of EGFR** at the genomic level is observed, with concordant **hypo-methylation of EGFR** and **upregulation not detected** 
at the miRNA level; transcriptomic analysis reveals **high EGFR expression** consistent with copy gain, 
suggesting a copy-driven transcriptional activation with epigenetic de-repression.
Now process the user’s single-row input and produce the required paragraph.
"""

PROMPT_TEMPLATE = """
You are an LLM molecular pathologist. Receive one CSV row string. Preprocess strictly: remove mRNA_/CNV_, 
convert miRNA_hsa-mir-xxx or hsa-miR-xxx → miR-xxx; long methylation names → chrN:Position. Apply thresholds 
as provided (mRNA, methylation, miRNA, CNV mapping) and determine categorical statuses.
Do NOT: show any raw numeric values, perform arithmetic or z-score calculations, deviate from the exact diagnostic thresholds, 
reorder the biological flow, include normal-status genes except as ignored for CNV when mapped to “Normal”, or produce more than one paragraph. 
Do NOT include methodology details, uncertainty quantification, or non-biological commentary.
Output: Single paragraph, 80–120 words, authoritative tone, rigid order: Genomic → Epigenetic → Transcriptomic → Regulatory. 
Bold only significant findings. End.
"""

PROMPT_TEMPLATE = """
You will follow these numbered steps on a single CSV row input:
Input: accept one CSV row string (comma separated feature: value).
Clean: strip mRNA_ and CNV_; convert miRNA_hsa-mir-xxx/hsa-miR-xxx → miR-xxx; 
if methylation feature name contains chromosome+position, extract chrN:Position.
Classify CNV: map SIGNED CNV value into: < -1.10 => Deletion (Loss), -1.10<=v<-0.10 => Normal (Ignore), -0.10<=v<0.90 => Gain, 
0.90<=v<1.90 => Amplification, >=1.90 => High-level Amplification.
Apply thresholds (no math): mRNA high if Z>1.30, low if Z<-1.92; Methylation hyper if Z>1.50, hypo if Z<-1.47; miRNA up if Z>1.99, down if Z<-1.25.
Assemble: produce a single paragraph strictly ordered: Genomic → Epigenetic → Transcriptomic → Regulatory.
Format: 80–120 words; clinical tone; bold all significant findings; do not output raw numbers or intermediate steps. End.
"""

PROMPT_TEMPLATE = """
Task: transform one CSV row string of features into a concise molecular pathology paragraph. 
First, robust string-cleaning rules (apply exactly): remove prefixes mRNA_ and CNV_; map miRNA_hsa-mir-xxx, hsa-miR-xxx to miR-xxx (preserve case miR-);
for methylation fields that look like coordinates (contain chr and numeric position), extract chrN:Position only; 
trim whitespace and collapse repeated separators. After cleaning, apply classification thresholds (mRNA Z>1.30 high, Z<-1.92 low; 
Methylation Z>1.50 hyper, Z<-1.47 hypo; miRNA Z>1.99 up, Z<-1.25 down; CNV map by SIGNED bins). No numerical outputs, no calculations. 
Produce one 80–120 word paragraph, clinical and authoritative, following order: Genomic, Epigenetic, Transcriptomic, Regulatory. 
Bold each significant finding.
"""

PROMPT_TEMPLATE = """
Receive one CSV row string. Step A: Extract and clean features (strip mRNA_/CNV_; standardize miRNA names to miR-xxx; 
parse methylation long names to chrN:Position). 
Step B: Classify per thresholds (mRNA, methylation, miRNA, CNV discrete mapping). 
Step C: Emit a compact JSON object with keys genomic, epigenetic, transcriptomic, regulatory, each mapping to an array of detected significant findings 
strings) or empty arrays if none. Do not include numeric values. 
Step D: Convert that JSON into a single polished paragraph (80–120 words) in clinical tone, 
following the order Genomic → Epigenetic → Transcriptomic → Regulatory, and bold the findings that appear in the JSON. 
Only output the final paragraph (not the JSON) as the response.
"""

PROMPT_TEMPLATE = """
Act as a comparative molecular analyst identifying a dominant tumor-driving signature from a single CSV row input. 
Preprocess: strip mRNA_/CNV_; standardize miRNA names to miR-xxx; extract methylation coordinates to chrN:Position. 
Classify features using the exact thresholds (mRNA, methylation, miRNA) and CNV bin mapping by SIGNED value. 
Without showing numbers, evaluate cross-layer concordance and highlight likely drivers where CNV and mRNA or methylation and mRNA point to the same gene.
Output one 80–120 word paragraph, clinical and integrative, strictly ordered: Genomic → Epigenetic → Transcriptomic → Regulatory. 
Bold any putative driver findings (e.g., **amplification and concordant high expression of MYC**).
"""

PROMPT_TEMPLATE = """
Input: one CSV row string. Preprocess: remove mRNA_/CNV_; miRNA_hsa-mir or hsa-miR → miR-; methylation long names → chrN:Position. 
Classify per thresholds: mRNA (High>1.30, Low<-1.92), Methylation (Hyper>1.50, Hypo<-1.47), miRNA (Up>1.99, Down<-1.25), 
CNV map by SIGNED bins: <-1.10 Deletion, [-1.10,-0.10) Normal(IGNORE), [-0.10,0.90) Gain, [0.90,1.90) Amplification, >=1.90 High-level Amplification. 
No numbers, no calculations. Output: one 80–120 word paragraph, order Genomic→Epigenetic→Transcriptomic→Regulatory, clinical tone, bold findings.
"""

PROMPT_TEMPLATE = """
You will perform three passes on a single CSV row input:
Pass 1 (Draft): Preprocess input (strip mRNA_/CNV_; miRNA → miR-xxx; methylation coords → chrN:Position), 
apply thresholds to assign categories (use thresholds exactly). Produce a raw draft paragraph (do not show numbers).
Pass 2 (Critique): Briefly critique the draft against: correct preprocessing, strict use of thresholds, 
correct order (Genomic→Epigenetic→Transcriptomic→Regulatory), and presence of bolded significant findings. 
Do not reveal internal chain-of-thought—keep critique concise (one sentence).
Pass 3 (Polish): Output only the polished final paragraph (80–120 words), clinical, authoritative, with bold on significant findings, 
and no draft or critique included in the final response. End.
"""

In [None]:
import os
import time
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()


# Chat-completions client for the ChatGPT run.
# NOTE(review): base_url is api.openai-proxy.com rather than api.openai.com, so
# requests (including the API key) are sent to that host — confirm this is
# intended. os.getenv returns None when OPENAI_API_KEY is unset.
client = OpenAI(
    api_key = os.getenv("OPENAI_API_KEY"), 
    base_url = "https://api.openai-proxy.com/v1"  
)

PROMPT_TEMPLATE = """
You will perform three passes on a single CSV row input:
Pass 1 (Draft): Preprocess input (strip mRNA_/CNV_; miRNA → miR-xxx; methylation coords → chrN:Position), 
apply thresholds to assign categories (use thresholds exactly). Produce a raw draft paragraph (do not show numbers).
Pass 2 (Critique): Briefly critique the draft against: correct preprocessing, strict use of thresholds, 
correct order (Genomic→Epigenetic→Transcriptomic→Regulatory), and presence of bolded significant findings. 
Do not reveal internal chain-of-thought—keep critique concise (one sentence).
Pass 3 (Polish): Output only the polished final paragraph (80–120 words), clinical, authoritative, with bold on significant findings, 
and no draft or critique included in the final response. End.
"""

def main():
    """Generate a ChatGPT report for every row of the prepared input CSV.

    Reads 'example_0_10.csv' (columns PatientID and Input_Data_String), sends
    each row's data string to the chat-completions endpoint with
    PROMPT_TEMPLATE as the system message, and writes the collected results
    to 'Chatgpt_1.csv' with columns PatientID, Text_Generate and Raw_Data.

    A failed request does not abort the batch: the error is printed and
    stored as "Error: ..." in the Text_Generate column for that patient.
    """
    input_file = 'example_0_10.csv'
    output_filename = "Chatgpt_1.csv"
    # Fixed column order so the file keeps its header even when there are no
    # rows to write.
    output_columns = ["PatientID", "Text_Generate", "Raw_Data"]

    df = pd.read_csv(input_file)
    print(f"Successfully read file with {len(df)} rows. Starting analysis...")

    analysis_results = []

    # Number rows with enumerate rather than the DataFrame index label, which
    # is only 0..n-1 for a default index.
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        patient_id = row['PatientID']
        omics_data = row['Input_Data_String']
        print(f"Processing row {row_number}: {patient_id} ...")

        try:
            completion = client.chat.completions.create(
                model="gpt-5.1-nano",
                messages=[
                    {"role": "system", "content": PROMPT_TEMPLATE},
                    {"role": "user", "content": f"Here is the omics data row for analysis:\n{omics_data}"},
                ],
            )
            analysis_text = completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is the per-row boundary of a
            # best-effort batch, so any failure is logged and recorded
            # instead of losing the rows already processed.
            print(f"Error processing {patient_id}: {e}")
            analysis_text = f"Error: {e}"

        # Error rows keep Raw_Data too, so a failed patient can be retried
        # straight from the output file.
        analysis_results.append({
            "PatientID": patient_id,
            "Text_Generate": analysis_text,
            "Raw_Data": omics_data,
        })

        # Short pause between requests.
        time.sleep(0.5)

    output_df = pd.DataFrame(analysis_results, columns=output_columns)
    output_df.to_csv(output_filename, index=False, encoding='utf-8-sig')
if __name__ == "__main__":
    main()

Successfully read file with 10 rows. Starting analysis...
Processing row 1: TCGA-EW-A2FS ...
Error processing TCGA-EW-A2FS: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Processing row 2: TCGA-OL-A6VR ...
Error processing TCGA-OL-A6VR: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Processing row 3: TCGA-E9-A226 ...
Error processing TCGA-E9-A226: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For 

# Grok

In [None]:
PROMPT_TEMPLATE = """
You are Dr. Alexandra Moreau, board-certified Molecular Tumor Pathologist (MD/PhD) and Director of Genomic Pathology at Memorial Sloan Kettering 
Cancer Center. You are presenting a case at today’s international molecular tumor board.
Input: a single comma-separated string such as "mRNA_TP53: -2.8, CNV_ERBB2: 2.4, 
Methylation_chr10 129048302 MGMT: -1.6, miRNA_hsa-mir-21: 3.1, mRNA_MYC: 2.1".
Your task: deliver one concise, publication-quality academic paragraph (80–120 words) in perfect clinical English, 
strictly following this biological flow: 1. Genomic (CNV) → 2. Epigenetic (methylation) → 3. Transcriptomic (mRNA) → 4. Regulatory (miRNA). 
Bold every significant alteration (e.g. high-level amplification of ERBB2).
Data Pre-processing Rules (apply silently): Strip prefixes like "mRNA_", "CNV_" to leave only the gene name. 
Convert "miRNA_hsa-mir-xxx" or "hsa-miR-xxx" → "miR-xxx". For methylation: if name contains “chr”, 
extract only "chrN:Position" (e.g. "chr10:129048302"); if simple gene name, keep as is.
Diagnostic Thresholds: mRNA High Z>1.30 / Low Z<-1.92; Methylation Hypermethylated Z>1.50 / Hypomethylated Z<-1.47; 
miRNA Upregulated Z>1.99 / Downregulated Z<-1.25; CNV fixed signed-value mapping: < -1.10 → Deletion (Loss); -1.10 to -0.10 → Normal (ignore); 
-0.10 to 0.90 → Gain; 0.90 to 1.90 → Amplification; ≥1.90 → High-level Amplification.
Output ONLY the single 80–120-word paragraph.
"""


PROMPT_TEMPLATE = """
First think step-by-step in a visible scratchpad (show it), then output ONLY the final paragraph.
Scratchpad steps you must show:
List every cleaned gene/symbol and its value
Apply CNV fixed ranges and all Z-score thresholds; mark which pass
Group passing alterations into Genomic / Epigenetic / Transcriptomic / Regulatory
Draft paragraph in strict order
Data Pre-processing Rules: Strip prefixes like "mRNA_", "CNV_". 
Convert "miRNA_hsa-mir-xxx" or "hsa-miR-xxx" → "miR-xxx". Methylation → "chrN:Position" if present, else keep gene.
Diagnostic Thresholds: mRNA High Z>1.30 / Low Z<-1.92; Methylation Hypermethylated Z>1.50 / Hypomethylated Z<-1.47; 
miRNA Upregulated Z>1.99 / Downregulated Z<-1.25; CNV: < -1.10 Deletion; -1.10~-0.10 Normal(ignore); -0.10~0.90 Gain; 0.90~1.90 Amplification;
≥1.90 High-level Amplification.
Output ONLY the final 80–120-word paragraph with bold keywords.
"""

PROMPT_TEMPLATE = """
Example input: "mRNA_TP53: -2.5, CNV_MYCL: 2.3, Methylation_chr10 129048302 MGMT: -1.8, miRNA_hsa-mir-21: 2.7, mRNA_MDM2: 1.6"
Correct example output (97 words):
The tumor harbors high-level amplification of MYCL. Epigenetically, hypomethylation of MGMT at chr10:129048302 suggests restored DNA-repair activity. 
Transcriptomically, marked downregulation of TP53 coexists with overexpression of MDM2, disrupting p53 pathway control. 
Regulatory dysregulation is dominated by strong upregulation of miR-21, a known oncomiR likely reinforcing the oncogenic phenotype.
Now process the new row exactly the same way.
Data Pre-processing Rules: Strip prefixes. miRNA → miR-xxx. Methylation → chrN:Position or keep gene.
Diagnostic Thresholds: same as above (mRNA ±1.30/-1.92, meth ±1.50/-1.47, miRNA ±1.99/-1.25, CNV fixed ranges).
Strict order Genomic→Epigenetic→Transcriptomic→Regulatory. 80–120 words, bold alterations, output ONLY the paragraph.
"""

PROMPT_TEMPLATE = """
First think step-by-step in a visible scratchpad (show it), then output ONLY the final paragraph.
Scratchpad steps you must show:
List every cleaned gene/symbol and its value
Apply CNV fixed ranges and all Z-score thresholds; mark which pass
Group passing alterations into Genomic / Epigenetic / Transcriptomic / Regulatory
Draft paragraph in strict order
Data Pre-processing Rules: Strip prefixes like "mRNA_", "CNV_". 
Convert "miRNA_hsa-mir-xxx" or "hsa-miR-xxx" → "miR-xxx". Methylation → "chrN:Position" if present, else keep gene.
Diagnostic Thresholds: mRNA High Z>1.30 / Low Z<-1.92; Methylation Hypermethylated Z>1.50 / Hypomethylated Z<-1.47; 
miRNA Upregulated Z>1.99 / Downregulated Z<-1.25; CNV: < -1.10 Deletion; -1.10~-0.10 Normal(ignore); -0.10~0.90 Gain; 0.90~1.90 Amplification;
≥1.90 High-level Amplification.
Output ONLY the final 80–120-word paragraph with bold keywords.
"""

PROMPT_TEMPLATE = """
Execute exactly these steps in order:
Parse input string
Clean names: strip prefixes, miRNA→miR-xxx, methylation→chrN:Position or keep gene
Apply thresholds below, discard non-passing
Write ONE 80–120-word paragraph strictly Genomic → Epigenetic → Transcriptomic → Regulatory, bold everything
Output nothing else
Thresholds:
• mRNA High Z>1.30 / Low Z<-1.92
• Methylation Hyper Z>1.50 / Hypo Z<-1.47
• miRNA Up Z>1.99 / Down Z<-1.25
• CNV < -1.10 Del | -1.10~-0.10 ignore | -0.10~0.90 Gain | 0.90~1.90 Amp | ≥1.90 HighAmp
"""

PROMPT_TEMPLATE = """
You are a bioinformatics parser. Perfect cleaning first:
• Strip every prefix
• Convert any miRNA format → miR-xxx
• Methylation: extract only chrN:Position if present, else keep gene
Only after 100% clean names, apply:
• mRNA High Z>1.30 / Low Z<-1.92
• Methylation Hyper Z>1.50 / Hypo Z<-1.47
• miRNA Up Z>1.99 / Down Z<-1.25
• CNV < -1.10 Del | -1.10~-0.10 ignore | -0.10~0.90 Gain | 0.90~1.90 Amp | ≥1.90 HighAmp
Then output ONLY the 80–120-word paragraph in strict order with bold.
"""

PROMPT_TEMPLATE = """
First silently create this exact JSON structure (include ONLY features that pass thresholds after cleaning):
{
"genomic": ["ERBB2 High-level Amplification", "MYC Amplification", ...],
"epigenetic": ["MGMT chr10:129048302 Hypomethylated", "MLH1 chr3:37009912 Hypermethylated", ...],
"transcriptomic": ["TP53 Low", "MDM2 High", "EGFR High", ...],
"regulatory": ["miR-21 Upregulated", "miR-15 Downregulated", ...]
}
Then transform it into ONE seamless, flowing, publication-quality academic paragraph of exactly 80–120 words that strictly 
follows the order Genomic → Epigenetic → Transcriptomic → Regulatory. Bold every item from the JSON.
Data Pre-processing Rules:
• Strip all prefixes (mRNA_, CNV_, Methylation_, etc.)
• Convert "miRNA_hsa-mir-xxx" or "hsa-miR-xxx" → "miR-xxx"
• Methylation: if contains “chr”, keep only "chrN:Position"; otherwise keep the gene name
Diagnostic Thresholds (apply strictly):
• mRNA High Z > 1.30 | Low Z < -1.92
• Methylation Hypermethylated Z > 1.50 | Hypomethylated Z < -1.47
• miRNA Upregulated Z > 1.99 | Downregulated Z < -1.25
• CNV (signed value):
< -1.10 → Deletion (Loss)
-1.10 to -0.10 → Normal (ignore)
-0.10 to 0.90 → Gain
0.90 to 1.90 → Amplification
≥ 1.90 → High-level Amplification
Output ONLY the final paragraph.
"""

PROMPT_TEMPLATE = """
You are identifying the dominant multi-omics driver signature of this tumor. After perfect cleaning and threshold application, 
actively look for and weave biologically plausible cross-layer connections (e.g., high-level amplification + mRNA overexpression, 
deletion + hypomethylation + mRNA low, etc.).
Write ONE authoritative academic paragraph of 80–120 words that tells a coherent mechanistic story, 
strictly in the order: Genomic → Epigenetic → Transcriptomic → Regulatory. Bold every key connected event.
Data Pre-processing Rules:
• Strip all prefixes
• miRNA → miR-xxx
• Methylation → chrN:Position if present, else gene name
Diagnostic Thresholds:
• mRNA High Z > 1.30 | Low Z < -1.92
• Methylation Hypermethylated Z > 1.50 | Hypomethylated Z < -1.47
• miRNA Upregulated Z > 1.99 | Downregulated Z < -1.25
• CNV < -1.10 Deletion | -1.10 to -0.10 ignore | -0.10 to 0.90 Gain | 0.90 to 1.90 Amplification | ≥ 1.90 High-level Amplification
Output ONLY the paragraph.
"""

PROMPT_TEMPLATE = """
Input = one CSV string.
Clean: strip all prefixes, miRNA_hsa-mir-xxx → miR-xxx, methylation keep only chrN:Position or gene.
Thresholds:
mRNA >1.30 = High, <-1.92 = Low
Methylation >1.50 = Hypermethylated, <-1.47 = Hypomethylated
miRNA >1.99 = Upregulated, <-1.25 = Downregulated
CNV < -1.10 = Deletion | -1.10 to -0.10 = ignore | -0.10 to 0.90 = Gain | 0.90 to 1.90 = Amplification | ≥1.90 = High-level Amplification
Output ONLY one 80–120 word academic paragraph.
Strict order: Genomic → Epigenetic → Transcriptomic → Regulatory.
Bold every alteration. No raw numbers ever.
"""

PROMPT_TEMPLATE = """
Phase 1 (show this part): Write a rough draft of the report. Numbers, wrong order, missing bold are allowed here.
Phase 2 (show this part): Critique the draft against every rule below and list all fixes needed.
Phase 3: Output ONLY the final polished single paragraph (80–120 words, perfect cleaning, perfect thresholds, perfect order, perfect bolding).
Cleaning & Thresholds (apply exactly):
• Strip all prefixes
• miRNA → miR-xxx
• Methylation → chrN:Position or gene
• mRNA High Z>1.30 / Low Z<-1.92
• Methylation Hyper Z>1.50 / Hypo Z<-1.47
• miRNA Up Z>1.99 / Down Z<-1.25
• CNV < -1.10 Del | -1.10 to -0.10 ignore | -0.10 to 0.90 Gain | 0.90 to 1.90 Amp | ≥1.90 HighAmp
Order must be Genomic → Epigenetic → Transcriptomic → Regulatory.
"""

# Gemini

In [None]:
# Persona-plus-protocol prompt variant: sets a pathologist role, then lists the
# input slot, the cleaning rules, the per-layer thresholds and the report
# requirements under separate markdown headings.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
You are a Board-Certified Molecular Tumor Pathologist with 20 years of experience in precision oncology. 
Your task is to interpret a raw multi-omics data string for a clinical report.

### INPUT DATA:
{{USER_INPUT_STRING}}

### DIAGNOSTIC PROTOCOL:
1. **Data Cleaning:**
   - Strip prefixes (mRNA_, CNV_) to isolate gene names.
   - Standardize miRNAs: "miRNA_hsa-mir-xxx" -> "miR-xxx".
   - Parse Methylation: Extract "chrN:Position" from long strings; keep simple gene names as is.

2. **Threshold Evaluation (Strict Adherence Required):**
   - **CNV (Genomic):** < -1.10 (Deletion); -1.10 to -0.10 (Normal/Ignore); -0.10 to 0.90 (Gain); 0.90 to 1.90 (Amplification); 
   ≥ 1.90 (High-level Amplification).
   - **Methylation (Epigenetic):** > 1.50 (Hyper-methylated); < -1.47 (Hypo-methylated).
   - **mRNA (Transcriptomic):** > 1.30 (High); < -1.92 (Low).
   - **miRNA (Regulatory):** > 1.99 (Upregulated); < -1.25 (Downregulated).

### REPORT REQUIREMENTS:
Draft a sophisticated academic paragraph (80-120 words). Structure the narrative strictly in this biological order: 
**Genomic -> Epigenetic -> Transcriptomic -> Regulatory**.
- **Bold** all significant findings using the terminology defined above.
- Do not report normal values or raw numbers.
- Maintain a clinical, authoritative tone.
"""

# Step-by-step prompt variant: asks the model to reason inside <scratchpad>
# tags (pre-processing, threshold check, narrative) and then emit only the
# final paragraph. CNV bins are written as half-open intervals.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Perform a multi-omics analysis on the provided data string.

Input: {{USER_INPUT_STRING}}

### INSTRUCTIONS:
Please think step-by-step inside <scratchpad> tags before generating the final report.

**Step 1: Pre-processing**
- Clean the strings. Remove "mRNA_", "CNV_". Convert miRNA to "miR-xxx". Format Methylation as "chrN:Position".

**Step 2: Logic Check**
- Compare every value against these tables:
  * CNV: < -1.10 (Deletion), [-1.10, -0.10) (Normal), [-0.10, 0.90) (Gain), [0.90, 1.90) (Amplification), ≥ 1.90 (High-level Amplification).
  * Methylation: > 1.50 (Hyper-methylated), < -1.47 (Hypo-methylated).
  * mRNA: > 1.30 (High), < -1.92 (Low).
  * miRNA: > 1.99 (Upregulated), < -1.25 (Downregulated).

**Step 3: Narrative Construction**
- Write the final output as a single academic paragraph (80-120 words).
- Order: Genomic > Epigenetic > Transcriptomic > Regulatory.
- **Bold** the findings.

Output ONLY the final paragraph after your scratchpad.
"""

# One-shot prompt variant: states the rules briefly, shows a single worked
# input/output example, then leaves an open "Output:" slot for the real input.
# NOTE(review): unlike the sibling prompts, no word-count limit is stated here
# -- confirm whether that is intentional.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Task: Convert a multi-omics CSV row into a clinical pathology report.

### RULES:
1. Clean names (Remove prefixes, format miR-xxx, format chrN:Position).
2. Apply Thresholds:
   - CNV: < -1.10 (Deletion); ≥ 1.90 (High-level Amplification); 0.90-1.90 (Amplification); -0.10-0.90 (Gain). Ignore [-1.10 to -0.10).
   - Meth: > 1.50 (Hyper); < -1.47 (Hypo).
   - mRNA: > 1.30 (High); < -1.92 (Low).
   - miRNA: > 1.99 (Upregulated); < -1.25 (Downregulated).
3. Order: Genomic, Epigenetic, Transcriptomic, Regulatory.

### EXAMPLE:
**Input:** "CNV_EGFR: 2.1, mRNA_TP53: -2.0, Methylation_chr7_5500: -1.8, miRNA_hsa-mir-21: 2.5"
**Output:** Genomic profiling identifies a **high-level amplification of EGFR**. Epigenetic analysis reveals **hypo-methylation at chr7:5500**, 
suggesting potential promoter accessibility. Transcriptomic data correlates with **low expression of TP53**. 
Finally, regulatory assessment demonstrates that **miR-21 is upregulated**, consistent with oncogenic signaling.

### YOUR TURN:
**Input:** {{USER_INPUT_STRING}}
**Output:**
"""

# Negative-constraint prompt variant: every rule is phrased as a prohibition
# (no raw numbers, no normal values, no reordering, no threshold errors, no
# uncleaned names), followed by a one-line generation request.
# The miRNA cleaning rule names the raw feature form "miRNA_hsa-mir-xxx", the
# same form the other prompt variants in this file use.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Interpret the following multi-omics data string: {{USER_INPUT_STRING}}

### STRICT NEGATIVE CONSTRAINTS (DO NOT VIOLATE):
1. **NO Raw Numbers:** Do not output the Z-scores in the final text.
2. **NO Normal Values:** If CNV is between -1.10 and -0.10, do not mention it.
3. **NO Deviation from Order:** You must write in this order: Genomic -> Epigenetic -> Transcriptomic -> Regulatory.
4. **NO Calculation Errors:** Adhere strictly to these cutoffs:
   - mRNA: High > 1.30, Low < -1.92
   - Methylation: Hyper > 1.50, Hypo < -1.47
   - miRNA: Upregulated > 1.99, Downregulated < -1.25
   - CNV: Deletion < -1.10, Gain [-0.10, 0.90), Amp [0.90, 1.90), High Amp >= 1.90.
5. **NO Dirty Data:** Strip "mRNA_", "CNV_". Convert "miRNA_hsa-mir-xxx" to "miR-xxx". Fix methylation to "chrN:Position".

Generate a single, precise, authoritative paragraph (80-120 words) with **bold** findings.
"""

# Procedure-style prompt variant: a numbered three-step recipe (sanitize
# feature names, assign a status per layer, compose the report) ending in an
# open "Output:" slot.
# The CNV step spells out the -1.10 to -0.10 band as normal/unreported so that
# every CNV value falls into a defined bin.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Execute the following procedure to generate a Molecular Pathology Report.

**Input String:** {{USER_INPUT_STRING}}

**Procedure:**
1. **Sanitize Feature Names:**
   - Remove "mRNA_" and "CNV_" prefixes.
   - Transform "miRNA_hsa-mir-..." to "miR-...".
   - Extract "chrN:Position" for complex methylation names.

2. **Assign Biological Status:**
   - For CNV: Map values < -1.10 to **Deletion**, -0.10 to 0.90 to **Gain**, 0.90 to 1.90 to **Amplification**,
    and ≥ 1.90 to **High-level Amplification**. Values from -1.10 to -0.10 are normal; do not report them.
   - For Methylation: Map > 1.50 to **Hyper-methylated** and < -1.47 to **Hypo-methylated**.
   - For mRNA: Map > 1.30 to **High** and < -1.92 to **Low**.
   - For miRNA: Map > 1.99 to **Upregulated** and < -1.25 to **Downregulated**.

3. **Compose Report:**
   - Combine findings into a single paragraph.
   - Use the strict order: Genomic, Epigenetic, Transcriptomic, Regulatory.
   - **Bold** the condition and gene name (e.g., "**deletion of PTEN**").
   - Length: 80-120 words. Tone: Clinical.

**Output:**
"""

# Phase-based prompt variant: standardization is taught through explicit
# "Target -> Result" name mappings, followed by the per-layer classification
# filters and a synthesis instruction.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
You are an expert in bioinformatics data curation and reporting.

**Input Data:** {{USER_INPUT_STRING}}

**Phase 1: Standardization**
- Target: "miRNA_hsa-mir-21" -> Result: "miR-21"
- Target: "Methylation_chr2 6621..." -> Result: "chr2:6621..."
- Target: "mRNA_BRCA1" -> Result: "BRCA1"
- Target: "CNV_MYC" -> Result: "MYC"

**Phase 2: Classification**
Apply these exact filters:
- **Genomic (CNV):** Deletion (<-1.10), Normal (-1.10 to -0.10), Gain (-0.10 to 0.90), Amp (0.90 to 1.90), High Amp (>= 1.90).
- **Epigenetic (Meth):** Hyper (> 1.50), Hypo (< -1.47).
- **Transcriptomic (mRNA):** High (> 1.30), Low (< -1.92).
- **Regulatory (miRNA):** Up (> 1.99), Down (< -1.25).

**Phase 3: Synthesis**
Write a clinical summary (80-120 words). Follow the layer order: Genomic, Epigenetic, Transcriptomic, Regulatory. **Bold** 
the standardized name and status.
"""

# Structured-extraction prompt variant: the per-layer rules are given as a
# JSON-like object the model is told to use internally (not to output), and the
# narrative must follow the order of that object's keys.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here.
# NOTE(review): the JSON sketch contains literal single "{" / "}" characters,
# which str.format() would try to parse as replacement fields -- confirm the
# caller substitutes the placeholder with a plain string replacement instead.
PROMPT_TEMPLATE = """
Processing Task: Multi-omics Data Interpretation.

Input: {{USER_INPUT_STRING}}

**Part 1: Extraction**
Internalize the data and map to the following logic (do not output this JSON, just use it to structure your thoughts):
{
  "Genomic": "CNV values (Strip prefix). <-1.10: Deletion, [-0.10, 0.90): Gain, [0.90, 1.90): Amplification, >=1.90: High-level Amplification",
  "Epigenetic": "Methylation values (Format chrN:Position). >1.50: Hyper-methylated, <-1.47: Hypo-methylated",
  "Transcriptomic": "mRNA values (Strip prefix). >1.30: High, <-1.92: Low",
  "Regulatory": "miRNA values (Format miR-xxx). >1.99: Upregulated, <-1.25: Downregulated"
}

**Part 2: Narrative Generation**
Based on the extracted logic, write a single academic paragraph.
- Strictly follow the order of keys in the JSON above.
- **Bold** findings.
- Ignore CNV values between -1.10 and -0.10.
- Tone: Clinical and precise. 80-120 words.
"""

# Analysis-framework prompt variant: the four layers are presented as named
# analytical lenses (Genomic Architecture, Epigenetic Landscape,
# Transcriptional Output, Regulatory Control), each with its own cutoffs.
# The CNV item spells out the -1.10 to -0.10 band as normal/unreported so that
# every CNV value falls into a defined bin.
# NOTE(review): unlike most sibling prompts, no word-count limit is stated
# here -- confirm whether that is intentional.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Analyze the provided multi-omics signature to identify key molecular drivers of pathology.

**Data:** {{USER_INPUT_STRING}}

**Analysis Framework:**
1. **Genomic Architecture:** Assess Copy Number Variations. Identify **Deletions** (< -1.10), **Gains** (-0.10 to 0.90), 
**Amplifications** (0.90 to 1.90), or **High-level Amplifications** (≥ 1.90). Treat values from -1.10 to -0.10 as normal and do not report them.
2. **Epigenetic Landscape:** Assess Methylation. Identify **Hyper-methylation** (> 1.50) or **Hypo-methylation** (< -1.47). Format as "chrN:Position".
3. **Transcriptional Output:** Assess mRNA. Identify **High** (> 1.30) or **Low** (< -1.92) expression.
4. **Regulatory Control:** Assess miRNA. Identify **Upregulated** (> 1.99) or **Downregulated** (< -1.25) status (format as "miR-xxx").

**Deliverable:**
Construct a cohesive, clinically phrased paragraph describing this tumor's molecular signature. Move linearly from Genomic to Regulatory layers. 
**Bold** significant anomalies.
"""

# Bracketed-directive prompt variant: a minimal "[Input String]" /
# "[Directives]" layout with abbreviated labels and CNV bins written as
# half-open intervals.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
[Input String]: {{USER_INPUT_STRING}}

[Directives]
1. Parse & Clean: Remove prefixes. Format "miR-xxx". Format "chrN:Position".
2. Filter & Label:
   - mRNA: High(>1.30), Low(<-1.92)
   - Meth: Hyper(>1.50), Hypo(<-1.47)
   - miRNA: Up(>1.99), Down(<-1.25)
   - CNV: Deletion(<-1.10), Normal([-1.10, -0.10)), Gain([-0.10, 0.90)), Amp([0.90, 1.90)), HighAmp(>=1.90)
3. Output:
   - Single paragraph. 80-120 words. Academic tone.
   - Order: Genomic > Epigenetic > Transcriptomic > Regulatory.
   - **Bold** findings.
"""

# Self-review prompt variant: gives a compact logic reference, then asks the
# model to draft the report, check the draft against a short checklist, and
# output only the polished version.
# The logic reference defines the "Normal" CNV band (-1.10 to -0.10) so that the
# checklist question about ignoring "Normal" CNVs refers to a defined range.
# {{USER_INPUT_STRING}} marks where the data string goes; the substitution code
# is not visible here -- presumably a plain string replacement, TODO confirm.
PROMPT_TEMPLATE = """
Acting as a Molecular Pathologist, process this data: {{USER_INPUT_STRING}}

**Logic Reference:**
- Names: Strip prefixes, standardize "miR-xxx", "chrN:Position".
- CNV: Deletion (< -1.10), Normal (-1.10 to -0.10), Gain (-0.10 to 0.90), Amp (0.90 to 1.90), High Amp (≥ 1.90).
- Meth: Hyper (> 1.50), Hypo (< -1.47).
- mRNA: High (> 1.30), Low (< -1.92).
- miRNA: Up (> 1.99), Down (< -1.25).

**Task:**
1. Draft a clinical report covering Genomic, Epigenetic, Transcriptomic, and Regulatory layers.
2. Review your draft:
   - Did you bold the findings?
   - Is it between 80-120 words?
   - Did you ignore "Normal" CNVs?
3. Output ONLY the polished, final version.
"""