# Example data

In [1]:
import pandas as pd
import os


# Source feature table read by main(); expected to contain a 'PatientID' column.
INPUT_FILE = '../mRMR/Final_Forced_MultiOmics_Signature_Renamed.csv'
# Destination CSV for the (PatientID, Input_Data_String) table written by main().
OUTPUT_FILE = '10_data_string.csv'

# Positional (iloc) row range to export.
START_INDEX = 0   
END_INDEX = 10    # exclusive upper bound; None means "through the last row"


def format_row_for_prompt(row):
    """Serialize one patient's features as a ``"name: value, ..."`` string.

    Args:
        row: A pandas Series for one patient whose index holds the column
            names, including ``'PatientID'``.

    Returns:
        The non-ID entries joined with ``", "`` in their original order,
        each rendered as ``"<column>: <value>"``. Missing values (as judged
        by ``pd.isna``) are rendered as ``"N/A"``.

    Raises:
        KeyError: If ``row`` has no ``'PatientID'`` label.
    """
    # Only the feature columns go into the text; the patient ID is excluded.
    features = row.drop('PatientID')
    return ", ".join(
        f"{name}: {'N/A' if pd.isna(value) else str(value)}"
        for name, value in features.items()
    )

def main():
    """Build one prompt-ready feature string per patient and save them as CSV.

    Reads INPUT_FILE, keeps the rows from START_INDEX (inclusive) to
    END_INDEX (exclusive; ``None`` means through the last row), formats each
    kept row with ``format_row_for_prompt`` and writes a two-column table
    (``PatientID``, ``Input_Data_String``) to OUTPUT_FILE.
    """
    df = pd.read_csv(INPUT_FILE)
    # A slice bound of None already means "to the end", so one positional
    # slice covers both the bounded and the unbounded case.
    target_df = df.iloc[START_INDEX:END_INDEX]
    prepared_df = pd.DataFrame()
    prepared_df['PatientID'] = target_df['PatientID']
    prepared_df['Input_Data_String'] = target_df.apply(format_row_for_prompt, axis = 1)
    # utf-8-sig writes a BOM at the start of the file.
    prepared_df.to_csv(OUTPUT_FILE, index = False, encoding = 'utf-8-sig')


if __name__ == "__main__":
    main()

# qwen3-max


https://bailian.console.aliyun.com/?spm=a2c4g.11186623.0.0.58117dc5ppShUA&tab=model#/model-market/detail/qwen3-max
![qwen_price](qwen.png)

In [None]:
# Prompt variants tried with qwen3-max. Every assignment in this cell rebinds
# PROMPT_TEMPLATE, so only the last one executed is in effect.
# Variant: expert-persona prompt asking for a single 80-120 word paragraph.
PROMPT_TEMPLATE = """
You are Dr. Elena Voss, MD, PhD—a board-certified Molecular Tumor Pathologist at a leading NCI-designated cancer center. 
Analyze the provided multi-omics CSV row with clinical precision. 
Apply strict diagnostic thresholds to classify genomic (CNV), epigenetic (methylation), transcriptomic (mRNA), and regulatory (miRNA) aberrations. 
Strip prefixes (e.g., “mRNA_”), standardize miRNA nomenclature to “miR-xxx”, and parse methylation loci as “chrN:Position” where applicable. 
Report only significant findings in a single authoritative paragraph (80–120 words), ordered by biological layer, with bolded key alterations. 
Omit normal calls and raw values.
"""

PROMPT_TEMPLATE = """
Think step-by-step. First, preprocess the input string: remove prefixes (“mRNA_”, “CNV_”, etc.), 
convert miRNA names to “miR-xxx”, and extract methylation coordinates as “chrN:Position” if needed. 
Then, for each feature, apply the exact diagnostic thresholds from the Core Logic—no calculations, 
just mapping. In a hidden scratchpad, list all significant findings per layer: Genomic (CNV), Epigenetic, 
Transcriptomic, Regulatory. Finally, synthesize these into one clinical paragraph (80–120 words), 
bolding all significant alterations, in strict biological order. Do not include normal or non-significant data.
"""

# This variant did not work well
PROMPT_TEMPLATE = """
Example Input:
"mRNA_TP53: -2.10, CNV_EGFR: 2.30, Methylation_chr7:140453136: 1.60, miRNA_hsa-mir-21: 2.05"
Example Output:
Genomic analysis reveals high-level amplification of EGFR. Epigenetically, 
there is hyper-methylation at chr7:140453136. Transcriptomically, TP53 expression is low. 
Regulatory assessment shows upregulation of miR-21, consistent with oncogenic silencing of tumor suppressors.
Now analyze the user’s input row using identical logic: strip prefixes, standardize miRNA to “miR-xxx”, 
parse methylation loci, apply fixed thresholds, and output a single 80–120 word paragraph in the same style—bolding 
only significant findings in genomic → epigenetic → transcriptomic → regulatory order.
"""

PROMPT_TEMPLATE = """
DO NOT output raw Z-values, p-values, or confidence scores. DO NOT mention features with normal/neutral status 
(e.g., CNV in [-1.10, -0.10)). DO NOT reorder the biological layers—must 
be: Genomic → Epigenetic → Transcriptomic → Regulatory. DO NOT use absolute values for CNV; respect signed thresholds.
DO NOT include gene names with prefixes—clean to base symbol (e.g., “EGFR”, not “CNV_EGFR”). DO NOT exceed 120 words 
or fall below 80. Only bold biologically significant alterations per the strict diagnostic thresholds. 
Generate one precise academic paragraph.
"""

PROMPT_TEMPLATE = """
Follow these steps exactly:
Preprocess: Remove all prefixes (e.g., “mRNA_” → gene name). Convert “miRNA_hsa-mir-xxx” or “hsa-miR-xxx” → “miR-xxx”. 
For methylation, if name contains “chr”, extract “chrN:Position”; else keep gene name.
Classify CNV: Map signed value to: <−1.10=Deletion; [−1.10,−0.10)=Normal (ignore); [−0.10,0.90)=Gain; 
[0.90,1.90)=Amplification; ≥1.90=High-level Amplification.
Apply other thresholds: mRNA (High>1.30, Low<−1.92); Methylation (Hyper>1.50, Hypo<−1.47); 
miRNA (Up>1.99, Down<−1.25).
Compose: Write one 80–120 word paragraph in layer order. Bold only significant findings. Omit normals.
"""

PROMPT_TEMPLATE = """
Your primary task is flawless string normalization before interpretation. From the input CSV row:
Strip all assay prefixes (“mRNA_”, “CNV_”, etc.) to isolate canonical gene symbols.
Standardize any miRNA identifier (e.g., “miRNA_hsa-mir-155” or “hsa-miR-155”) → “miR-155”.
For methylation features: if the label contains a chromosome coordinate (e.g., “Methylation_chr17 41276045…”), 
extract only “chr17:41276045”; otherwise retain the gene name.
Then, apply the exact diagnostic thresholds (no deviations). Output a single clinical paragraph (80–120 words) 
in genomic→epigenetic→transcriptomic→regulatory sequence, with bolded significant alterations only.
"""

PROMPT_TEMPLATE = """
First, parse the input and output a JSON object with four keys: genomic, epigenetic, transcriptomic, regulatory. 
Each value is a list of strings like "amplification of EGFR" or "hyper-methylation at chr7:140453136", 
derived strictly from the Core Logic thresholds and preprocessing rules. 
Then, convert this JSON into a single cohesive academic paragraph (80–120 words), 
maintaining layer order and bolding each finding. Do not include empty categories or normal calls. 
Final output must be prose only—no JSON visible.
"""

PROMPT_TEMPLATE = """
Interpret the multi-omics profile as an integrated molecular signature driving oncogenesis. 
After cleaning gene names (remove prefixes, standardize miRNA to “miR-xxx”, parse methylation loci), 
identify driver events per layer using the fixed diagnostic thresholds. 
Emphasize concordant or compensatory interactions across genomic instability (CNV), 
epigenetic silencing, transcriptional dysregulation, and post-transcriptional control. 
Synthesize findings into one 80–120 word narrative, ordered by biological hierarchy, 
with bolded key alterations that define the tumor’s molecular phenotype. Exclude neutral findings.
"""

PROMPT_TEMPLATE = """
Input: one CSV row of multi-omics data. Preprocess: strip prefixes (e.g., “mRNA_”→gene), miRNA→“miR-xxx”, 
methylation→“chrN:Position” if coordinate given. 
Apply thresholds: CNV (signed: <−1.10=Del; [−0.10,0.90)=Gain; [0.90,1.90)=Amp; ≥1.90=High-Amp); 
mRNA (High>1.30, Low<−1.92); Methylation (Hyper>1.50, Hypo<−1.47); miRNA (Up>1.99, Down<−1.25). 
Output: one 80–120 word paragraph, order: Genomic→Epigenetic→Transcriptomic→Regulatory. 
Bold only significant findings. No normals, no numbers, no fluff.
"""

PROMPT_TEMPLATE = """
Draft an initial molecular pathology report from the input row, applying all preprocessing and threshold rules. 
Then, critically review your draft: remove any mention of normal/neutral findings, 
verify gene names are cleaned (no prefixes, miRNA standardized, methylation parsed), 
ensure strict layer order, and confirm bolding applies only to significant alterations per thresholds.
Delete raw values and redundant phrasing. Output only the final polished paragraph (80–120 words)—clinical, 
precise, and authoritative—with zero trace of the editing process.
"""







# 2nd prompt
PROMPT_TEMPLATE = """
Think step-by-step. First, preprocess the input string: remove prefixes (“mRNA_”, “CNV_”, etc.), 
convert miRNA names to “miR-xxx”, and extract methylation coordinates as “chrN:Position” if needed. 
Then, for each feature, apply the exact diagnostic thresholds from the Core Logic — do not output or display 
any numeric values, scores, or thresholds; only report whether the feature is significant or not. 
In a hidden scratchpad, list all significant findings per layer: Genomic (CNV), Epigenetic, Transcriptomic, 
Regulatory. Finally, synthesize these into one clinical paragraph (80–120 words), bolding all significant alterations, 
in strict biological order. Do not include normal or non-significant data. Never display numeric measurements, 
fold-changes, p-values, or threshold cutoffs in any output.
"""

# 3rd prompt
PROMPT_TEMPLATE = """
Think step-by-step. First, preprocess the input string: remove prefixes (“mRNA_”, “CNV_”, etc.), 
convert miRNA names to “miR-xxx”, and extract methylation coordinates as “chrN:Position” if needed. 
Then, for each feature, apply the exact diagnostic thresholds from the Core Logic — do not output or display 
any numeric values, scores, or thresholds; only report whether the feature is significant or not. 
In a hidden scratchpad, list all significant findings per layer: Genomic (CNV), Epigenetic, Transcriptomic, 
Regulatory. Finally, synthesize these into ONE clinical sentence (≤30 words) that directly links the bolded significant alterations 
to the most biologically plausible breast cancer subtype (e.g., Luminal A, HER2-enriched, Basal-like). 
Do not include normal or non-significant data. Never display numeric measurements, fold-changes, p-values, or threshold cutoffs.
"""

# 4th prompt
PROMPT_TEMPLATE = """
Think step-by-step. First, preprocess the input string: remove prefixes (“mRNA_”, “CNV_”, etc.), 
convert miRNA names to “miR-xxx”, and extract methylation coordinates as “chrN:Position” if needed. 
Then, for each feature, apply the exact diagnostic thresholds from the Core Logic — do not output or display 
any numeric values, scores, or thresholds; only internally note whether the feature is significant or not. 

In a hidden scratchpad (not shown in output), list all significant findings per layer: Genomic (CNV), Epigenetic, 
Transcriptomic, Regulatory. 

Finally, synthesize these into ONE AND ONLY ONE CLINICAL SENTENCE (strictly ≤30 words) that directly links the bolded 
significant alterations 
to the most biologically plausible breast cancer subtype (e.g., Luminal A, HER2-enriched, Basal-like). 

CRITICAL RULES:
- NEVER display numeric measurements, fold-changes, p-values, or threshold cutoffs.
- NEVER include normal or non-significant data.
- NEVER add bullet points, headers, explanations, or extra sentences.
- OUTPUT MUST BE EXACTLY ONE SENTENCE — NOTHING BEFORE, NOTHING AFTER.

Begin.
"""

In [3]:
import os
import time
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv  
load_dotenv()

# OpenAI-compatible client aimed at the DashScope compatible-mode endpoint;
# the API key is read from the QWEN_API_KEY environment variable.
client = OpenAI(
    api_key = os.getenv("QWEN_API_KEY"), 
    base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Prompt used by main() in this cell. It lists the per-layer thresholds and the
# output constraints, and contains one `{omics_data_string}` placeholder under
# "Data to Analyze" for the patient's feature string.
PROMPT_TEMPLATE = """
Role: Expert Molecular Pathologist.
Task: Generate a **high-density biological summary** for patient embedding based on STRICT diagnostic thresholds.

**Diagnostic Rules (Apply these strictly to filter noise):**
1. **mRNA**: High if Z > 1.30; Low if Z < -1.92. (Ignore -1.92 to 1.30).
2. **Methylation**: Hyper-methylated if Z > 1.50; Hypo-methylated if Z < -1.47. (Ignore others).
3. **miRNA**: Upregulated if Z > 1.99; Downregulated if Z < -1.25.
4. **CNV (Genomic Status)**:
   - Value < -1.10 -> **Deletion**
   - -1.10 <= Value < -0.10 -> **Normal (IGNORE)**
   - -0.10 <= Value < 0.90 -> **Gain**
   - 0.90 <= Value < 1.90 -> **Amplification**
   - Value >= 1.90 -> **High-level Amplification**

**Output Constraints:**
1. **Filter**: ONLY report features that meet the criteria above. IGNORE everything else.
2. **Causal Logic**: Connect genomic/epigenetic drivers to transcriptional outcomes (e.g., "High-level Amplification 
drives mRNA overexpression").
3. **No Fluff**: Start directly with the biology. No "The patient shows...".
4. **Length**: STRICTLY under 60 words.

**Desired Output Style**:
"High-level Amplification of [Gene A] drives significant mRNA overexpression. 
[Gene B] promoter hypermethylation mediates epigenetic silencing. 
Deletion of [Gene C] correlates with transcriptional downregulation. 
These features collectively indicate a [Subtype] phenotype with [Pathway] dysregulation."

**Data to Analyze**:
{omics_data_string}

**Output**:
"""


def main():
    """Generate one LLM summary per patient and save them to ``Text_node.csv``.

    Reads ``702_data_string.csv`` (columns ``PatientID`` and
    ``Input_Data_String``), sends each row to the ``qwen3-max`` chat model
    and collects the reply text. A failed request is recorded as an
    ``"Error: ..."`` string in place of the reply so the run continues.

    The output table has the columns ``PatientID``, ``Text_Genetate`` and
    ``Raw_Data``. The ``Text_Genetate`` spelling is kept as-is because later
    cells read the column by that exact name.
    """
    input_file = '702_data_string.csv'

    df = pd.read_csv(input_file)

    print(f"Successfully read file with {len(df)} rows. Starting analysis...")

    analysis_results = []

    try:
        # Count rows positionally so the progress message does not depend on
        # the DataFrame's index labels.
        for row_number, (_, row) in enumerate(df.iterrows(), start = 1):
            patient_id = row['PatientID']
            omics_data = row['Input_Data_String']
            print(f"Processing row {row_number}: {patient_id} ...")

            # Fill the template's placeholder; otherwise the model receives
            # the literal text "{omics_data_string}". str.replace is used
            # instead of str.format so other braces in the template are safe.
            system_prompt = PROMPT_TEMPLATE.replace("{omics_data_string}", str(omics_data))

            try:
                # === Core API Call ===
                completion = client.chat.completions.create(
                    model = "qwen3-max",
                    messages = [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": f"Here is the omics data row for analysis:\n{omics_data}"}
                    ]
                )
                analysis_text = completion.choices[0].message.content
            except Exception as e:
                # Best effort: note the failure for this patient and move on.
                print(f"Error processing {patient_id}: {e}")
                analysis_text = f"Error: {e}"

            # Success and failure rows share the same keys so the output
            # columns are complete for every patient.
            analysis_results.append({
                "PatientID": patient_id,
                "Text_Genetate": analysis_text,
                "Raw_Data": omics_data
            })

            # Short pause between consecutive requests.
            time.sleep(0.5)
    finally:
        # Write whatever has been collected, even if the loop was interrupted
        # (e.g. Ctrl-C), so completed API calls are not lost.
        output_df = pd.DataFrame(analysis_results)
        output_filename = "Text_node.csv"
        output_df.to_csv(output_filename, index = False, encoding = 'utf-8-sig')


if __name__ == "__main__":
    main()

Successfully read file with 702 rows. Starting analysis...
Processing row 1: TCGA-EW-A2FS ...
Processing row 2: TCGA-OL-A6VR ...
Processing row 3: TCGA-E9-A226 ...
Processing row 4: TCGA-D8-A27H ...
Processing row 5: TCGA-D8-A3Z6 ...
Processing row 6: TCGA-B6-A1KN ...
Processing row 7: TCGA-BH-A2L8 ...
Processing row 8: TCGA-A8-A075 ...
Processing row 9: TCGA-JL-A3YX ...
Processing row 10: TCGA-D8-A3Z5 ...
Processing row 11: TCGA-OL-A66J ...
Processing row 12: TCGA-D8-A73U ...
Processing row 13: TCGA-AC-A3QQ ...
Processing row 14: TCGA-AN-A0XS ...
Processing row 15: TCGA-A2-A0T5 ...
Processing row 16: TCGA-D8-A1Y1 ...
Processing row 17: TCGA-OL-A66I ...
Processing row 18: TCGA-EW-A1OX ...
Processing row 19: TCGA-D8-A1XQ ...
Processing row 20: TCGA-OL-A6VO ...
Processing row 21: TCGA-LD-A74U ...
Processing row 22: TCGA-EW-A1IY ...
Processing row 23: TCGA-GM-A4E0 ...
Processing row 24: TCGA-E2-A573 ...
Processing row 25: TCGA-E9-A228 ...
Processing row 26: TCGA-AR-A1AI ...
Processing row

# Label Alignment

In [12]:
# Label cleaning
import pandas as pd


# Load the reference subtype labels.
df_labels = pd.read_csv('Patient_Labels.csv')
# Subtype values to exclude from the cleaned table.
# NOTE(review): read_csv parses "NA" as missing by default, so the 'NA' entry
# is probably already covered by the dropna below — confirm.
rows_to_drop = ['Normal', 'Normal-like', 'NA']
# First drop rows with a missing Subtype, then rows whose Subtype is excluded.
df_clean = df_labels.dropna(subset = ['Subtype'])
df_clean = df_clean[~df_clean['Subtype'].isin(rows_to_drop)]
df_clean.to_csv('Patient_Labels_Cleaned.csv', index = False)

In [17]:
# Extract
import pandas as pd
import re


# Table of LLM outputs; the 'Text_Genetate' and 'PatientID' columns are used below.
file_path = 'QWEN_700.csv' 
df = pd.read_csv(file_path)


def pick_only_cancer_bold(text):
    """Return the bold (``**...**``) spans of ``text`` that mention "cancer".

    Args:
        text: Text to scan. Non-string values (e.g. NaN for an empty cell)
            are treated as containing no match.

    Returns:
        The bold spans containing "cancer" (case-insensitive), joined with
        ``" | "`` in order of appearance, or ``None`` if there are none.
    """
    # Guard against NaN/None cells, which re.findall would reject.
    if not isinstance(text, str):
        return None

    bold_spans = re.findall(r"\*\*(.*?)\*\*", text)
    cancer_spans = [span for span in bold_spans if "cancer" in span.lower()]
    return " | ".join(cancer_spans) if cancer_spans else None


# Pull the bold "...cancer..." phrase(s) out of each generated text.
df['LLM_subtype'] = df['Text_Genetate'].apply(pick_only_cancer_bold)
# Rows without such a phrase are removed, so they do not appear in the output file.
df_final = df.dropna(subset=['LLM_subtype'])
output_df = df_final[['PatientID', 'LLM_subtype']]
output_df.to_csv('LLM_subtype.csv', index = False, encoding = 'utf-8-sig')


In [18]:
# Mapping
import pandas as pd
import re

# Extracted bold phrases per patient (columns 'PatientID' and 'LLM_subtype').
file_path = 'LLM_subtype.csv'
df = pd.read_csv(file_path)

def extract_prefix_before_cancer(text):
    """Return the part of ``text`` that precedes "[breast] cancer".

    Args:
        text: Phrase to parse; anything that is not a string yields ``None``.

    Returns:
        The stripped text found before the first "cancer" (case-insensitive,
        with an optional "breast" just before it skipped), after removing one
        trailing filler word ("subtype", "type", "class", "is" or "shows").
        ``None`` when ``text`` is not a string or contains no "cancer".
    """
    if not isinstance(text, str):
        return None

    # Pattern breakdown:
    #   (.*?)        group 1, lazy: the wanted text in front of the anchor
    #   \s*          optional whitespace
    #   (?:breast)?  optional, non-captured "breast"
    #   \s*          optional whitespace
    #   cancer       the anchor word
    anchor_re = re.compile(r"(.*?)\s*(?:breast)?\s*cancer", re.IGNORECASE)
    found = anchor_re.search(text)
    if found is None:
        return None

    prefix = found.group(1).strip()
    # Drop a single trailing filler word such as "subtype" or "is".
    filler_re = re.compile(r"\s+(subtype|type|class|is|shows)$", flags=re.IGNORECASE)
    return filler_re.sub("", prefix).strip()

# Reduce each extracted phrase to the text preceding "[breast] cancer".
df['Clean_Subtype_Name'] = df['LLM_subtype'].apply(extract_prefix_before_cancer)

def map_to_label(text):
    """Map a free-text subtype name to one of the short class labels.

    Args:
        text: Subtype description. Non-string values (``None``, NaN) map to
            ``"Unknown"``.

    Returns:
        ``"LumA"``, ``"LumB"``, ``"Basal"`` or ``"Her2"`` for the first
        keyword found in ``text`` (case-insensitive), otherwise ``"Unknown"``.
    """
    # Covers None as well as NaN floats, which have no .lower().
    if not isinstance(text, str):
        return "Unknown"

    lowered = text.lower()
    # Checked in this order; the first keyword present decides the label.
    keyword_labels = (
        ("luminal a", "LumA"),
        ("luminal b", "LumB"),
        ("basal", "Basal"),
        ("her2", "Her2"),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return "Unknown"

# Translate the cleaned subtype text into a short label ("Unknown" if no keyword matches).
df['Mapped_Label'] = df['Clean_Subtype_Name'].apply(map_to_label)
df.to_csv('Extracted_Prefix_Result.csv', index = False, encoding = 'utf-8-sig')


In [21]:
# ACC
import pandas as pd


# Predicted labels and cleaned reference labels.
df_predict = pd.read_csv('Extracted_Prefix_Result.csv')
df_true = pd.read_csv('Patient_Labels_Cleaned.csv')

# pd.merge defaults to an inner join: only patients present in both files are scored.
# NOTE(review): patients dropped earlier (no extracted subtype phrase) are therefore
# not in the denominator — confirm this is the intended definition of accuracy.
df = pd.merge(df_predict, df_true, on = 'PatientID')
# Fraction of merged rows whose mapped label equals the reference Subtype
# (rows mapped to "Unknown" count as misses unless Subtype is also "Unknown").
# Presumably Subtype uses the same spellings as the mapped labels
# (LumA/LumB/Basal/Her2) — TODO confirm.
accuracy = (df['Mapped_Label'] == df['Subtype']).mean()

print(f"ACC: {accuracy:.2%}")

ACC: 67.15%
