# file clinical

In [1]:
import pandas as pd
import json, ast
import numpy as np
from pandas import NA

def to_list(x):
    """Chuyển mọi kiểu về list các dict (nếu không hợp lệ thì trả list rỗng)."""
    # Trường hợp NaN, None, hoặc mảng rỗng
    if x is None:
        return []
    if isinstance(x, (float, int)) and pd.isna(x):
        return []
    if isinstance(x, np.ndarray):
        # Nếu là mảng numpy, lấy phần tử đầu tiên nếu có
        if len(x) == 0:
            return []
        x = x.tolist()
    if isinstance(x, list):
        return x
    if isinstance(x, dict):
        return [x]
    if isinstance(x, str):
        s = x.strip()
        if s == "":
            return []
        # Thử JSON
        try:
            obj = json.loads(s)
            if isinstance(obj, list):
                return obj
            if isinstance(obj, dict):
                return [obj]
        except Exception:
            pass
        # Thử literal_eval (chuỗi Python)
        try:
            obj = ast.literal_eval(s)
            if isinstance(obj, list):
                return obj
            if isinstance(obj, dict):
                return [obj]
        except Exception:
            return []
    return []

def extract_primary_diagnosis(ents):
    for d in ents:
        if isinstance(d, dict) and d.get("primary_diagnosis"):
            return d["primary_diagnosis"]
    return NA

import re

def clean_diagnosis(x):
    if not isinstance(x, str):
        return x

    # 1️⃣ Loại bỏ các phần mô tả thừa như ", NOS", ", not otherwise specified", ", unspecified", ", NEC", ...
    x = re.sub(
        r",?\s*(NOS|not otherwise specified|unspecified|NEC|n\.e\.c\.|n\.o\.s\.)\b\.?",
        "",
        x,
        flags=re.IGNORECASE
    )

    # 2️⃣ Xoá các ký tự thừa cuối (dấu chấm, phẩy, khoảng trắng)
    x = re.sub(r"[\s,\.]+$", "", x)

    # 3️⃣ Chuẩn hoá khoảng trắng giữa các từ
    x = re.sub(r"\s+", " ", x.strip())

    return x

In [2]:
df= pd.read_json('clinical.cart.2025-10-21.json')

In [3]:
df["disease_type"].value_counts()

disease_type
Ductal and Lobular Neoplasms             1052
Cystic, Mucinous and Serous Neoplasms      16
Complex Epithelial Neoplasms               14
Epithelial Neoplasms, NOS                   5
Adenomas and Adenocarcinomas                3
Squamous Cell Neoplasms                     2
Adnexal and Skin Appendage Neoplasms        1
Fibroepithelial Neoplasms                   1
Basal Cell Neoplasms                        1
Name: count, dtype: int64

In [4]:

val = "Ductal and Lobular Neoplasms"
mask = df["disease_type"].astype(str).str.strip().eq(val)
df_sub = df[mask].copy()

In [5]:
df_sub=df_sub[["submitter_id","diagnoses"]]
df_sub["primary_diagnosis"] = df_sub["diagnoses"].apply(to_list).apply(extract_primary_diagnosis)
print(df_sub["primary_diagnosis"].value_counts())

primary_diagnosis
Infiltrating duct carcinoma, NOS                            750
Lobular carcinoma, NOS                                      186
Not Reported                                                 33
Infiltrating duct and lobular carcinoma                      28
Infiltrating duct mixed with other types of carcinoma        18
Intraductal papillary adenocarcinoma with invasion            6
Infiltrating lobular mixed with other types of carcinoma      5
Medullary carcinoma, NOS                                      5
Intraductal carcinoma, noninfiltrating, NOS                   4
Invasive micropapillary carcinoma                             4
Lobular carcinoma in situ, NOS                                3
Paget disease and infiltrating duct carcinoma of breast       3
Clear cell carcinoma                                          2
Adenocarcinoma, NOS                                           2
Basal cell carcinoma, NOS                                     1
Myelodysplastic syndro

In [6]:
df_sub=df_sub[["submitter_id","primary_diagnosis"]]
pattern = r"(Infiltrating duct carcinoma|Lobular carcinoma)"

df_filtered = df_sub[df_sub["primary_diagnosis"].astype(str).str.contains(pattern, case=False, na=False)]
print(df_filtered["primary_diagnosis"].value_counts())

primary_diagnosis
Infiltrating duct carcinoma, NOS                           750
Lobular carcinoma, NOS                                     186
Infiltrating duct and lobular carcinoma                     28
Lobular carcinoma in situ, NOS                               3
Paget disease and infiltrating duct carcinoma of breast      3
Name: count, dtype: int64


  df_filtered = df_sub[df_sub["primary_diagnosis"].astype(str).str.contains(pattern, case=False, na=False)]


In [7]:
df_filtered["primary_diagnosis"] = df_filtered["primary_diagnosis"].apply(clean_diagnosis)
remove_list = [
    "Paget disease and infiltrating duct carcinoma of breast",
    "Lobular carcinoma in situ",
    "Infiltrating duct and lobular carcinoma"
]

df_filtered = df_filtered[~df_filtered["primary_diagnosis"].isin(remove_list)].copy()

# (Tuỳ chọn) Kiểm tra lại số lượng từng loại
print(df_filtered["primary_diagnosis"].value_counts())


primary_diagnosis
Infiltrating duct carcinoma    750
Lobular carcinoma              186
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["primary_diagnosis"] = df_filtered["primary_diagnosis"].apply(clean_diagnosis)


In [8]:
df_filtered

Unnamed: 0,submitter_id,primary_diagnosis
0,TCGA-E2-A1IU,Infiltrating duct carcinoma
3,TCGA-AN-A0AM,Infiltrating duct carcinoma
5,TCGA-A7-A26E,Infiltrating duct carcinoma
6,TCGA-A8-A07W,Infiltrating duct carcinoma
7,TCGA-D8-A1XY,Infiltrating duct carcinoma
...,...,...
1090,TCGA-A2-A0CP,Infiltrating duct carcinoma
1091,TCGA-PL-A8LX,Infiltrating duct carcinoma
1092,TCGA-A2-A3XZ,Infiltrating duct carcinoma
1093,TCGA-E9-A295,Lobular carcinoma


# file sample_sheet

In [9]:
sample_df= pd.read_csv('gdc_sample_sheet.2025-10-21.tsv',sep='\t')
sample_df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Tissue Type,Tumor Descriptor,Specimen Type,Preservation Method
0,3396edba-0d2a-4485-ad0a-5114b38c1abe,2e64abe2-6024-4d28-9e09-560ce2a9fd15.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-GM-A2DL,TCGA-GM-A2DL-01A,Tumor,Primary,Solid Tissue,Unknown
1,d2e2915c-125e-45c6-b7f3-6a815b3d2859,96a45a70-d4b5-4090-b603-73b7379679d6.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-AC-A2QI,TCGA-AC-A2QI-01A,Tumor,Primary,Solid Tissue,Unknown
2,fb32f0b1-4fb7-43d5-8091-dc13a1f6d9e8,1d468785-141d-40ca-acb1-d6d85a8c9d7b.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-A8-A06R,TCGA-A8-A06R-01A,Tumor,Primary,Unknown,Unknown
3,de5e449b-bc57-4836-89e4-73a3ba24abdf,748e4eaa-2b96-4dce-a903-c7df733d7f50.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-EW-A1PD,TCGA-EW-A1PD-01A,Tumor,Primary,Solid Tissue,OCT
4,a58e2ae3-a236-4209-8292-70465216cb85,e614fbb4-7574-4704-9525-c0aea4c10fc6.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-AO-A12D,TCGA-AO-A12D-01A,Tumor,Primary,Unknown,OCT
...,...,...,...,...,...,...,...,...,...,...,...
1226,fd94a0ed-37d8-49a5-af96-e2160a9e6096,f8cfabf6-dc25-4141-be6c-817dd439a74b.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-BH-A18P,TCGA-BH-A18P-11A,Normal,Not Applicable,Solid Tissue,OCT
1227,37af19f9-56f9-48ec-97a9-5b0b6e364293,a37efe02-9662-4b90-8251-003ccb96aed4.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-C8-A8HP,TCGA-C8-A8HP-01A,Tumor,Primary,Solid Tissue,OCT
1228,03a1a105-4856-4e5c-b537-508bdbda50b5,02bbe1b1-44ff-426b-a40e-47a01bc4cf7d.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-EW-A3U0,TCGA-EW-A3U0-01A,Tumor,Primary,Solid Tissue,OCT
1229,5b4ec1fd-516d-4c32-a233-0699e591df1a,4331397f-eaf9-4e43-8fa2-e30ad3643aca.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-BRCA,TCGA-D8-A1J9,TCGA-D8-A1J9-01A,Tumor,Primary,Unknown,Unknown


In [10]:
merged_df = pd.merge(df_filtered, sample_df, left_on='submitter_id', right_on='Case ID', how='inner')

In [11]:
merged_df=merged_df[["Case ID","File Name","Tissue Type","primary_diagnosis"]]
merged_df

Unnamed: 0,Case ID,File Name,Tissue Type,primary_diagnosis
0,TCGA-E2-A1IU,22c2b380-799e-4fad-ae38-46a916c592d5.rna_seq.a...,Tumor,Infiltrating duct carcinoma
1,TCGA-AN-A0AM,fd6f4c9b-ee43-4939-8cfa-2e447aedbcf3.rna_seq.a...,Tumor,Infiltrating duct carcinoma
2,TCGA-A7-A26E,a02fb212-a6cc-40b2-9d31-481fc1ce0911.rna_seq.a...,Tumor,Infiltrating duct carcinoma
3,TCGA-A7-A26E,de01516e-43f0-4f96-8ac6-ab543a314829.rna_seq.a...,Tumor,Infiltrating duct carcinoma
4,TCGA-A7-A26E,e6bad6ec-c178-4684-99e8-2504781a022b.rna_seq.a...,Tumor,Infiltrating duct carcinoma
...,...,...,...,...
1046,TCGA-A2-A0CP,d6450e34-ebf4-4749-a27e-8e3139960afb.rna_seq.a...,Tumor,Infiltrating duct carcinoma
1047,TCGA-PL-A8LX,94367ad2-de29-4beb-8dab-f79174894cd2.rna_seq.a...,Tumor,Infiltrating duct carcinoma
1048,TCGA-A2-A3XZ,081b4937-85e7-44b3-9124-a612f4f2e0ca.rna_seq.a...,Tumor,Infiltrating duct carcinoma
1049,TCGA-E9-A295,5e36c4a2-fc6b-418f-80ab-dbd95b26a39e.rna_seq.a...,Tumor,Lobular carcinoma


In [12]:
merged_df["Tissue Type"].value_counts()

Tissue Type
Tumor     955
Normal     96
Name: count, dtype: int64

In [13]:
merged_df = merged_df[merged_df["Tissue Type"]!="Normal"].copy()

In [14]:
merged_df["Tissue Type"].value_counts()

Tissue Type
Tumor    955
Name: count, dtype: int64

In [15]:
merged_df=merged_df[["Case ID","File Name","primary_diagnosis"]]

In [142]:
merged_df.to_csv('breast_cancer_annotation.csv',index=False)