### Imports

In [276]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

### Constants

In [277]:
# Project root. Defaults to the original author's checkout, but can be
# overridden through the TCGA_MLDL_BASEDIR environment variable so the
# notebook can run on another machine without editing this cell.
basedir = Path(os.environ.get("TCGA_MLDL_BASEDIR", "/Users/tusharsingh/Work/Project/tcga-mldl"))
# Input file names; the loading cell reads them from basedir/"data".
expression_file = "data_mrna_seq_tpm.txt"
clinical_file = "data_clinical_patient.txt"

### Load Data

In [278]:
# Both raw inputs are tab-separated text files in the project's data/ folder.
data_dir = basedir / "data"
# Expression matrix: the file's first column becomes the row index.
expr = pd.read_csv(data_dir / expression_file, sep="\t", index_col=0)
# Clinical table: '#'-prefixed lines are skipped and a default integer index is used.
# NOTE(review): comment="#" also cuts off any data row at the first '#'
# character - confirm that no clinical field contains one.
clinical = pd.read_csv(
    data_dir / clinical_file,
    sep="\t",
    index_col=None,
    comment="#",
)

In [279]:
# Swap rows and columns of the expression matrix.
# NOTE(review): `expr` is rebound to its own transpose, so running this cell
# a second time flips the matrix back.
expr = expr.transpose()
# Report the resulting dimensions.
print("(patients × genes):", expr.shape)

(patients × genes): (518, 40796)


In [280]:
# Collapse sample barcodes to patient barcodes by keeping only the first three
# dash-separated fields, e.g. TCGA-05-4384-01A -> TCGA-05-4384.
patient_ids = expr.index.str.extract(r'^([^-]+-[^-]+-[^-]+)', expand=False)
# With expand=False the result is already an Index, so no stray index name `0`
# is left behind; name it after the key the merge cell joins on.
expr.index = patient_ids.rename('patient_id')
# Barcodes that do not match the pattern become NaN, and several samples of one
# patient collapse onto the same id. Either case would silently distort the
# later merge, so report them instead of staying quiet.
n_unparsed = int(expr.index.isna().sum())
n_duplicated = int(expr.index.duplicated().sum())
if n_unparsed or n_duplicated:
    print(
        f"WARNING: {n_unparsed} unparseable sample id(s) and "
        f"{n_duplicated} duplicated patient id(s) in expr"
    )
expr.head()

Entrez_Gene_Id,1,10,100,1000,10000,100008586,100009613,100009667,100009668,100009669,...,9988,9989,999,9990,9991,9992,9993,9994,9995,9997
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4244,0.0993,0.4415,10.1587,0.119,15.2678,0.0,0.0,0.0,0.1719,0.0,...,20.7754,55.4124,237.5399,10.0958,87.108,0.0,64.1097,9.3171,0.0,1.7253
TCGA-05-4249,0.1841,0.4847,5.0391,0.1847,3.3073,0.0,0.0564,0.0,0.0572,0.0,...,12.4129,63.2694,559.8383,8.4541,82.6535,0.0534,143.4828,8.159,0.0,1.271
TCGA-05-4250,0.0995,0.2212,16.8949,1.6356,12.243,0.0,0.3963,0.0,0.2296,0.0,...,14.1043,46.0081,374.7818,10.2064,91.2503,0.0,70.8899,5.6926,0.0,0.5762
TCGA-05-4382,0.1252,1.607,14.458,1.254,6.5365,0.0,0.0935,0.0476,0.3477,0.0,...,12.6604,57.0167,417.5074,8.8851,42.5564,0.0295,119.025,4.6893,0.3068,1.382
TCGA-05-4384,0.192,0.3414,4.8805,0.2939,5.7195,0.0,0.2185,0.0,0.3324,0.0,...,21.914,79.2318,210.6712,16.0004,86.5846,0.0,144.8578,8.4676,0.0,1.1118


In [281]:
# List every column of the clinical table, then show its first rows.
print("Clinical columns:", list(clinical.columns))
clinical.head()

Clinical columns: ['PATIENT_ID', 'OTHER_PATIENT_ID', 'PRIMARY_SITE_PATIENT', 'DISEASE_TYPE', 'PROJECT_NAME', 'PROJECT_ID', 'SEX', 'RACE', 'ETHNICITY', 'VITAL_STATUS', 'YEAR_OF_DEATH', 'SMOKER_YEARS', 'SMOKING_PACK_YEARS', 'PRIMARY_DIAGNOSIS', 'YEAR_OF_DIAGNOSIS', 'PATH_M_STAGE', 'BIOPSY_SITE', 'AJCC_STAGING_EDITION', 'ICD_10', 'AGE', 'PATH_STAGE', 'MORPHOLOGY', 'PATH_T_STAGE', 'PRIOR_TREATMENT', 'PATH_N_STAGE', 'PRIOR_MALIGNANCY', 'PROJECT_STATE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS', 'DFS_MONTHS']


Unnamed: 0,PATIENT_ID,OTHER_PATIENT_ID,PRIMARY_SITE_PATIENT,DISEASE_TYPE,PROJECT_NAME,PROJECT_ID,SEX,RACE,ETHNICITY,VITAL_STATUS,...,MORPHOLOGY,PATH_T_STAGE,PRIOR_TREATMENT,PATH_N_STAGE,PRIOR_MALIGNANCY,PROJECT_STATE,OS_STATUS,OS_MONTHS,DFS_STATUS,DFS_MONTHS
0,TCGA-05-4244,34040b83-7e8a-4264-a551-b16621843e28,Bronchus and lung,Adenomas and Adenocarcinomas,Lung Adenocarcinoma,TCGA-LUAD,Male,,,Alive,...,8140/3,T2,False,N2,False,released,0:LIVING,0.0,,
1,TCGA-05-4245,03d09c05-49ab-4ba6-a8d7-e7ccf71fafd2,Bronchus and lung,Adenomas and Adenocarcinomas,Lung Adenocarcinoma,TCGA-LUAD,Male,,,Alive,...,8140/3,T2,False,N2,True,released,0:LIVING,23.981603,1:Recurred/Progressed,10.97
2,TCGA-05-4249,4addf05f-3668-4b3f-a17f-c0227329ca52,Bronchus and lung,Adenomas and Adenocarcinomas,Lung Adenocarcinoma,TCGA-LUAD,Male,,,Alive,...,8140/3,T2,False,N0,False,released,0:LIVING,50.032852,,
3,TCGA-05-4250,f98ecd8a-b878-4f53-b911-20cd8e17281c,Bronchus and lung,Adenomas and Adenocarcinomas,Lung Adenocarcinoma,TCGA-LUAD,Female,,,Dead,...,8140/3,T3,False,N1,False,released,1:DECEASED,3.975033,1:Recurred/Progressed,10.980702
4,TCGA-05-4382,3434b91a-c05f-460f-a078-7b1bb6e7085d,Bronchus and lung,Adenomas and Adenocarcinomas,Lung Adenocarcinoma,TCGA-LUAD,Male,,,Alive,...,8255/3,T2,False,N0,True,released,0:LIVING,19.940867,,


In [282]:
# Keep only the patient identifier and the PATH_STAGE column.
clinical_df = clinical.loc[:, ["PATIENT_ID", "PATH_STAGE"]]
clinical_df.head()

Unnamed: 0,PATIENT_ID,PATH_STAGE
0,TCGA-05-4244,Stage IV
1,TCGA-05-4245,Stage IIIA
2,TCGA-05-4249,Stage IB
3,TCGA-05-4250,Stage IIIA
4,TCGA-05-4382,Stage IB


#### Merging with "patient_id"

In [283]:
# Give the clinical columns their working names and discard patients that
# have no stage recorded.
clinical_df = (
    clinical_df
    .rename(columns={'PATIENT_ID': 'patient_id', 'PATH_STAGE': 'tumor_stage'})
    .dropna(subset=['tumor_stage'])
)
# The expression index must carry the same name as the clinical key column
# so that the merge below can join on it.
expr.index.name = 'patient_id'
# Inner join: only patients present in both tables are kept; the key then
# becomes the index of the merged frame.
merged = (
    clinical_df
    .merge(expr, on='patient_id', how='inner')
    .set_index('patient_id')
)
print("Final merged shape:", merged.shape)
merged.head()

Final merged shape: (510, 40797)


Unnamed: 0_level_0,tumor_stage,1,10,100,1000,10000,100008586,100009613,100009667,100009668,...,9988,9989,999,9990,9991,9992,9993,9994,9995,9997
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4244,Stage IV,0.0993,0.4415,10.1587,0.119,15.2678,0.0,0.0,0.0,0.1719,...,20.7754,55.4124,237.5399,10.0958,87.108,0.0,64.1097,9.3171,0.0,1.7253
TCGA-05-4249,Stage IB,0.1841,0.4847,5.0391,0.1847,3.3073,0.0,0.0564,0.0,0.0572,...,12.4129,63.2694,559.8383,8.4541,82.6535,0.0534,143.4828,8.159,0.0,1.271
TCGA-05-4250,Stage IIIA,0.0995,0.2212,16.8949,1.6356,12.243,0.0,0.3963,0.0,0.2296,...,14.1043,46.0081,374.7818,10.2064,91.2503,0.0,70.8899,5.6926,0.0,0.5762
TCGA-05-4382,Stage IB,0.1252,1.607,14.458,1.254,6.5365,0.0,0.0935,0.0476,0.3477,...,12.6604,57.0167,417.5074,8.8851,42.5564,0.0295,119.025,4.6893,0.3068,1.382
TCGA-05-4384,Stage IIIA,0.192,0.3414,4.8805,0.2939,5.7195,0.0,0.2185,0.0,0.3324,...,21.914,79.2318,210.6712,16.0004,86.5846,0.0,144.8578,8.4676,0.0,1.1118


In [284]:
### Saving it for EDA
# Index the clinical table by patient. The guard keeps this cell safe to re-run:
# once PATIENT_ID has been moved into the index, a second set_index("PATIENT_ID")
# would raise KeyError.
# NOTE(review): `clinical` is still rebound here, so earlier cells that read the
# PATIENT_ID column cannot be re-run after this one without reloading the data.
if clinical.index.name != "PATIENT_ID":
    clinical = clinical.set_index("PATIENT_ID")
# Keep only the clinical rows whose patient is also present in `merged`.
df_clinical_matched = clinical.loc[merged.index.intersection(clinical.index)].copy()
df_clinical_matched.index.name = "patient_id"
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
results_dir = basedir/"results"/"data"
results_dir.mkdir(parents=True, exist_ok=True)
df_clinical_matched.to_csv(results_dir/"clinical.csv")

In [285]:
# Number of patients per raw stage label, most frequent first.
merged.value_counts(subset="tumor_stage")

tumor_stage
Stage IB      141
Stage IA      132
Stage IIIA     73
Stage IIB      71
Stage IIA      50
Stage IV       26
Stage IIIB     11
Stage I         5
Stage II        1
Name: count, dtype: int64

In [286]:
# Binary label: every Stage I variant maps to 0, every later stage to 1.
stage_map = dict.fromkeys(['STAGE I', 'STAGE IA', 'STAGE IB'], 0)
stage_map.update(dict.fromkeys(
    ['STAGE II', 'STAGE IIA', 'STAGE IIB', 'STAGE IIIA', 'STAGE IIIB', 'STAGE IV'],
    1,
))
# Normalise the labels (trim whitespace, upper-case) so they match the map keys.
stage_labels = merged['tumor_stage'].str.strip().str.upper()
merged['tumor_stage'] = stage_labels
merged['severity'] = stage_labels.map(stage_map)
# Labels missing from stage_map map to NaN and are removed here; the surviving
# values are then stored as integers.
merged = merged.dropna(subset=['severity']).astype({'severity': int})

In [287]:
# Class balance of the binary severity label.
merged.value_counts(subset="severity")

severity
0    278
1    232
Name: count, dtype: int64

In [288]:
# Features: every column except the two label columns.
X = merged.drop(columns=["tumor_stage", "severity"])
# Target: the severity label derived from the stage.
y = merged["severity"]

In [289]:
# Sanity check: target and feature matrix must have the same number of rows.
(y.shape, X.shape)

((510,), (510, 40796))

### Save Data

In [290]:
# to_csv does not create missing parent directories; create the output folder
# first so this cell also works when results/data does not exist yet.
output_dir = basedir/"results"/"data"
output_dir.mkdir(parents=True, exist_ok=True)
# Write the feature matrix and the target, each with its index as the first column.
X.to_csv(output_dir/"features.csv")
y.to_csv(output_dir/"target.csv")