In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import joblib

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit


In [2]:
# Directory holding the input parquet files. Defined once so that running the
# notebook on another machine means editing a single line.
DATA_DIR = Path("C:\\Users\\DELL\\Desktop\\DeepLearning")

# Expression table: a `caseID` column plus one numeric column per gene.
expr = pd.read_parquet(DATA_DIR / "mRNA_coding.omics.parquet")
# Sample annotations; `cancer_type` from this table is used as the label later on.
labels = pd.read_parquet(DATA_DIR / "label.parquet")

expr.shape, labels.shape

((9349, 19888), (10374, 9))

In [3]:
# Preview the first rows of the expression table (sample ID plus per-gene values).
expr.head()

Unnamed: 0,caseID,ENSG00000211895,ENSG00000211896,ENSG00000211897,ENSG00000211898,ENSG00000211899,ENSG00000211677,ENSG00000211679,ENSG00000211685,ENSG00000211890,...,ENSG00000231202,ENSG00000225992,ENSG00000249978,ENSG00000226212,ENSG00000228668,ENSG00000251578,ENSG00000240578,ENSG00000239992,ENSG00000249912,ENSG00000241911
0,TCGA-HQ-A5ND-01A-11R-A26T-07,0.2622,2.8437,0.1289,0.0,0.1993,0.6558,0.3027,0.0,0.0815,...,0.0,0.0,0.066,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-G2-A3IB-01A-11R-A20F-07,0.3238,0.4446,0.112,0.0,0.082,0.443,0.1107,0.0,0.1312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-ZF-AA5N-01A-11R-A42T-07,396.15799,88.736198,24.3566,0.2993,10.4734,72.044296,33.686199,0.2576,161.064102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-FD-A6TF-01A-52R-A32O-07,37.804501,894.137695,47.8643,2.5348,54.887402,295.739502,287.434387,1.0701,2.1523,...,0.0,0.0,0.0581,0.0686,0.0,0.059,0.0,0.0,0.0,0.0
4,TCGA-GC-A3WC-11A-11R-A22U-07,737.350281,132.165497,35.4646,15.9797,84.176102,212.235199,97.084602,0.1685,122.155098,...,0.0,0.0,0.1403,0.0828,0.0,0.1423,0.0,0.0,0.0,0.0


In [4]:
# Summarise row count, column count, dtypes and memory footprint of the expression table.
expr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9349 entries, 0 to 9348
Columns: 19888 entries, caseID to ENSG00000241911
dtypes: float32(19887), object(1)
memory usage: 709.3+ MB


In [5]:
# Number of missing values in each column of the expression table.
expr.isna().sum()

caseID             0
ENSG00000211895    0
ENSG00000211896    0
ENSG00000211897    0
ENSG00000211898    0
                  ..
ENSG00000251578    0
ENSG00000240578    0
ENSG00000239992    0
ENSG00000249912    0
ENSG00000241911    0
Length: 19888, dtype: int64

In [6]:
# Show every expression row that has at least one missing value.
expr.loc[expr.isna().any(axis="columns")]


Unnamed: 0,caseID,ENSG00000211895,ENSG00000211896,ENSG00000211897,ENSG00000211898,ENSG00000211899,ENSG00000211677,ENSG00000211679,ENSG00000211685,ENSG00000211890,...,ENSG00000231202,ENSG00000225992,ENSG00000249978,ENSG00000226212,ENSG00000228668,ENSG00000251578,ENSG00000240578,ENSG00000239992,ENSG00000249912,ENSG00000241911


In [7]:
# Count how often each caseID occurs; any count above 1 would mean a duplicated sample.
expr['caseID'].value_counts()

caseID
TCGA-HQ-A5ND-01A-11R-A26T-07    1
TCGA-09-1669-01A-01R-1566-13    1
TCGA-24-1416-01A-01R-1565-13    1
TCGA-30-1892-01A-01R-1568-13    1
TCGA-13-0913-01A-01R-1564-13    1
                               ..
TCGA-G6-A8L8-01A-21R-A37O-07    1
TCGA-B8-4154-01A-01R-1188-07    1
TCGA-B0-5698-01A-11R-1672-07    1
TCGA-B0-4828-01A-01R-1277-07    1
TCGA-FL-A1YN-11A-11R-A32Y-07    1
Name: count, Length: 9349, dtype: int64

In [8]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
expr = expr.drop_duplicates(keep="first")

In [9]:
# Re-inspect the table after de-duplication to see whether the row count changed.
expr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9349 entries, 0 to 9348
Columns: 19888 entries, caseID to ENSG00000241911
dtypes: float32(19887), object(1)
memory usage: 709.3+ MB


In [10]:
# Preview the first rows of the label/annotation table.
labels.head()

Unnamed: 0,sampleID,patient,cases,subtype,survival,vital_status,sample_type,cancer_type,surv_class
0,TCGA-BL-A0C8-01A,TCGA-BL-A0C8,mRNA|TCGA-BL-A0C8-01A-11R-A277-07|mRNA_coding|...,BLCA.1,1219.0,0.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS)
1,TCGA-BT-A20N-01A,TCGA-BT-A20N,mRNA|TCGA-BT-A20N-01A-11R-A14Y-07|mRNA_coding|...,BLCA.1,795.0,1.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS)
2,TCGA-BT-A20P-01A,TCGA-BT-A20P,mRNA|TCGA-BT-A20P-01A-11R-A14Y-07|mRNA_coding|...,BLCA.1,544.0,1.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS)
3,TCGA-BT-A3PH-01A,TCGA-BT-A3PH,mRNA|TCGA-BT-A3PH-01A-11R-A220-07|mRNA_coding|...,BLCA.1,142.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS)
4,TCGA-BT-A42C-01A,TCGA-BT-A42C,mRNA|TCGA-BT-A42C-01A-11R-A23N-07|mRNA_coding|...,BLCA.1,873.0,0.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS)


In [11]:
# Show every label row that has at least one missing field.
labels.loc[labels.isna().any(axis="columns")]

Unnamed: 0,sampleID,patient,cases,subtype,survival,vital_status,sample_type,cancer_type,surv_class
5,TCGA-CF-A27C-01A,TCGA-CF-A27C,mRNA|TCGA-CF-A27C-01A-11R-A16R-07|mRNA_coding|...,BLCA.1,425.0,0.0,Tumour,TCGA-BLCA,
6,TCGA-CF-A3MF-01A,TCGA-CF-A3MF,mRNA|TCGA-CF-A3MF-01A-12R-A21D-07|mRNA_coding|...,BLCA.1,383.0,0.0,Tumour,TCGA-BLCA,
7,TCGA-CF-A3MG-01A,TCGA-CF-A3MG,mRNA|TCGA-CF-A3MG-01A-11R-A20F-07|mRNA_coding|...,BLCA.1,369.0,0.0,Tumour,TCGA-BLCA,
8,TCGA-CF-A3MH-01A,TCGA-CF-A3MH,mRNA|TCGA-CF-A3MH-01A-11R-A20F-07|mRNA_coding|...,BLCA.1,398.0,0.0,Tumour,TCGA-BLCA,
9,TCGA-CF-A3MI-01A,TCGA-CF-A3MI,mRNA|TCGA-CF-A3MI-01A-11R-A20F-07|mRNA_coding|...,BLCA.1,370.0,0.0,Tumour,TCGA-BLCA,
...,...,...,...,...,...,...,...,...,...
10368,TCGA-FL-A1YQ-11A,TCGA-FL-A1YQ,mRNA|TCGA-FL-A1YQ-11A-11R-A32Y-07|mRNA_coding|...,,,,Normal,Normal,Short Term Survivor (STS)
10369,TCGA-FL-A1YT-11A,TCGA-FL-A1YT,mRNA|TCGA-FL-A1YT-11A-12R-A32Y-07|mRNA_coding|...,,,,Normal,Normal,Short Term Survivor (STS)
10370,TCGA-FL-A1YU-11A,TCGA-FL-A1YU,mRNA|TCGA-FL-A1YU-11A-11R-A32Y-07|mRNA_coding|...,,,,Normal,Normal,Short Term Survivor (STS)
10371,TCGA-FL-A1YV-11A,TCGA-FL-A1YV,mRNA|TCGA-FL-A1YV-11A-12R-A32Y-07|mRNA_coding|...,,,,Normal,Normal,Short Term Survivor (STS)


### Joining the tables by sample ID (`caseID` in `expr`, matched against the ID parsed from `cases` in `labels`)

In [12]:
# `cases` is a pipe-delimited string; its second field is the sample ID that is
# later matched against `caseID` in the expression table.
case_fields = labels["cases"].str.split("|")
labels["cases_clean"] = case_fields.str.get(1)

In [13]:
# Check that the new `cases_clean` column holds the extracted sample ID.
labels.head()

Unnamed: 0,sampleID,patient,cases,subtype,survival,vital_status,sample_type,cancer_type,surv_class,cases_clean
0,TCGA-BL-A0C8-01A,TCGA-BL-A0C8,mRNA|TCGA-BL-A0C8-01A-11R-A277-07|mRNA_coding|...,BLCA.1,1219.0,0.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS),TCGA-BL-A0C8-01A-11R-A277-07
1,TCGA-BT-A20N-01A,TCGA-BT-A20N,mRNA|TCGA-BT-A20N-01A-11R-A14Y-07|mRNA_coding|...,BLCA.1,795.0,1.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS),TCGA-BT-A20N-01A-11R-A14Y-07
2,TCGA-BT-A20P-01A,TCGA-BT-A20P,mRNA|TCGA-BT-A20P-01A-11R-A14Y-07|mRNA_coding|...,BLCA.1,544.0,1.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS),TCGA-BT-A20P-01A-11R-A14Y-07
3,TCGA-BT-A3PH-01A,TCGA-BT-A3PH,mRNA|TCGA-BT-A3PH-01A-11R-A220-07|mRNA_coding|...,BLCA.1,142.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS),TCGA-BT-A3PH-01A-11R-A220-07
4,TCGA-BT-A42C-01A,TCGA-BT-A42C,mRNA|TCGA-BT-A42C-01A-11R-A23N-07|mRNA_coding|...,BLCA.1,873.0,0.0,Tumour,TCGA-BLCA,Long Term Survivor (LTS),TCGA-BT-A42C-01A-11R-A23N-07


In [14]:
# Summarise non-null counts and dtypes of the label table.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10374 entries, 0 to 10373
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sampleID      10374 non-null  object 
 1   patient       10374 non-null  object 
 2   cases         10374 non-null  object 
 3   subtype       10374 non-null  object 
 4   survival      8885 non-null   float32
 5   vital_status  8935 non-null   float32
 6   sample_type   10374 non-null  object 
 7   cancer_type   10374 non-null  object 
 8   surv_class    7337 non-null   object 
 9   cases_clean   10374 non-null  object 
dtypes: float32(2), object(8)
memory usage: 729.6+ KB


In [15]:
# Remove label rows that are exact duplicates across all columns, keeping the first occurrence.
labels = labels.drop_duplicates(keep="first")

In [16]:
# Re-inspect the label table after de-duplication to see whether the row count changed.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10374 entries, 0 to 10373
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sampleID      10374 non-null  object 
 1   patient       10374 non-null  object 
 2   cases         10374 non-null  object 
 3   subtype       10374 non-null  object 
 4   survival      8885 non-null   float32
 5   vital_status  8935 non-null   float32
 6   sample_type   10374 non-null  object 
 7   cancer_type   10374 non-null  object 
 8   surv_class    7337 non-null   object 
 9   cases_clean   10374 non-null  object 
dtypes: float32(2), object(8)
memory usage: 729.6+ KB


In [17]:
# Normalise the join keys on both sides so stray whitespace cannot break the match.
expr['caseID'] = expr['caseID'].str.strip()

# `cases_clean` is the column the merge joins on, so it is the one that must be
# stripped (previously only a copy written to `cases` was stripped).
labels['cases_clean'] = labels['cases_clean'].str.strip()

# Kept for backward compatibility: the overlap check below reads the cleaned ID from `cases`.
labels['cases'] = labels['cases_clean']

In [18]:
# Sample IDs present in both tables; the size of the overlap shows how many
# expression rows can be matched to a label row.
expr_ids = set(expr['caseID'])
label_ids = set(labels['cases'])
common_ids = expr_ids & label_ids
len(common_ids)

9349

In [19]:
# Left join: keep every expression row and attach its annotation columns.
df = expr.merge(labels, left_on='caseID', right_on='cases_clean', how='left')

# A duplicated key in `labels` would silently multiply expression rows in a left
# join; fail loudly instead of carrying duplicated samples into the splits.
assert len(df) == len(expr), (
    f"merge changed the row count ({len(expr)} -> {len(df)}); "
    "check `labels['cases_clean']` for duplicate IDs"
)

In [20]:
# Preview the merged table: gene columns followed by the annotation columns.
df.head()

Unnamed: 0,caseID,ENSG00000211895,ENSG00000211896,ENSG00000211897,ENSG00000211898,ENSG00000211899,ENSG00000211677,ENSG00000211679,ENSG00000211685,ENSG00000211890,...,sampleID,patient,cases,subtype,survival,vital_status,sample_type,cancer_type,surv_class,cases_clean
0,TCGA-HQ-A5ND-01A-11R-A26T-07,0.2622,2.8437,0.1289,0.0,0.1993,0.6558,0.3027,0.0,0.0815,...,TCGA-HQ-A5ND-01A,TCGA-HQ-A5ND,TCGA-HQ-A5ND-01A-11R-A26T-07,,274.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS),TCGA-HQ-A5ND-01A-11R-A26T-07
1,TCGA-G2-A3IB-01A-11R-A20F-07,0.3238,0.4446,0.112,0.0,0.082,0.443,0.1107,0.0,0.1312,...,TCGA-G2-A3IB-01A,TCGA-G2-A3IB,TCGA-G2-A3IB-01A-11R-A20F-07,BLCA.3,220.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS),TCGA-G2-A3IB-01A-11R-A20F-07
2,TCGA-ZF-AA5N-01A-11R-A42T-07,396.15799,88.736198,24.3566,0.2993,10.4734,72.044296,33.686199,0.2576,161.064102,...,TCGA-ZF-AA5N-01A,TCGA-ZF-AA5N,TCGA-ZF-AA5N-01A-11R-A42T-07,,168.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS),TCGA-ZF-AA5N-01A-11R-A42T-07
3,TCGA-FD-A6TF-01A-52R-A32O-07,37.804501,894.137695,47.8643,2.5348,54.887402,295.739502,287.434387,1.0701,2.1523,...,TCGA-FD-A6TF-01A,TCGA-FD-A6TF,TCGA-FD-A6TF-01A-52R-A32O-07,,69.0,1.0,Tumour,TCGA-BLCA,Short Term Survivor (STS),TCGA-FD-A6TF-01A-52R-A32O-07
4,TCGA-GC-A3WC-11A-11R-A22U-07,737.350281,132.165497,35.4646,15.9797,84.176102,212.235199,97.084602,0.1685,122.155098,...,TCGA-GC-A3WC-11A,TCGA-GC-A3WC,TCGA-GC-A3WC-11A-11R-A22U-07,,,,Normal,Normal,Short Term Survivor (STS),TCGA-GC-A3WC-11A-11R-A22U-07


In [21]:
# The row count should equal that of `expr` if the left join matched one label row per sample.
df.shape

(9349, 19898)

### Normalization of expression (needs to be logarithmized and standardized)

In [22]:
# Seed passed to both train_test_split calls so the splits are reproducible.
RANDOM_STATE = 42

# Output directory for the data splits; created up front if it does not exist.
# NOTE(review): the save step further down writes `<prefix>.parquet` relative to
# the working directory and does not use OUT — confirm which location is intended.
OUT = Path("../data/splits")
OUT.mkdir(parents=True, exist_ok=True)

In [23]:
# List the merged column names to see which ones are genes and which are annotations.
df.columns

Index(['caseID', 'ENSG00000211895', 'ENSG00000211896', 'ENSG00000211897',
       'ENSG00000211898', 'ENSG00000211899', 'ENSG00000211677',
       'ENSG00000211679', 'ENSG00000211685', 'ENSG00000211890',
       ...
       'sampleID', 'patient', 'cases', 'subtype', 'survival', 'vital_status',
       'sample_type', 'cancer_type', 'surv_class', 'cases_clean'],
      dtype='object', length=19898)

In [24]:
# Use the sample ID as the index. The guard keeps this cell re-runnable: once
# `caseID` has become the index it is no longer a column, and calling
# set_index("caseID") a second time would raise KeyError.
if df.index.name != "caseID":
    df = df.set_index("caseID")

# Labels: one cancer type per sample
y_full = df["cancer_type"].copy()

# Features: only the gene columns, i.e. columns whose name starts with "ENSG"
gene_cols = [c for c in df.columns if isinstance(c, str) and c.startswith("ENSG")]
X_full = df[gene_cols].copy()

print("X_full:", X_full.shape)
print("y_full:", y_full.shape)

X_full: (9349, 19887)
y_full: (9349,)


In [25]:
# Force every gene column to a numeric dtype, then neutralise non-finite values.
X_full = X_full.apply(pd.to_numeric)
X_full = X_full.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Detect missing sample IDs on the RAW index. After astype(str) a missing label
# can already have become the text "nan", which an isna() check no longer sees.
raw_index = X_full.index
has_id = ~raw_index.isna()

clean_index = raw_index.astype(str).str.strip()
X_full.index = clean_index
y_full.index = y_full.index.astype(str).str.strip()

# Remove rows whose sample ID is missing or empty after stripping.
mask_idx = has_id & (clean_index != "")
X_full = X_full[mask_idx]
y_full = y_full.loc[X_full.index]

print("after clean:", X_full.shape, y_full.shape)

after clean: (9349, 19887) (9349,)


In [26]:
# Hold out a stratified test set of N_TEST samples; the remainder goes on to the next split.
N_TEST = 1000
X_temp, X_test, y_temp, y_test = train_test_split(
    X_full,
    y_full,
    stratify=y_full,
    test_size=N_TEST,
    random_state=RANDOM_STATE,
)

In [27]:
# Carve a stratified fine-tuning set of N_FINE samples out of the remainder;
# whatever is left becomes the pre-training set.
N_FINE = 1000
X_pre, X_fine, y_pre, y_fine = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=N_FINE,
    random_state=RANDOM_STATE,
)

In [28]:
# Report the feature and label shapes of each split.
for split_name, split_X, split_y in (
    ("pre-train:", X_pre, y_pre),
    ("fine-tune:", X_fine, y_fine),
    ("test:", X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

pre-train: (7349, 19887) (7349,)
fine-tune: (1000, 19887) (1000,)
test: (1000, 19887) (1000,)


In [29]:
# The section heading states that expression must be logarithmized AND
# standardized; previously only the standardization step was applied.
def _log1p_expression(frame):
    """Return log(1 + x) of an expression table as a NumPy array.

    Raises ValueError if the table contains negative values, which the
    log transform is not meant to handle.
    """
    values = frame.to_numpy()
    if values.size and values.min() < 0:
        raise ValueError("expression matrix contains negative values; log1p is not applicable")
    return np.log1p(values)


# Fit scaler ONLY on pre-train to avoid leakage
scaler = StandardScaler(with_mean=True, with_std=True)

X_pre_scaled  = scaler.fit_transform(_log1p_expression(X_pre))
X_fine_scaled = scaler.transform(_log1p_expression(X_fine))
X_test_scaled = scaler.transform(_log1p_expression(X_test))

In [30]:
# Sanity check: the scaled arrays should have the same shapes as the splits they came from.
print("Scaled shapes:", X_pre_scaled.shape, X_fine_scaled.shape, X_test_scaled.shape)

Scaled shapes: (7349, 19887) (1000, 19887) (1000, 19887)


### Save the results

In [None]:
label_name = "cancer_type"


def save_scaled_split(X_scaled, X_index, y_series, prefix, columns=None, out_dir=None):
    """
    Write one scaled split together with its labels to ``<prefix>.parquet``.

    Parameters
    ----------
    X_scaled : numpy array (scaled matrix), one row per entry of ``X_index``
    X_index  : index from X_pre / X_fine / X_test
    y_series : labels Series; must contain a label for every entry of ``X_index``
    prefix   : file name without the ``.parquet`` extension
    columns  : column names for ``X_scaled``; defaults to ``X_pre.columns``
    out_dir  : directory to write into; defaults to the current working directory

    Raises
    ------
    KeyError   : if an entry of ``X_index`` has no label in ``y_series``
    ValueError : if the label lookup does not return exactly one label per row
    """
    if columns is None:
        # Backward-compatible default: all splits share the pre-train gene columns.
        columns = X_pre.columns

    # Rebuild a labelled table from the bare array.
    frame = pd.DataFrame(X_scaled, index=X_index, columns=columns)

    # Look the labels up in row order; a duplicated ID would return extra rows.
    split_labels = y_series.loc[X_index]
    if len(split_labels) != len(frame):
        raise ValueError(
            f"expected {len(frame)} labels for split '{prefix}', got {len(split_labels)}"
        )

    # Assign positionally: the lookup above already put the labels in row order.
    # Note that astype(str) turns a missing label into the text "nan".
    frame[label_name] = split_labels.astype(str).to_numpy()

    # Without out_dir the file lands in the working directory, as before.
    target = f"{prefix}.parquet" if out_dir is None else Path(out_dir) / f"{prefix}.parquet"
    frame.to_parquet(target)
    print(f"Saved: {target}  shape={frame.shape}")


# Persist the three normalized sets, in the order pre-train, fine-tune, test.
# Each entry is (scaled matrix, source frame providing the index, labels, file prefix).
for scaled_matrix, source_frame, split_labels, file_prefix in (
    (X_pre_scaled, X_pre, y_pre, "pretrain"),
    (X_fine_scaled, X_fine, y_fine, "finetune"),
    (X_test_scaled, X_test, y_test, "test"),
):
    save_scaled_split(scaled_matrix, source_frame.index, split_labels, file_prefix)

Saved: pretrain.parquet  shape=(7349, 19888)


In [None]:
import os

# Walk the working directory tree and print the path of every parquet file found.
for folder, _subdirs, filenames in os.walk(".", topdown=True):
    parquet_names = [name for name in filenames if name.endswith(".parquet")]
    for name in parquet_names:
        print(os.path.join(folder, name))