Create metadata files (train/test lists of cancer and normal sample IDs) to help extract samples from the huge dataset

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated sample metadata table
df = pd.read_csv('denseDataOnlyDownload.tsv', sep='\t')

In [None]:
# Sample types relabelled as 'normal'
normal_types = ['Normal Tissue', 'Solid Tissue Normal']

# Sample types relabelled as 'cancer'
cancer_types = [
    'Primary Solid Tumor',
    'Primary Tumor',
    'Recurrent Solid Tumor',
    'Recurrent Tumor',
    'Recurrent Blood Derived Cancer - Peripheral Blood',
    'Recurrent Blood Derived Cancer - Bone Marrow',
    'Primary Blood Derived Cancer - Peripheral Blood',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Metastatic',
]

# Every sample type that is kept, in the same order as the mapping keys
lst = normal_types + cancer_types

# Sample type -> condition label
mapping = {
    **dict.fromkeys(normal_types, 'normal'),
    **dict.fromkeys(cancer_types, 'cancer'),
}

In [None]:
# Read one stripped entry per line of the column-name file and drop the first
# entry (presumably a non-sample header column -- TODO confirm).
with open ('column_names.txt', 'r') as file:
    lst_name = [line.strip() for line in file][1:]

Keep only the metadata rows whose sample ID appears among the dataset's column names

In [None]:
# Boolean mask: True where the metadata row's sample ID is one of the listed names
sample_in_dataset = df['samples'].isin(lst_name)
df = df[sample_in_dataset]

Keep only the rows whose sample type is listed in `lst`

In [None]:
# Boolean mask: True where the row's sample type is one of the types in `lst`
sample_type_kept = df['_sample_type'].isin(lst)
df_filtered = df[sample_type_kept]

Map the sample type to either cancer or normal (relabel sample)

In [None]:
# Relabel each sample type as 'cancer' or 'normal' via `mapping`.
# `df_filtered` is the result of boolean indexing, so adding a column in place
# can raise pandas' SettingWithCopyWarning; `assign` returns a new frame with
# the extra column instead of writing into a possible copy.
df_filtered = df_filtered.assign(condition=df_filtered['_sample_type'].map(mapping))

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffle, the per-class draw and the split are reproducible
seed = 42

# Number of rows per class in the training and testing sets
train_size = 3000
test_size = 2000


def _draw_and_split(frame, label):
    """Draw train_size + test_size rows with the given condition and split them.

    Returns a tuple (sampled, train, test) of DataFrames.
    """
    subset = frame[frame['condition'] == label]
    sampled = subset.sample(train_size + test_size, random_state=seed)
    train, test = train_test_split(
        sampled, train_size=train_size, test_size=test_size, random_state=seed
    )
    return sampled, train, test


# Shuffle all rows once before drawing the per-class samples
df_shuffled = df_filtered.sample(frac=1, random_state=seed)

# 5000 "cancer" rows and 5000 "normal" rows, each split into 3000 train / 2000 test
df_cancer, df_cancer_train, df_cancer_test = _draw_and_split(df_shuffled, 'cancer')
df_normal, df_normal_train, df_normal_test = _draw_and_split(df_shuffled, 'normal')

In [None]:
# The exported metadata only needs the sample ID and its cancer/normal label
columns_to_keep = ['samples', 'condition']

# Reduce all four splits to those two columns in one pass
df_cancer_train, df_cancer_test, df_normal_train, df_normal_test = (
    split.loc[:, columns_to_keep]
    for split in (df_cancer_train, df_cancer_test, df_normal_train, df_normal_test)
)

In [None]:
# Output file for each split; written in this order, without the index column
metadata_outputs = {
    'cancer_train_metadata.csv': df_cancer_train,
    'cancer_test_metadata.csv': df_cancer_test,
    'normal_train_metadata.csv': df_normal_train,
    'normal_test_metadata.csv': df_normal_test,
}
for out_path, split_frame in metadata_outputs.items():
    split_frame.to_csv(out_path, index=False)