In [10]:

import pandas as pd
import numpy as np


In [None]:
def _read_expression_table(path, label, prefix):
    """Read one tab-separated expression table, naming the samples if the file has no header.

    Args:
        path: Location of the tab-separated file; the first column becomes the index.
        label: Capitalised name used in the progress messages, e.g. ``'Cancer'``.
        prefix: Letter placed in front of generated sample names, e.g. ``'S'``.

    Returns:
        DataFrame indexed by the first column of the file, one column per sample.
    """
    print(f"Analyzing {label.lower()} data file structure...")

    # Only the first line is needed to decide whether a header row is present,
    # so avoid pulling the whole (large) file into memory.
    with open(path, 'r') as f:
        first_line = f.readline().strip()
    print(f"{label} file first line: {first_line[:20]}...")

    if first_line.startswith('ENSG'):
        # The first row already starts with a gene ID, i.e. there is no header row.
        table = pd.read_csv(path, sep='\t', index_col=0, header=None)
        # Derive the number of generated names from the data instead of assuming a fixed width.
        n_samples = table.shape[1]
        print(f"{label} file: No sample names detected, creating {prefix}01-{prefix}{n_samples:02d}")
        table.columns = [f'{prefix}{i:02d}' for i in range(1, n_samples + 1)]
    else:
        print(f"{label} file: Sample names detected in header")
        table = pd.read_csv(path, sep='\t', index_col=0)

    return table


def load_data_safely():
    """Load the cancer and normal expression tables from ``project-data-breast-cancer``.

    Each file is inspected for a header row. When the first line starts with
    ``'ENSG'`` the file is treated as header-less and sample names are generated
    (``S01..`` for cancer, ``N01..`` for normal), one per data column.

    Returns:
        Tuple ``(cancer_data, normal_data)`` of DataFrames indexed by the first
        column of each file, with one column per sample.

    Raises:
        Whatever ``open`` / ``pandas.read_csv`` raise for a missing, empty or
        malformed file.
    """
    cancer_data = _read_expression_table(
        'project-data-breast-cancer//pnas_tpm_96_nodup.txt', 'Cancer', 'S')
    normal_data = _read_expression_table(
        'project-data-breast-cancer//pnas_normal_tpm.txt', 'Normal', 'N')

    return cancer_data, normal_data

In [None]:
def process_gene_expression_data_safe():
    """Build the labelled training table (samples x genes) and write it to CSV.

    The cancer and normal tables returned by ``load_data_safely`` are
    transposed so that rows are samples, restricted to the genes present in
    both, given a ``label`` column (1 = cancer, 0 = normal) and stacked.
    The result is saved to ``processed_gene_expression_data_safe.csv`` with
    the sample names as the CSV index.

    Returns:
        The combined DataFrame, or ``None`` when loading the input files fails
        (the error is printed rather than raised).
    """
    print("Loading data files safely...")

    try:
        cancer_data, normal_data = load_data_safely()

        print(f"Cancer data loaded: {cancer_data.shape}")
        print(f"Normal data loaded: {normal_data.shape}")
        print(f"Cancer sample names: {list(cancer_data.columns[:5])}...")
        print(f"Normal sample names: {list(normal_data.columns[:5])}...")

    except Exception as e:
        # Best-effort: report the problem and hand back nothing.
        print(f"Error loading data: {e}")
        return

    print("Processing data...")

    # Rows become samples, columns become genes.
    cancer_by_sample = cancer_data.T
    normal_by_sample = normal_data.T

    common_genes = cancer_by_sample.columns.intersection(normal_by_sample.columns)
    print(f"Number of common genes: {len(common_genes)}")

    def _attach_label(by_sample, value):
        # Keep only the shared genes and append a constant 'label' column.
        shared = by_sample[common_genes]
        label_frame = pd.DataFrame({'label': [value] * len(shared)}, index=shared.index)
        return pd.concat([shared, label_frame], axis=1)

    combined_data = pd.concat(
        [_attach_label(cancer_by_sample, 1), _attach_label(normal_by_sample, 0)],
        axis=0,
    )

    print(f"Final dataset shape: {combined_data.shape}")
    print(f"Number of cancer samples: {(combined_data['label'] == 1).sum()}")
    print(f"Number of normal samples: {(combined_data['label'] == 0).sum()}")

    output_filename = 'processed_gene_expression_data_safe.csv'
    combined_data.to_csv(output_filename, index=True)
    print(f"Data saved to: {output_filename}")

    print("\nDataset summary:")
    print(f"Total samples: {len(combined_data)}")
    print(f"Total genes: {len(common_genes)}")
    print(f"Sample names (first 10): {list(combined_data.index[:10])}")
    print(f"Gene names (first 10): {list(common_genes[:10])}")

    print(f"\nLabel distribution:")
    print(combined_data['label'].value_counts())

    return combined_data

In [None]:

# Run the training-data pipeline. The result is None if the input files could not be loaded.
processed_data_full =  process_gene_expression_data_safe()

Loading data files safely...
Analyzing cancer data file structure...
Cancer file first line: ENSG00000000003	7.07...
Cancer file: No sample names detected, creating S01-S96
Analyzing normal data file structure...
Normal file first line: N1	N2	N3	N4	N5	N6	N7...
Normal file: Sample names detected in header
Cancer data loaded: (60675, 96)
Normal data loaded: (60675, 32)
Cancer sample names: ['S01', 'S02', 'S03', 'S04', 'S05']...
Normal sample names: ['N1', 'N2', 'N3', 'N4', 'N5']...
Processing data...
Number of common genes: 60675
Final dataset shape: (128, 60676)
Number of cancer samples: 96
Number of normal samples: 32
Data saved to: processed_gene_expression_data_safe.csv

Dataset summary:
Total samples: 128
Total genes: 60675
Sample names (first 10): ['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10']
Gene names (first 10): ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', 'ENSG00000000971', 'ENSG00000

In [None]:
# read processed data

In [14]:
# Reload the saved training table. No index_col is passed, so the sample names
# that were written as the CSV index come back as a regular 'Unnamed: 0' column.
processed_data = pd.read_csv('processed_gene_expression_data_safe.csv')
processed_data

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,S01,7.071605,13.279391,0.000000,6.212355,34.038592,9.231355,32.811263,19.136966,12.631348,...,0.0,4.430132,0.000000,0.0,6.856902,16.599238,22.357146,0.000000,19.542796,1
1,S02,60.610797,47.424080,0.000000,2.218590,35.828348,4.395669,22.498053,0.000000,5.413170,...,0.0,0.000000,48.996429,0.0,9.795095,0.000000,4.562460,0.000000,0.000000,1
2,S03,58.255903,60.455497,23.040206,12.120963,55.926653,8.005046,13.657227,14.935300,16.430065,...,0.0,0.000000,14.871406,0.0,13.378542,0.000000,12.463189,0.000000,0.000000,1
3,S04,29.917356,6.482332,8.646690,6.065120,10.494273,0.000000,23.064209,18.683413,9.865584,...,0.0,0.000000,33.486271,0.0,13.388781,0.000000,6.236364,0.000000,0.000000,1
4,S05,24.500322,53.675826,20.456404,16.142519,14.482668,0.000000,59.112604,8.840272,55.432674,...,0.0,0.000000,26.407358,0.0,19.797049,57.509813,3.688509,0.000000,45.138720,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,N28,20.452887,64.812467,19.211649,5.053420,13.601410,3.337430,19.928695,37.360570,13.699906,...,0.0,0.000000,0.000000,0.0,0.000000,36.006926,27.712531,47.910042,0.000000,0
124,N29,16.581454,52.544413,38.937911,6.828136,4.725798,10.822811,3.462104,6.730835,7.774700,...,0.0,0.000000,10.053061,0.0,9.043886,0.000000,2.808369,0.000000,0.000000,0
125,N30,9.340377,0.000000,35.094127,7.692614,15.972327,6.096519,3.900425,0.000000,16.266748,...,0.0,0.000000,11.325832,0.0,6.792593,32.887121,9.491771,0.000000,0.000000,0
126,N31,5.074037,100.046777,19.064423,3.343129,19.281682,16.559268,15.538266,8.238725,6.797459,...,0.0,0.000000,0.000000,0.0,3.689987,0.000000,0.000000,23.771445,0.000000,0


In [None]:
# process validation data

In [None]:
def process_validation_data_csv():
    """
    Build the labelled validation dataset and save it as 'validation_ml_dataset.csv'.

    Sample IDs come from the 'Mapping ID' column of the two metadata workbooks;
    expression values come from the tab-separated 'validation_exon_tpm' table
    (genes in rows, samples in columns). The result has one row per sample that
    is present in the expression table, a leading 'sample_type' column, the gene
    columns, and a trailing 'label' column (1 = breast cancer, 0 = normal).
    Missing values in columns whose name starts with 'ENSG' are replaced by 0.

    Returns:
        The assembled DataFrame, indexed by sample ID.
    """
    print("=== Processing Validation Data (CSV Method) ===")

    # --- 1. Sample IDs for each class -------------------------------------
    print("\n1. Reading metadata files...")
    cancer_sheet = pd.read_excel('validation_bc_meta.xlsx')
    normal_sheet = pd.read_excel('validation_normal_meta.xlsx')

    bc_samples = cancer_sheet['Mapping ID'].tolist()
    normal_samples = normal_sheet['Mapping ID'].tolist()

    print(f"Breast cancer samples: {len(bc_samples)}")
    print(f"Normal samples: {len(normal_samples)}")
    print(f"Total samples: {len(bc_samples) + len(normal_samples)}")

    # --- 2. Expression table ----------------------------------------------
    print("\n2. Reading TPM expression data...")

    tpm_data = pd.read_csv('validation_exon_tpm', sep='\t', index_col=0)

    print(f"TPM data shape: {tpm_data.shape}")
    print(f"Number of genes: {len(tpm_data.index)}")
    print(f"Number of samples: {len(tpm_data.columns)}")
    print(f"First 5 genes: {list(tpm_data.index[:5])}")
    print(f"First 5 samples: {list(tpm_data.columns[:5])}")

    # --- 3. Split metadata IDs into found / not found ----------------------
    print("\n3. Checking sample matching...")
    available_samples = []
    missing_samples = []
    for candidate in bc_samples + normal_samples:
        if candidate in tpm_data.columns:
            available_samples.append(candidate)
        else:
            missing_samples.append(candidate)

    print(f"Available samples: {len(available_samples)}")
    print(f"Missing samples: {len(missing_samples)}")

    if missing_samples:
        print(f"Missing samples (first 10): {missing_samples[:10]}")

    # --- 4. Samples as rows -------------------------------------------------
    print("\n4. Creating final dataset...")

    selected_data = tpm_data[available_samples]
    print(f"Selected data shape: {selected_data.shape}")

    final_data = selected_data.T
    print(f"Transposed shape: {final_data.shape}")

    # --- 5. Class annotations -----------------------------------------------
    print("\n5. Adding labels...")

    sample_types = []
    labels = []
    for sample_id in final_data.index:
        # The breast-cancer list takes precedence, then the normal list;
        # an ID in neither list contributes nothing.
        if sample_id in bc_samples:
            kind, flag = 'breast_cancer', 1
        elif sample_id in normal_samples:
            kind, flag = 'normal', 0
        else:
            continue
        sample_types.append(kind)
        labels.append(flag)

    final_data.insert(0, 'sample_type', sample_types)
    final_data['label'] = labels

    print(f"Final data shape: {final_data.shape}")

    gene_cols = [name for name in final_data.columns if name.startswith('ENSG')]
    print(f"Number of genes: {len(gene_cols)}")
    print(f"Breast cancer samples: {sum(labels)}")
    print(f"Normal samples: {len(labels) - sum(labels)}")

    # --- 6. Fill gaps and persist -------------------------------------------
    print("\n6. Handling missing values and saving...")

    final_data[gene_cols] = final_data[gene_cols].fillna(0)

    output_file = 'validation_ml_dataset.csv'
    final_data.to_csv(output_file, index=True)

    print(f"Saved as: {output_file}")

    # --- 7. Preview and summary ---------------------------------------------
    print("\n7. Data preview:")
    preview_cols = ['sample_type', *gene_cols[:3], 'label']
    print(final_data[preview_cols].head())

    banner = "=" * 50
    print("\n" + banner)
    print("DATA PROCESSING COMPLETED!")
    print(banner)
    print(f"Total samples: {len(final_data)}")
    print(f"Gene features: {len(gene_cols)}")
    print(f"Breast cancer samples: {(final_data['label'] == 1).sum()}")
    print(f"Normal samples: {(final_data['label'] == 0).sum()}")
    print(f"Labels in last column (1=cancer, 0=normal)")
    print(f"\nUsage:")
    print(f"df = pd.read_csv('{output_file}', index_col=0)")
    print(f"X = df.iloc[:, 1:-1]  # Features (gene expression)")
    print(f"y = df.iloc[:, -1]    # Labels")

    return final_data

In [23]:
# Build the labelled validation dataset; this also writes 'validation_ml_dataset.csv'.
valid_dataset = process_validation_data_csv()


=== Processing Validation Data (CSV Method) ===

1. Reading metadata files...
Breast cancer samples: 83
Normal samples: 78
Total samples: 161

2. Reading TPM expression data...
TPM data shape: (60675, 161)
Number of genes: 60675
Number of samples: 161
First 5 genes: ['ENSG00000223972', 'ENSG00000227232', 'ENSG00000278267', 'ENSG00000243485', 'ENSG00000274890']
First 5 samples: ['BCSC_S5_01', 'BCSC_S5_02', 'BCSC_S5_03', 'BCSC_S5_04', 'BCSC_S5_05']

3. Checking sample matching...
Available samples: 161
Missing samples: 0

4. Creating final dataset...
Selected data shape: (60675, 161)
Transposed shape: (161, 60675)

5. Adding labels...
Final data shape: (161, 60677)
Number of genes: 60675
Breast cancer samples: 83
Normal samples: 78

6. Handling missing values and saving...
Saved as: validation_ml_dataset.csv

7. Data preview:
              sample_type  ENSG00000223972  ENSG00000227232  ENSG00000278267  \
BCSC_S5_40  breast_cancer              0.0              0.0              0.0   
BCSC

In [24]:
# Reload the saved validation table. No index_col is passed, so the sample IDs
# written as the CSV index come back as a regular 'Unnamed: 0' column.
validation_data = pd.read_csv('validation_ml_dataset.csv')
validation_data

Unnamed: 0.1,Unnamed: 0,sample_type,ENSG00000223972,ENSG00000227232,ENSG00000278267,ENSG00000243485,ENSG00000274890,ENSG00000237613,ENSG00000268020,ENSG00000240361,...,ENSG00000274264,ENSG00000276946,ENSG00000278858,ENSG00000277905,ENSG00000275028,ENSG00000278806,ENSG00000274152,ENSG00000276666,ENSG00000277917,label
0,BCSC_S5_40,breast_cancer,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,529.902945,0.0,0.000000,0.0,0.0,0.0,0.0,1
1,BCSC_S5_41,breast_cancer,0.000000,0.000000,0.0,17.463718,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1
2,BCSC_S5_42,breast_cancer,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,139.113739,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1
3,BCSC_S5_43,breast_cancer,0.000000,0.000000,0.0,0.000000,0.0,9.903500,0.0,0.000000,...,0.000000,125.753815,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1
4,BCSC_S5_44,breast_cancer,0.000000,0.000000,0.0,0.000000,0.0,9.572097,0.0,0.000000,...,0.000000,0.000000,142.297400,0.0,138.909366,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,BCSC_S6_35,normal,5.044197,6.477929,0.0,8.571677,0.0,0.000000,0.0,0.000000,...,91.163354,0.000000,106.727829,0.0,0.000000,0.0,0.0,0.0,0.0,0
157,BCSC_S6_36,normal,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,12.825653,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0
158,BCSC_S6_37,normal,0.000000,7.465956,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,492.024703,0.0,0.000000,0.0,0.0,0.0,0.0,0
159,BCSC_S6_38,normal,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0


In [26]:
# Peek at the first ten values of a single gene column.
validation_data['ENSG00000223972'].head(10)

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6    10.056905
7     0.000000
8     0.000000
9     0.000000
Name: ENSG00000223972, dtype: float64

In [27]:
# Look up one sample by its ID, which lives in the 'Unnamed: 0' column.
validation_data[validation_data['Unnamed: 0'].eq('BCSC_S5_01')]

Unnamed: 0.1,Unnamed: 0,sample_type,ENSG00000223972,ENSG00000227232,ENSG00000278267,ENSG00000243485,ENSG00000274890,ENSG00000237613,ENSG00000268020,ENSG00000240361,...,ENSG00000274264,ENSG00000276946,ENSG00000278858,ENSG00000277905,ENSG00000275028,ENSG00000278806,ENSG00000274152,ENSG00000276666,ENSG00000277917,label
83,BCSC_S5_01,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,356.983009,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# Put the columns in sorted name order (plain string sort of the column labels).
valid_df_sorted = validation_data.reindex(columns=sorted(validation_data.columns))


In [29]:
# Inspect the frame after the column sort.
valid_df_sorted

Unnamed: 0.1,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,Unnamed: 0,label,sample_type
0,9.581487,16.193307,14.400014,12.625902,27.671821,5.003114,13.870510,3.111496,8.214967,2.280349,...,9.294554,0.0,0.000000,0.000000,5.192954,17.955389,15.887401,BCSC_S5_40,1,breast_cancer
1,9.829358,0.000000,29.545080,12.952532,16.434977,7.698815,6.567387,22.343930,12.641231,11.696704,...,28.605010,0.0,0.000000,13.843522,0.000000,0.000000,32.596812,BCSC_S5_41,1,breast_cancer
2,2.944855,0.000000,11.064556,21.343035,6.714389,30.753987,4.918939,9.563136,14.202324,3.504308,...,0.000000,0.0,8.566337,0.000000,0.000000,0.000000,0.000000,BCSC_S5_42,1,breast_cancer
3,18.634303,0.000000,10.001960,12.277577,6.069566,27.800498,16.303994,8.644731,4.279463,6.335537,...,38.734865,0.0,11.615490,18.745910,18.034608,49.885811,0.000000,BCSC_S5_43,1,breast_cancer
4,7.718889,0.000000,0.000000,6.780989,17.599377,3.358776,10.028079,8.355451,9.651271,36.741181,...,0.000000,0.0,0.000000,0.000000,0.000000,48.216474,0.000000,BCSC_S5_44,1,breast_cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,23.157703,0.000000,43.504633,7.628954,16.133484,17.634362,9.670367,12.533737,11.375222,36.742826,...,9.360088,0.0,5.613651,0.000000,7.844352,18.081988,0.000000,BCSC_S6_35,0,normal
157,5.316919,0.000000,29.965486,5.254735,8.081860,24.292688,13.321673,0.000000,11.396539,9.490512,...,0.000000,0.0,30.932942,0.000000,7.204131,0.000000,0.000000,BCSC_S6_36,0,normal
158,2.224147,12.529822,16.713349,5.861692,3.380763,14.517136,7.430207,21.668113,8.342851,5.293365,...,0.000000,0.0,6.469857,0.000000,3.013596,0.000000,0.000000,BCSC_S6_37,0,normal
159,0.000000,7.656595,0.000000,7.163805,61.976456,0.000000,0.000000,0.000000,0.000000,19.407689,...,0.000000,0.0,19.767667,0.000000,0.000000,0.000000,0.000000,BCSC_S6_38,0,normal


In [30]:
# Remove the textual class column. Note that this cell cannot be re-run on its own:
# a second run raises KeyError because 'sample_type' is already gone.
valid_df_sorted = valid_df_sorted.drop(columns='sample_type')
valid_df_sorted

Unnamed: 0.1,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,Unnamed: 0,label
0,9.581487,16.193307,14.400014,12.625902,27.671821,5.003114,13.870510,3.111496,8.214967,2.280349,...,3.601495,9.294554,0.0,0.000000,0.000000,5.192954,17.955389,15.887401,BCSC_S5_40,1
1,9.829358,0.000000,29.545080,12.952532,16.434977,7.698815,6.567387,22.343930,12.641231,11.696704,...,3.694666,28.605010,0.0,0.000000,13.843522,0.000000,0.000000,32.596812,BCSC_S5_41,1
2,2.944855,0.000000,11.064556,21.343035,6.714389,30.753987,4.918939,9.563136,14.202324,3.504308,...,0.000000,0.000000,0.0,8.566337,0.000000,0.000000,0.000000,0.000000,BCSC_S5_42,1
3,18.634303,0.000000,10.001960,12.277577,6.069566,27.800498,16.303994,8.644731,4.279463,6.335537,...,0.000000,38.734865,0.0,11.615490,18.745910,18.034608,49.885811,0.000000,BCSC_S5_43,1
4,7.718889,0.000000,0.000000,6.780989,17.599377,3.358776,10.028079,8.355451,9.651271,36.741181,...,9.671270,0.000000,0.0,0.000000,0.000000,0.000000,48.216474,0.000000,BCSC_S5_44,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,23.157703,0.000000,43.504633,7.628954,16.133484,17.634362,9.670367,12.533737,11.375222,36.742826,...,10.880666,9.360088,0.0,5.613651,0.000000,7.844352,18.081988,0.000000,BCSC_S6_35,0
157,5.316919,0.000000,29.965486,5.254735,8.081860,24.292688,13.321673,0.000000,11.396539,9.490512,...,0.000000,0.000000,0.0,30.932942,0.000000,7.204131,0.000000,0.000000,BCSC_S6_36,0
158,2.224147,12.529822,16.713349,5.861692,3.380763,14.517136,7.430207,21.668113,8.342851,5.293365,...,0.000000,0.000000,0.0,6.469857,0.000000,3.013596,0.000000,0.000000,BCSC_S6_37,0
159,0.000000,7.656595,0.000000,7.163805,61.976456,0.000000,0.000000,0.000000,0.000000,19.407689,...,0.000000,0.000000,0.0,19.767667,0.000000,0.000000,0.000000,0.000000,BCSC_S6_38,0


In [31]:
# After the column sort the sample-ID column ('Unnamed: 0') sits near the end;
# move it back to the first position.
column_to_move = 'Unnamed: 0'
# pop() removes the column from the frame in place and returns it ...
moved_column = valid_df_sorted.pop(column_to_move)
# ... so it can be re-inserted at position 0.
valid_df_sorted.insert(0, column_to_move, moved_column)
valid_df_sorted

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,BCSC_S5_40,9.581487,16.193307,14.400014,12.625902,27.671821,5.003114,13.870510,3.111496,8.214967,...,0.0,3.601495,9.294554,0.0,0.000000,0.000000,5.192954,17.955389,15.887401,1
1,BCSC_S5_41,9.829358,0.000000,29.545080,12.952532,16.434977,7.698815,6.567387,22.343930,12.641231,...,0.0,3.694666,28.605010,0.0,0.000000,13.843522,0.000000,0.000000,32.596812,1
2,BCSC_S5_42,2.944855,0.000000,11.064556,21.343035,6.714389,30.753987,4.918939,9.563136,14.202324,...,0.0,0.000000,0.000000,0.0,8.566337,0.000000,0.000000,0.000000,0.000000,1
3,BCSC_S5_43,18.634303,0.000000,10.001960,12.277577,6.069566,27.800498,16.303994,8.644731,4.279463,...,0.0,0.000000,38.734865,0.0,11.615490,18.745910,18.034608,49.885811,0.000000,1
4,BCSC_S5_44,7.718889,0.000000,0.000000,6.780989,17.599377,3.358776,10.028079,8.355451,9.651271,...,0.0,9.671270,0.000000,0.0,0.000000,0.000000,0.000000,48.216474,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,BCSC_S6_35,23.157703,0.000000,43.504633,7.628954,16.133484,17.634362,9.670367,12.533737,11.375222,...,0.0,10.880666,9.360088,0.0,5.613651,0.000000,7.844352,18.081988,0.000000,0
157,BCSC_S6_36,5.316919,0.000000,29.965486,5.254735,8.081860,24.292688,13.321673,0.000000,11.396539,...,0.0,0.000000,0.000000,0.0,30.932942,0.000000,7.204131,0.000000,0.000000,0
158,BCSC_S6_37,2.224147,12.529822,16.713349,5.861692,3.380763,14.517136,7.430207,21.668113,8.342851,...,0.0,0.000000,0.000000,0.0,6.469857,0.000000,3.013596,0.000000,0.000000,0
159,BCSC_S6_38,0.000000,7.656595,0.000000,7.163805,61.976456,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,19.767667,0.000000,0.000000,0.000000,0.000000,0


In [32]:
# Check that the sample-ID column is now the first column.
valid_df_sorted.iloc[:,0]

0      BCSC_S5_40
1      BCSC_S5_41
2      BCSC_S5_42
3      BCSC_S5_43
4      BCSC_S5_44
          ...    
156    BCSC_S6_35
157    BCSC_S6_36
158    BCSC_S6_37
159    BCSC_S6_38
160    BCSC_S6_75
Name: Unnamed: 0, Length: 161, dtype: object

In [34]:
# Order the rows by sample ID; the original integer row labels are kept.
valid_df_sorted_rows = valid_df_sorted.sort_values(by = 'Unnamed: 0')
valid_df_sorted_rows

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
83,BCSC_S5_01,16.137049,27.272615,12.126183,19.137982,2.452875,4.213098,1.796968,20.961408,3.458892,...,0.0,0.000000,0.000000,0.0,14.082396,68.181537,4.372962,0.000000,0.000000,0
84,BCSC_S5_02,3.190581,0.000000,23.975619,8.408709,9.699538,8.330044,7.105850,5.180554,11.967979,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,8.646123,0.000000,0.000000,0
85,BCSC_S5_03,5.408192,15.233633,0.000000,19.598114,22.606640,10.589874,22.583931,17.562585,10.143155,...,0.0,0.000000,0.000000,0.0,19.664969,0.000000,3.663900,0.000000,0.000000,0
86,BCSC_S5_04,0.000000,6.542969,26.182719,15.304636,10.592439,15.161458,28.453281,3.771636,11.202603,...,0.0,4.365595,22.533007,0.0,0.000000,32.714847,6.294700,43.529672,19.258100,0
87,BCSC_S5_05,13.832632,15.585338,20.789059,14.582243,10.512986,3.611456,13.863201,8.984030,11.859811,...,0.0,0.000000,13.418393,0.0,8.047593,0.000000,0.000000,51.843790,22.936375,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,BCSC_S6_71,16.975094,5.976867,23.917374,4.194141,17.739287,24.929421,7.088587,3.445312,10.233346,...,0.0,3.987881,10.291718,0.0,3.086195,0.000000,0.000000,19.881729,0.000000,1
80,BCSC_S6_72,11.372292,10.677711,49.849961,16.234527,4.321547,24.742537,9.497854,12.310143,10.156632,...,0.0,0.000000,9.193110,0.0,2.756754,0.000000,5.136276,17.759416,0.000000,1
81,BCSC_S6_73,23.722124,0.000000,69.323284,15.629788,20.032370,41.289516,10.272953,8.559481,9.886944,...,0.0,0.000000,12.784294,0.0,0.000000,0.000000,3.571352,0.000000,0.000000,1
82,BCSC_S6_74,8.169535,0.000000,10.231660,10.765318,16.557217,24.884081,24.259524,48.637935,11.673982,...,0.0,0.000000,0.000000,0.0,7.921497,38.352838,3.689756,0.000000,22.576991,1


In [35]:
# Save the sorted validation table; index=False leaves the integer row labels out of the file.
valid_df_sorted_rows.to_csv('valid_sorted.csv', index = False)

In [36]:
# Read the processed training data back from disk; it is combined with the
# validation data in the cells below. Sample names arrive in 'Unnamed: 0'
# because no index_col is passed.
train_df = pd.read_csv('processed_gene_expression_data_safe.csv')
train_df

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,S01,7.071605,13.279391,0.000000,6.212355,34.038592,9.231355,32.811263,19.136966,12.631348,...,0.0,4.430132,0.000000,0.0,6.856902,16.599238,22.357146,0.000000,19.542796,1
1,S02,60.610797,47.424080,0.000000,2.218590,35.828348,4.395669,22.498053,0.000000,5.413170,...,0.0,0.000000,48.996429,0.0,9.795095,0.000000,4.562460,0.000000,0.000000,1
2,S03,58.255903,60.455497,23.040206,12.120963,55.926653,8.005046,13.657227,14.935300,16.430065,...,0.0,0.000000,14.871406,0.0,13.378542,0.000000,12.463189,0.000000,0.000000,1
3,S04,29.917356,6.482332,8.646690,6.065120,10.494273,0.000000,23.064209,18.683413,9.865584,...,0.0,0.000000,33.486271,0.0,13.388781,0.000000,6.236364,0.000000,0.000000,1
4,S05,24.500322,53.675826,20.456404,16.142519,14.482668,0.000000,59.112604,8.840272,55.432674,...,0.0,0.000000,26.407358,0.0,19.797049,57.509813,3.688509,0.000000,45.138720,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,N28,20.452887,64.812467,19.211649,5.053420,13.601410,3.337430,19.928695,37.360570,13.699906,...,0.0,0.000000,0.000000,0.0,0.000000,36.006926,27.712531,47.910042,0.000000,0
124,N29,16.581454,52.544413,38.937911,6.828136,4.725798,10.822811,3.462104,6.730835,7.774700,...,0.0,0.000000,10.053061,0.0,9.043886,0.000000,2.808369,0.000000,0.000000,0
125,N30,9.340377,0.000000,35.094127,7.692614,15.972327,6.096519,3.900425,0.000000,16.266748,...,0.0,0.000000,11.325832,0.0,6.792593,32.887121,9.491771,0.000000,0.000000,0
126,N31,5.074037,100.046777,19.064423,3.343129,19.281682,16.559268,15.538266,8.238725,6.797459,...,0.0,0.000000,0.000000,0.0,3.689987,0.000000,0.000000,23.771445,0.000000,0


In [37]:
# Read the sorted validation data written to 'valid_sorted.csv'.
valid_df = pd.read_csv('valid_sorted.csv')
valid_df

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,BCSC_S5_01,16.137049,27.272615,12.126183,19.137982,2.452875,4.213098,1.796968,20.961408,3.458892,...,0.0,0.000000,0.000000,0.0,14.082396,68.181537,4.372962,0.000000,0.000000,0
1,BCSC_S5_02,3.190581,0.000000,23.975619,8.408709,9.699538,8.330044,7.105850,5.180554,11.967979,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,8.646123,0.000000,0.000000,0
2,BCSC_S5_03,5.408192,15.233633,0.000000,19.598114,22.606640,10.589874,22.583931,17.562585,10.143155,...,0.0,0.000000,0.000000,0.0,19.664969,0.000000,3.663900,0.000000,0.000000,0
3,BCSC_S5_04,0.000000,6.542969,26.182719,15.304636,10.592439,15.161458,28.453281,3.771636,11.202603,...,0.0,4.365595,22.533007,0.0,0.000000,32.714847,6.294700,43.529672,19.258100,0
4,BCSC_S5_05,13.832632,15.585338,20.789059,14.582243,10.512986,3.611456,13.863201,8.984030,11.859811,...,0.0,0.000000,13.418393,0.0,8.047593,0.000000,0.000000,51.843790,22.936375,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,BCSC_S6_71,16.975094,5.976867,23.917374,4.194141,17.739287,24.929421,7.088587,3.445312,10.233346,...,0.0,3.987881,10.291718,0.0,3.086195,0.000000,0.000000,19.881729,0.000000,1
157,BCSC_S6_72,11.372292,10.677711,49.849961,16.234527,4.321547,24.742537,9.497854,12.310143,10.156632,...,0.0,0.000000,9.193110,0.0,2.756754,0.000000,5.136276,17.759416,0.000000,1
158,BCSC_S6_73,23.722124,0.000000,69.323284,15.629788,20.032370,41.289516,10.272953,8.559481,9.886944,...,0.0,0.000000,12.784294,0.0,0.000000,0.000000,3.571352,0.000000,0.000000,1
159,BCSC_S6_74,8.169535,0.000000,10.231660,10.765318,16.557217,24.884081,24.259524,48.637935,11.673982,...,0.0,0.000000,0.000000,0.0,7.921497,38.352838,3.689756,0.000000,22.576991,1


In [38]:
# Class balance of the training data (1 = cancer, 0 = normal).
train_df['label'].value_counts()

label
1    96
0    32
Name: count, dtype: int64

In [39]:
# Class balance of the validation data (1 = cancer, 0 = normal).
valid_df['label'].value_counts()

label
1    83
0    78
Name: count, dtype: int64

In [40]:
# Stack training rows on top of validation rows. pd.concat matches columns by
# name, and ignore_index=True renumbers the rows from 0.
combined_df = pd.concat([train_df,valid_df],ignore_index=True)
combined_df

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,S01,7.071605,13.279391,0.000000,6.212355,34.038592,9.231355,32.811263,19.136966,12.631348,...,0.0,4.430132,0.000000,0.0,6.856902,16.599238,22.357146,0.000000,19.542796,1
1,S02,60.610797,47.424080,0.000000,2.218590,35.828348,4.395669,22.498053,0.000000,5.413170,...,0.0,0.000000,48.996429,0.0,9.795095,0.000000,4.562460,0.000000,0.000000,1
2,S03,58.255903,60.455497,23.040206,12.120963,55.926653,8.005046,13.657227,14.935300,16.430065,...,0.0,0.000000,14.871406,0.0,13.378542,0.000000,12.463189,0.000000,0.000000,1
3,S04,29.917356,6.482332,8.646690,6.065120,10.494273,0.000000,23.064209,18.683413,9.865584,...,0.0,0.000000,33.486271,0.0,13.388781,0.000000,6.236364,0.000000,0.000000,1
4,S05,24.500322,53.675826,20.456404,16.142519,14.482668,0.000000,59.112604,8.840272,55.432674,...,0.0,0.000000,26.407358,0.0,19.797049,57.509813,3.688509,0.000000,45.138720,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,BCSC_S6_71,16.975094,5.976867,23.917374,4.194141,17.739287,24.929421,7.088587,3.445312,10.233346,...,0.0,3.987881,10.291718,0.0,3.086195,0.000000,0.000000,19.881729,0.000000,1
285,BCSC_S6_72,11.372292,10.677711,49.849961,16.234527,4.321547,24.742537,9.497854,12.310143,10.156632,...,0.0,0.000000,9.193110,0.0,2.756754,0.000000,5.136276,17.759416,0.000000,1
286,BCSC_S6_73,23.722124,0.000000,69.323284,15.629788,20.032370,41.289516,10.272953,8.559481,9.886944,...,0.0,0.000000,12.784294,0.0,0.000000,0.000000,3.571352,0.000000,0.000000,1
287,BCSC_S6_74,8.169535,0.000000,10.231660,10.765318,16.557217,24.884081,24.259524,48.637935,11.673982,...,0.0,0.000000,0.000000,0.0,7.921497,38.352838,3.689756,0.000000,22.576991,1


In [41]:
# Class balance of the combined data (1 = cancer, 0 = normal).
combined_df['label'].value_counts()

label
1    179
0    110
Name: count, dtype: int64

In [42]:
# Give the sample-ID column a meaningful name instead of the 'Unnamed: 0' that read_csv assigned.
combined_df = combined_df.rename(columns={'Unnamed: 0':'sample_num'})
combined_df

Unnamed: 0,sample_num,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283101,ENSG00000283103,ENSG00000283108,ENSG00000283110,ENSG00000283117,ENSG00000283118,ENSG00000283122,ENSG00000283123,ENSG00000283125,label
0,S01,7.071605,13.279391,0.000000,6.212355,34.038592,9.231355,32.811263,19.136966,12.631348,...,0.0,4.430132,0.000000,0.0,6.856902,16.599238,22.357146,0.000000,19.542796,1
1,S02,60.610797,47.424080,0.000000,2.218590,35.828348,4.395669,22.498053,0.000000,5.413170,...,0.0,0.000000,48.996429,0.0,9.795095,0.000000,4.562460,0.000000,0.000000,1
2,S03,58.255903,60.455497,23.040206,12.120963,55.926653,8.005046,13.657227,14.935300,16.430065,...,0.0,0.000000,14.871406,0.0,13.378542,0.000000,12.463189,0.000000,0.000000,1
3,S04,29.917356,6.482332,8.646690,6.065120,10.494273,0.000000,23.064209,18.683413,9.865584,...,0.0,0.000000,33.486271,0.0,13.388781,0.000000,6.236364,0.000000,0.000000,1
4,S05,24.500322,53.675826,20.456404,16.142519,14.482668,0.000000,59.112604,8.840272,55.432674,...,0.0,0.000000,26.407358,0.0,19.797049,57.509813,3.688509,0.000000,45.138720,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,BCSC_S6_71,16.975094,5.976867,23.917374,4.194141,17.739287,24.929421,7.088587,3.445312,10.233346,...,0.0,3.987881,10.291718,0.0,3.086195,0.000000,0.000000,19.881729,0.000000,1
285,BCSC_S6_72,11.372292,10.677711,49.849961,16.234527,4.321547,24.742537,9.497854,12.310143,10.156632,...,0.0,0.000000,9.193110,0.0,2.756754,0.000000,5.136276,17.759416,0.000000,1
286,BCSC_S6_73,23.722124,0.000000,69.323284,15.629788,20.032370,41.289516,10.272953,8.559481,9.886944,...,0.0,0.000000,12.784294,0.0,0.000000,0.000000,3.571352,0.000000,0.000000,1
287,BCSC_S6_74,8.169535,0.000000,10.231660,10.765318,16.557217,24.884081,24.259524,48.637935,11.673982,...,0.0,0.000000,0.000000,0.0,7.921497,38.352838,3.689756,0.000000,22.576991,1


In [43]:
# Save the combined table; index=False leaves the integer row labels out of the file.
combined_df.to_csv('combined_data.csv', index = False)

In [44]:
# Compact view of the wide table: the first three and the last three columns.
edge_positions = [0, 1, 2, -3, -2, -1]
selected_columns = combined_df.iloc[:, edge_positions]
selected_columns

Unnamed: 0,sample_num,ENSG00000000003,ENSG00000000005,ENSG00000283123,ENSG00000283125,label
0,S01,7.071605,13.279391,0.000000,19.542796,1
1,S02,60.610797,47.424080,0.000000,0.000000,1
2,S03,58.255903,60.455497,0.000000,0.000000,1
3,S04,29.917356,6.482332,0.000000,0.000000,1
4,S05,24.500322,53.675826,0.000000,45.138720,1
...,...,...,...,...,...,...
284,BCSC_S6_71,16.975094,5.976867,19.881729,0.000000,1
285,BCSC_S6_72,11.372292,10.677711,17.759416,0.000000,1
286,BCSC_S6_73,23.722124,0.000000,0.000000,0.000000,1
287,BCSC_S6_74,8.169535,0.000000,0.000000,22.576991,1
