In [24]:
# Import sklearn/tensorflow modules.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# TODO: import models after rereading ml book

# Import other modules.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from os import walk
from time import sleep
from IPython.display import clear_output, display
%matplotlib inline

# Load the training expression table, the independent (test) expression table
# and the table of actual labels from the local data folder.
cancer_train = pd.read_csv('data/data_set_ALL_AML_train.csv')
cancer_test = pd.read_csv('data/data_set_ALL_AML_independent.csv')
labels = pd.read_csv('data/actual.csv')

# Report (rows, columns) for each table, one per line: train, test, labels.
print(cancer_train.shape, cancer_test.shape, labels.shape, sep='\n')


(7129, 78)
(7129, 70)
(72, 2)


#### Explanation of the dataset:
* already split into a train and a test dataset
* `ALL` = acute lymphoblastic leukemia 
* `AML` = acute myeloid leukemia

#### Clean and prepare datasets

In [25]:
# Gene accession numbers, one entry per row of the training table.
genes = cancer_train['Gene Accession Number']

# Label-based column slices: keep everything from column '1' (train) and
# from column '39' (test) through the last column.
train_df = cancer_train.loc[:, '1':]
test_df = cancer_test.loc[:, '39':]

# Shapes, one per line: train slice, test slice, gene identifiers.
print(train_df.shape, test_df.shape, genes.shape, sep='\n')


(7129, 76)
(7129, 68)
(7129,)


In [26]:
def rename_columns(df):
    """Label every call column with the patient ID of the column before it.

    A column counts as a call column when its name contains "call". Each one
    is renamed to ``Call_<name of the column directly to its left>``. The
    frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which each call column directly follows its patient column.
    """
    names = list(df.columns)
    renamed = {}
    for position, col in enumerate(names):
        # A call column in first position has no patient column before it;
        # leave it untouched instead of wrapping around to the last column.
        if position == 0 or "call" not in col:
            continue
        # Read the neighbour from the running list so an already relabelled
        # neighbour is seen under its new name.
        names[position] = f'Call_{names[position - 1]}'
        renamed[col] = names[position]
    # Apply all new labels with a single rename instead of one per column.
    df.rename(columns=renamed, inplace=True)
            
            
rename_columns(df=cancer_train)
rename_columns(df=cancer_test)

# Check for duplicate columns
#print(cancer_test.groupby(["Gene Description"]).size().value_counts(),
#      cancer_train.groupby(["Gene Description"]).size().value_counts())


def _samples_by_gene(expression, dataset_label):
    """Transpose a loaded expression table so each original column is a row.

    Parameters
    ----------
    expression : pandas.DataFrame
        Table with a "Gene Accession Number" column; its first two columns
        are dropped as rows after transposing.
    dataset_label : str
        Value written to the new 'dataset' column (e.g. 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        Transposed table whose columns are named by gene accession number,
        plus a trailing 'dataset' column.
    """
    # Gene description and gene accession should be kept together, otherwise
    # there will be duplicates; the accession number is used as the gene label.
    labelled = expression.assign(Gene=expression["Gene Accession Number"])
    transposed = labelled.T
    # The appended "Gene" column is now the last row: use it as the header.
    transposed.columns = transposed.iloc[-1]
    # Drop the first two rows and the trailing "Gene" row. Copy so that adding
    # a column below does not assign on a slice (SettingWithCopyWarning).
    samples = transposed.iloc[2:-1].copy()
    samples['dataset'] = dataset_label
    return samples


# Transpose the datasets, fix the columns and label train and test set with a
# new column. NOTE: this rebinds cancer_train / cancer_test to the transposed
# frames, so the cell expects the freshly loaded tables as input.
cancer_train = _samples_by_gene(cancer_train, 'train')
cancer_test = _samples_by_gene(cancer_test, 'test')

df = pd.concat([cancer_train, cancer_test], axis=0, join='inner', sort=False)
print(df.shape)

# Remove call rows (after transposing, the call columns became rows)
call_rows = [row for row in df.index if "Call" in row]
df = df.drop(call_rows)
print(df.shape)

display(df)

print(df.columns)
print(df.index)

(144, 7130)
(72, 7130)


Gene,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,AFFX-BioB-5_st,...,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at,dataset
1,-214,-153,-58,88,-295,-558,199,-176,252,206,...,511,-125,389,-37,793,329,36,191,-37,train
2,-139,-73,-1,283,-264,-400,-330,-168,101,74,...,837,-36,442,-17,782,295,11,76,-14,train
3,-76,-49,-307,309,-376,-650,33,-367,206,-215,...,1199,33,168,52,1138,777,41,228,-41,train
4,-135,-114,265,12,-419,-585,158,-253,49,31,...,835,218,174,-110,627,170,-50,126,-91,train
5,-106,-125,-76,168,-230,-284,4,-122,70,252,...,649,57,504,-26,250,314,14,56,-25,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,-62,-198,-5,141,-256,-206,-298,-218,-14,100,...,532,-34,239,-78,707,354,-22,260,5,test
66,-58,-217,63,95,-191,-230,-86,-152,-6,-249,...,297,36,358,2,423,41,0,1777,-49,test
63,-161,-215,-46,146,-172,-596,-122,-341,171,-147,...,639,-27,548,-39,809,445,-2,210,16,test
64,-48,-531,-124,431,-496,-696,-1038,-441,235,157,...,1141,-121,197,-108,466,349,0,284,-73,test


Index(['AFFX-BioB-5_at', 'AFFX-BioB-M_at', 'AFFX-BioB-3_at', 'AFFX-BioC-5_at',
       'AFFX-BioC-3_at', 'AFFX-BioDn-5_at', 'AFFX-BioDn-3_at',
       'AFFX-CreX-5_at', 'AFFX-CreX-3_at', 'AFFX-BioB-5_st',
       ...
       'U58516_at', 'U73738_at', 'X06956_at', 'X16699_at', 'X83863_at',
       'Z17240_at', 'L49218_f_at', 'M71243_f_at', 'Z78285_f_at', 'dataset'],
      dtype='object', name='Gene', length=7130)
Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '34', '35', '36', '37', '38', '28', '29', '30', '31', '32',
       '33', '39', '40', '42', '47', '48', '49', '41', '43', '44', '45', '46',
       '70', '71', '72', '68', '69', '67', '55', '56', '59', '52', '53', '51',
       '50', '54', '57', '58', '60', '61', '65', '66', '63', '64', '62'],
      dtype='object')


#### Data Analysis

In [29]:
# Number of rows in the label table for each value of the 'cancer' column.
class_counts = labels['cancer'].value_counts()
print(class_counts)

ALL    47
AML    25
Name: cancer, dtype: int64