In [31]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder

In [55]:
meta = pd.read_csv('metadata', sep='\t', header=None, usecols=(1, 4),
                  names=['tissue_type', 'filename'])
meta = meta[['filename', 'tissue_type']]
meta.set_index('filename', inplace=True)

In [56]:
meta.head()

Unnamed: 0_level_0,tissue_type
filename,Unnamed: 1_level_1
76a194e5-ba36-4e81-b0b4-6c22a69fe9b5.FPKM-UQ.txt,Prostate
e8caa240-51e5-41c2-8340-32613692be11.FPKM-UQ.txt,Uterus
5e79c5a9-5341-44e1-95fb-392020416b0c.FPKM-UQ.txt,Head and Neck
f67f1243-3126-4dea-98bf-186ad3b3e164.FPKM-UQ.txt,Lung
a842f65f-8e47-4ddb-81aa-a4acff6b4963.FPKM-UQ.txt,Kidney


In [35]:
def read_data_matrix(directories):
    """
    returns P * N dataframe
    """
    dataframes = []
    for idx, directory in enumerate(directories):
        files = os.listdir(directory)
        for f in files:
            if not f.endswith('.txt'):
                continue
            df = pd.read_csv(os.path.join(directory, f), 
                             sep='\t', header=None, 
                             index_col=0, # use gene name as index
                             names=['gene', f])
            dataframes.append(df)
    ret_df = pd.concat(dataframes, axis=1, join='outer')
    ret_df.replace(0, np.nan, inplace=True)
    ret_df.dropna(inplace=True)
    return ret_df

In [None]:
raw_data = read_data_matrix(['normal', 'tumor'])

In [96]:
# data cleaning
data = raw_data.T # N samples * P genes
data.index.name = ''
data.head()

Unnamed: 0,ENSG00000167578.15,ENSG00000078237.5,ENSG00000146083.10,ENSG00000198242.12,ENSG00000134108.11,ENSG00000167700.7,ENSG00000060642.9,ENSG00000182141.8,ENSG00000070087.12,ENSG00000153561.11,...,ENSG00000102265.10,ENSG00000133313.13,ENSG00000134470.18,ENSG00000066044.12,ENSG00000146587.16,ENSG00000107863.15,ENSG00000213782.6,ENSG00000146707.13,ENSG00000105063.17,ENSG00000123685.7
,,,,,,,,,,,,,,,,,,,,,
40633fb7-6665-4897-ba2d-76502e35dcd7.FPKM-UQ.txt,41750.382902,29613.284182,242694.822369,4404930.0,617138.206808,140585.532172,196000.665729,70319.409607,483915.9,351589.58978,...,1048973.0,291238.926603,62922.283775,220089.341351,121567.210407,222614.067196,8138.484935,94759.43331,192157.08034,6180.046836
a5f5e1b6-9947-442f-a106-9df71b3a2e75.FPKM-UQ.txt,130586.545351,99342.957541,155771.627865,4134060.0,775251.514905,358731.119593,143534.163649,30569.32381,943497.0,255604.120338,...,5046872.0,480726.888413,66035.949443,361731.291654,55191.476427,82130.468636,3468.442509,294805.427214,293803.777071,4295.601486
7a43da66-e219-4bc3-bddb-ef014247039f.FPKM-UQ.txt,35929.344943,42080.786011,218765.402959,1805149.0,704187.049221,268076.434384,203173.161043,37263.082117,1145943.0,406267.433117,...,750603.4,940821.791149,44125.526617,242554.589386,80236.772576,176382.078568,3313.375121,59382.485492,188092.386563,4888.05627
35a1df51-9daf-4d8b-ab87-222b8562244a.FPKM-UQ.txt,71857.934941,88079.139107,223454.130427,2232057.0,784351.55195,113692.663006,112726.358554,25641.68312,292236.9,251698.520315,...,2527977.0,514139.769404,165443.704645,230045.432902,59557.966856,208662.777117,6153.495185,89015.633421,305076.823055,30112.201408
e00ee63c-c3d2-4233-8d54-bce01194b003.FPKM-UQ.txt,60051.980995,89511.02358,214838.402284,2012515.0,906064.154904,52545.318664,160018.819801,45057.888307,177894.9,272423.464071,...,1288950.0,403705.220365,196741.850548,206068.334523,73066.349681,283974.940763,11439.902379,108846.071001,253022.905431,41154.975018


In [102]:
joined = meta.join(data, how='inner')

In [103]:
joined.shape # all files that have meta data are kept

(726, 12826)

In [105]:
joined.head()

Unnamed: 0,tissue_type,ENSG00000167578.15,ENSG00000078237.5,ENSG00000146083.10,ENSG00000198242.12,ENSG00000134108.11,ENSG00000167700.7,ENSG00000060642.9,ENSG00000182141.8,ENSG00000070087.12,...,ENSG00000102265.10,ENSG00000133313.13,ENSG00000134470.18,ENSG00000066044.12,ENSG00000146587.16,ENSG00000107863.15,ENSG00000213782.6,ENSG00000146707.13,ENSG00000105063.17,ENSG00000123685.7
00511204-3512-4a5e-b664-60271e968903.FPKM-UQ.txt,Breast,58706.097676,59308.311702,116287.227772,2579753.0,794740.667981,98870.083211,139351.744403,31090.38291,367873.7,...,840653.1,240754.1,143306.098393,213308.580201,61769.139209,591769.157173,6699.465741,67974.750447,169984.933794,9115.193832
011ee3e1-37bd-47c1-9092-a24dffd3d5f5.FPKM-UQ.txt,Kidney,55567.147367,98834.60241,192328.896144,1824004.0,721207.383752,213800.784191,268950.226904,30765.651437,1187112.0,...,1098040.0,1418258.0,68640.388542,251803.620298,57674.90012,128519.277952,3809.959233,83159.883967,244710.156932,9437.127183
019a5486-ad80-44b4-a9e3-7e7f93e8e6fa.FPKM-UQ.txt,Breast,32650.638627,77858.126921,165002.953526,3374387.0,702902.33691,93852.076548,152213.411411,60703.167963,440081.5,...,2444548.0,299712.7,136271.659045,233223.003487,103141.323122,321140.760208,7004.914217,137498.18804,155252.584082,22689.653807
01f17467-b7de-49d4-a9d7-00108f4de1f9.FPKM-UQ.txt,Breast,46263.64844,43432.105001,149757.369183,2553905.0,642524.310636,114740.186294,142536.822409,43634.181927,450335.8,...,2329672.0,198654.7,107037.914998,209128.076025,65932.440055,377940.01032,7278.856836,205427.66521,228189.60415,12098.714838
027ec9be-7e41-44df-8f61-1e5698303eac.FPKM-UQ.txt,Lung,111199.127008,89621.73919,251720.417448,2582028.0,709634.798448,168043.957816,105155.734942,21954.956239,154012.7,...,4271145.0,508895.4,133262.715484,241731.31004,45693.720917,149426.860742,6882.412177,63569.126888,353593.231754,39433.510038


In [107]:
# joined.to_csv('data/p4.csv', index=False)

In [114]:
meta.to_csv('data/metadata.csv', index=False)

# Produce X and Y

In [110]:
joined.drop(columns='tissue_type').values.shape

(726, 12825)

In [111]:
mat = joined.drop(columns='tissue_type').values
scaler = MaxAbsScaler()
mat = scaler.fit_transform(mat)

In [155]:
# reshape for Conv1d
mat = np.expand_dims(mat, axis=2).astype('float32')

In [156]:
# save 100 datapoints for testing
np.save('data/p4_train', mat[:-100])
np.save('data/p4_test', mat[-100:])

In [121]:
labels = joined['tissue_type'].values

In [122]:
labels

array(['Breast', 'Kidney', 'Breast', 'Breast', 'Lung', 'Kidney', 'Kidney',
       'Lung', 'Uterus', 'Prostate', 'Lung', 'Uterus', 'Bile Duct',
       'Uterus', 'Thyroid', 'Breast', 'Kidney', 'Prostate', 'Prostate',
       'Breast', 'Thyroid', 'Thyroid', 'Kidney', 'Breast', 'Kidney',
       'Thyroid', 'Lung', 'Colorectal', 'Prostate', 'Kidney',
       'Colorectal', 'Esophagus', 'Kidney', 'Breast', 'Colorectal',
       'Colorectal', 'Colorectal', 'Breast', 'Kidney', 'Kidney',
       'Bladder', 'Lung', 'Thyroid', 'Kidney', 'Thyroid', 'Head and Neck',
       'Prostate', 'Lung', 'Kidney', 'Stomach', 'Kidney', 'Kidney',
       'Esophagus', 'Kidney', 'Lung', 'Head and Neck', 'Kidney', 'Lung',
       'Kidney', 'Liver', 'Breast', 'Thyroid', 'Prostate', 'Kidney',
       'Liver', 'Stomach', 'Prostate', 'Stomach', 'Kidney', 'Breast',
       'Breast', 'Kidney', 'Colorectal', 'Lung', 'Thyroid', 'Lung',
       'Colorectal', 'Head and Neck', 'Prostate', 'Kidney', 'Liver',
       'Head and Neck', 'Lung

In [125]:
np.unique(labels), np.unique(labels).shape

(array(['Adrenal Gland', 'Bile Duct', 'Bladder', 'Breast', 'Cervix',
        'Colorectal', 'Esophagus', 'Head and Neck', 'Kidney', 'Liver',
        'Lung', 'Pancreas', 'Prostate', 'Soft Tissue', 'Stomach', 'Thymus',
        'Thyroid', 'Uterus'], dtype=object),
 (18,))

In [141]:
enc = OneHotEncoder(handle_unknown='ignore')
y = enc.fit_transform(labels.reshape(-1, 1))
y = y.toarray().astype('float32')

In [153]:
np.save('data/p4_train_labels', y[:-100])
np.save('data/p4_test_labels', y[-100:])