In [104]:
#basic data analysis packages
import numpy as np
import pandas as pd

#basic data visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [105]:
# load exp, mirna and gtex data

# Registry of the delimited files provided for the project.
#   key   -> filename (without its extension)
#   value -> [first column name, dataset name, separator]
data = dict(
    exp=['gene', 'gene_data', ' '],
    mirna=['micro_rna', 'micro_rna_data', ' '],
    gtex_data=['ensemble_id', 'gtex_data', '\t'],
)

# Loaded tables, keyed by dataset name; load_dataframe adds entries to it.
dataframes = {}


def load_dataframe(filename, values):
    """Load '<filename>.csv' and store the table in the ``dataframes`` dict.

    The column names are taken from the file's first line; every following
    line is parsed as data using ``values[2]`` as the separator.

    Parameters
    ----------
    filename : str
        Name of the file without its '.csv' extension.
    values : list
        ``[first column name, dataset name, separator]``. The dataset name is
        the key under which the table is stored in ``dataframes``.
    """
    first_column, dataset_name, separator = values[0], values[1], values[2]
    path = '{}.csv'.format(filename)

    # Read the header as raw text instead of through read_csv: a comma-based
    # parse cuts the line short at the first comma found in a column name and
    # silently loses the remaining titles.
    # 'utf-8-sig' drops a leading byte-order mark if the file has one.
    with open(path, encoding='utf-8-sig') as handle:
        header_line = handle.readline().rstrip('\r\n')
    names = header_line.replace('"', '').split(separator)  # column title list

    if filename == 'gtex_data':
        # the first header entry is replaced by the configured column name
        names[0] = first_column
    else:
        # the configured column name is prepended to the header entries
        # (presumably these headers do not label the first data column -- TODO confirm)
        names = [first_column] + names

    # Skip the header line and parse the rest with the names built above.
    dataframes[dataset_name] = pd.read_csv(path, names=names, skiprows=1, sep=separator)

# Load every file registered in `data`; each call stores its table in `dataframes`.
for filename, values in data.items():
    load_dataframe(filename, values)
 


In [106]:
# Removing rows from 'exp' whose gene name begins with the literal prefix "X.."
# (following discussion with Nimrod, they are irrelevant).
# str.startswith does a plain-text prefix match. A regex search for "X.." would
# treat each dot as "any character" and match anywhere in the name, which also
# drops real genes that merely contain an "X" (e.g. FOXA1, XBP1).
# na=False keeps rows whose gene name is missing instead of failing on them.
dataframes['gene_data'] = dataframes['gene_data'].loc[
    lambda genes: ~genes['gene'].str.startswith('X..', na=False)
]

In [107]:
# load survival data

# Survival data needs to be loaded in a different way (based on data
# observation), so it is read here directly and then added to the dataframes
# dictionary for consistency.
# The column named "Death" has been renamed as "isDead" for clarity.
survival = pd.read_csv(
    'survival.csv',
    sep='\t',
    skiprows=1,  # skip the first line; the names below are used instead
    names=['PatientID', 'Survival', 'isDead'],
)

dataframes['survival_data'] = survival

In [108]:
# Preview the first five rows of the gene-expression ('exp') table.
dataframes['gene_data'].head()

Unnamed: 0,gene,TCGA.3N.A9WB.06,TCGA.3N.A9WC.06,TCGA.3N.A9WD.06,TCGA.BF.A1PU.01,TCGA.BF.A1PV.01,TCGA.BF.A1PX.01,TCGA.BF.A1PZ.01,TCGA.BF.A1Q0.01,TCGA.BF.A3DJ.01,...,TCGA.XV.AB01.06,TCGA.YD.A89C.06,TCGA.YD.A9TA.06,TCGA.YD.A9TB.06,TCGA.YG.AA3N.01,TCGA.YG.AA3O.06,TCGA.YG.AA3P.06,TCGA.Z2.A8RT.06,TCGA.Z2.AA3S.06,TCGA.Z2.AA3V.06
29,A1BG.1,381.0662,195.1822,360.8794,176.3994,216.847,285.0608,424.5848,327.8758,232.2936,...,428.9256,184.5297,88.4289,264.5184,238.8846,207.9831,33.5642,336.407,349.4478,717.2209
30,A1CF.29974,0.0,0.0,0.7092,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.3482,0.0,0.0,0.0,0.0,0.3876,0.0,0.0
31,A2BP1.54715,0.0,0.0,6.383,1.2987,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5522,0.338,66.0629,0.0,0.288,0.6539
32,A2LD1.87769,250.1979,160.7548,97.1986,163.2338,60.8727,125.69,78.8939,67.7049,92.9554,...,47.5207,102.4404,36.9476,146.3544,88.2441,86.2831,50.0426,37.3178,152.9786,50.7438
33,A2ML1.144568,7.2698,0.0,0.0,7.7922,0.5977,3277.0247,26.8887,5.6324,984.6631,...,0.0,0.0,0.0,0.0,1701.27,0.0,0.0,0.0,0.288,0.0


In [109]:
# Preview the first five rows of the micro-RNA ('mirna') table.
dataframes['micro_rna_data'].head()

Unnamed: 0,micro_rna,TCGA.3N.A9WC.06,TCGA.3N.A9WD.06,TCGA.BF.A1PU.01,TCGA.BF.A1PV.01,TCGA.BF.A1PX.01,TCGA.BF.A1PZ.01,TCGA.BF.A1Q0.01,TCGA.BF.A3DJ.01,TCGA.BF.A3DL.01,...,TCGA.XV.AB01.06,TCGA.YD.A89C.06,TCGA.YD.A9TA.06,TCGA.YD.A9TB.06,TCGA.YG.AA3N.01,TCGA.YG.AA3O.06,TCGA.YG.AA3P.06,TCGA.Z2.A8RT.06,TCGA.Z2.AA3S.06,TCGA.Z2.AA3V.06
0,hsa.let.7a.1,14676.834734,25230.440999,23563.756602,14330.771178,13740.462752,16199.355403,10114.61088,16712.764049,35403.380723,...,10317.62848,4797.673951,9765.591431,7878.460725,13157.578121,12493.132639,23455.671756,10805.087024,12216.751081,24025.124723
1,hsa.let.7a.2,29175.042697,50426.377848,47180.136901,28590.374972,27700.049991,32326.559988,20268.091283,33860.575382,70851.588875,...,20559.441088,9682.578337,19545.557812,15938.581844,26378.793396,24907.572969,46879.348693,21513.398318,24460.75902,47953.696317
2,hsa.let.7a.3,14729.348617,25459.749831,23733.441179,14277.736854,13996.43118,16282.583317,10174.93361,17208.99796,35473.928808,...,10285.834727,4897.31449,9836.841186,8034.707489,13248.062199,12403.510843,23803.37449,10905.7884,12680.117674,24210.389886
3,hsa.let.7b,15377.019845,34387.698695,54133.369009,14988.630767,49402.682161,15234.151107,8981.28173,37588.292821,73681.595915,...,12514.954736,1542.629784,8806.219729,13989.262271,11230.764427,14622.196769,10265.711191,8431.948143,1860.659154,20967.711434
4,hsa.let.7c,306.330986,10490.699334,1727.530141,1659.272408,3080.541241,1126.121575,848.324435,255.635651,3177.603335,...,339.503915,96.762834,502.810772,2268.16651,2037.979847,528.987186,10879.780431,451.620072,1644.875692,1120.047059


In [110]:
# Preview the first five rows of the gtex table.
dataframes['gtex_data'].head()

Unnamed: 0,ensemble_id,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
0,"""ENSG00000227232""",-0.169609,0.054703,0.009834,0.062832,-0.111598,0.013749,0.059378,-0.103492,-0.056906,...,-0.135461,0.031812,-0.033919,-0.083926,0.057796,-0.026684,0.07655,0.074503,-0.024322,0.008076
1,"""ENSG00000238009""",-0.268123,0.129532,-0.226318,-0.594374,-0.247985,0.44075,0.012203,0.023756,0.02301,...,-0.233336,-0.041431,0.150914,0.142757,0.181792,-0.067478,0.196848,0.086229,-0.274816,-0.030805
2,"""ENSG00000233750""",-0.168988,-0.183577,-0.164673,-0.587763,-0.170246,0.177095,0.092399,0.186742,0.050464,...,-0.647793,-0.179416,-0.039154,-0.157434,-0.072724,0.008101,0.326528,-0.103061,-0.018962,-0.09904
3,"""ENSG00000237683""",-0.015901,-0.114264,0.273116,-0.442626,-0.249558,0.040346,0.092957,0.060168,0.024587,...,-0.200647,-0.036321,0.093315,-0.097231,0.055875,-0.226099,-0.198103,-0.15189,-0.091329,-0.023483
4,"""ENSG00000268903""",0.033084,-0.136116,0.35769,-0.209781,-1.040812,0.290923,-0.224477,-0.395459,0.08035,...,-0.395148,0.01998,0.15177,0.050996,0.102515,-0.360847,-0.076868,0.020539,-0.0673,0.040908


In [111]:
# Preview the first five rows of the survival table.
dataframes['survival_data'].head()

Unnamed: 0,PatientID,Survival,isDead
0,TCGA-3N-A9WB-06,518.0,1.0
1,TCGA-3N-A9WC-06,2022.0,0.0
2,TCGA-3N-A9WD-06,395.0,1.0
3,TCGA-BF-A1PU-01,387.0,0.0
4,TCGA-BF-A1PV-01,14.0,0.0


In [112]:
# load clinical data

# The data has a problem in lines 38 and 39: they are series with 3 elements
# (total values: 105, unlike the total of 103 in other rows).
# These rows are excluded from the loaded table.
# NOTE(review): skiprows is 0-based, so [0, 38, 39] drops the header line plus
# the 39th and 40th physical lines of the file -- confirm these are the
# malformed rows referred to above.

# Read only the header line, then split it on tabs to get the column titles.
clinical_titles = pd.read_csv('melanoma.txt', nrows=1, header=None)
clinical_data_titles = clinical_titles.iloc[0, 0].split('\t')

clinical_data = pd.read_csv(
    'melanoma.txt',
    sep='\t',
    header=None,
    names=clinical_data_titles,
    skiprows=[0, 38, 39],
)

dataframes['clinical_data'] = clinical_data

In [113]:
# Preview the first five rows of the clinical table.
dataframes['clinical_data'].head()

Unnamed: 0,sampleID,_EVENT,_INTEGRATION,_OS,_OS_IND,_OS_UNIT,_PATIENT,_RFS,_RFS_IND,_RFS_UNIT,...,_GENOMIC_ID_data/public/TCGA/SKCM/miRNA_HiSeq_gene,_GENOMIC_ID_TCGA_SKCM_RPPA,_GENOMIC_ID_TCGA_SKCM_mutation_bcm_gene,_GENOMIC_ID_TCGA_SKCM_mutation_broad_gene,_GENOMIC_ID_TCGA_SKCM_gistic2,_GENOMIC_ID_TCGA_SKCM_mutation,_GENOMIC_ID_TCGA_SKCM_exp_HiSeqV2_exon,_GENOMIC_ID_TCGA_SKCM_PDMRNAseqCNV,_GENOMIC_ID_TCGA_SKCM_PDMRNAseq,_GENOMIC_ID_TCGA_SKCM_exp_HiSeqV2_percentile
0,TCGA-3N-A9WB-06,1.0,TCGA-3N-A9WB-06,518.0,1.0,days,TCGA-3N-A9WB,,,days,...,,,,TCGA-3N-A9WB-06A-11D-A38G-08,TCGA-3N-A9WB-06A-11D-A38F-01,,4c243ea9-dfe1-42f0-a887-3c901fb38542,,,4c243ea9-dfe1-42f0-a887-3c901fb38542
1,TCGA-3N-A9WC-06,0.0,TCGA-3N-A9WC-06,2022.0,0.0,days,TCGA-3N-A9WC,2022.0,0.0,days,...,TCGA-3N-A9WC-06,F9B0B31B-24AB-4F30-8EF8-4A4ED282EF04,,TCGA-3N-A9WC-06A-11D-A38G-08,TCGA-3N-A9WC-06A-11D-A38F-01,,a64ae1f5-a189-4173-be13-903bd7637869,,,a64ae1f5-a189-4173-be13-903bd7637869
2,TCGA-3N-A9WD-06,1.0,TCGA-3N-A9WD-06,395.0,1.0,days,TCGA-3N-A9WD,,,days,...,TCGA-3N-A9WD-06,6AFC3678-9499-49A6-AD7A-30F3D185B4BD,,TCGA-3N-A9WD-06A-11D-A38G-08,TCGA-3N-A9WD-06A-11D-A38F-01,,ac19f7cf-670b-4dcc-a26b-db0f56377231,,,ac19f7cf-670b-4dcc-a26b-db0f56377231
3,TCGA-BF-A1PU-01,0.0,TCGA-BF-A1PU-01,387.0,0.0,days,TCGA-BF-A1PU,,,days,...,TCGA-BF-A1PU-01,7E74CAFD-C47B-4F98-A349-BBD2AC699D2D,TCGA-BF-A1PU-01A-11D-A19A-08,TCGA-BF-A1PU-01A-11D-A19A-08,,TCGA-BF-A1PU-01A-11D-A19A-08,9ff4d588-da73-4e26-975d-2b098fe74984,,,9ff4d588-da73-4e26-975d-2b098fe74984
4,TCGA-BF-A1PV-01,0.0,TCGA-BF-A1PV-01,14.0,0.0,days,TCGA-BF-A1PV,14.0,0.0,days,...,TCGA-BF-A1PV-01,,TCGA-BF-A1PV-01A-11D-A19A-08,TCGA-BF-A1PV-01A-11D-A19A-08,,TCGA-BF-A1PV-01A-11D-A19A-08,a57841bb-fa68-4d84-85e0-2d5e4f3ce828,,,a57841bb-fa68-4d84-85e0-2d5e4f3ce828
