In [105]:
#basic data analysis packages
import numpy as np
import pandas as pd

#basic data visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [111]:
# load exp, mirna data

"""
creating dict containing 
- filename (of data provided for the project)
- values: [fist column name, dataset name, separator]
"""

data = {'exp':['gene', 'gene_data', ' '], 
        'mirna': ['micro_rna', 'micro_rna_data', ' '],
        'gtex_data': ['ensemble_id', 'gtex_data', '\t']}

dataframes = {}

"""
load_dataframe will add relevant data to dataframes
"""

def load_dataframe(filename, values):
    title_row = pd.read_csv('{}.csv'.format(filename), header=None, nrows=1) #read table column names as row
    names = title_row.iloc[0, 0].replace('"', '').split(values[2]) #create colum title list
    if filename == 'gtex_data':
        names[0] = values[0]
    else:
        names = [values[0]] + names
    dataframes[values[1]] = pd.read_csv('{}.csv'.format(filename), names=names, skiprows=1, sep=values[2]) #read data    

for x in data.keys():
    load_dataframe(x, data[x])
 


In [112]:
# removing rows from 'exp' where genes begin in "X.." (following discussion with Nimrod, they are irrelevant)
dataframes['gene_data'] = dataframes['gene_data'][~dataframes['gene_data'].gene.str.contains("X..")] 

In [113]:
# load survival data

"""
survival data needs to be loaded in a different way (based on data observation)
we will load it and add it to our dataframes dictionary (for consistency)

the column named "Death" has been renamed as "isDead" for clarity
"""
survival = pd.read_csv('survival.csv', names=['PatientID', 'Survival', 'isDead'], skiprows=1, sep='\t')

dataframes['survival_data'] = survival

In [114]:
def normalize_data(dataframe, column_name):
    # perform log2 (val+1)
    dataframe = dataframe.apply(lambda x: np.log2(x + 1) if x.name != column_name else x)
    # find mean and std per row
    mean_vals = dataframe.mean(axis=1)
    std_vals = dataframe.std(axis=1)
    # subtract means and divide by std
    dataframe.iloc[:, 1:] = dataframe.iloc[:, 1:].subtract(mean_vals, axis=0).divide(std_vals, axis=0)
    return dataframe

dataframes['gene_data'] = normalize_data(dataframes['gene_data'], 'gene')
dataframes['micro_rna_data'] = normalize_data(dataframes['micro_rna_data'], 'micro_rna')

In [148]:
# transpose gene data, micro rna data

#transpose_data = [dataframes['gene_data'], dataframes['micro_rna_data']]

a = dataframes['gene_data'].T # transpose
a.columns = a.iloc[0, :] # use first row as column names
a = a.iloc[1:, :]
a.columns
# a.rename(columns={" gene ": "patient"})



Index(['A1BG.1', 'A1CF.29974', 'A2BP1.54715', 'A2LD1.87769', 'A2ML1.144568',
       'A2M.2', 'A4GALT.53947', 'A4GNT.51146', 'AAA1.404744', 'AAAS.8086',
       ...
       'ZUFSP.221302', 'ZW10.9183', 'ZWILCH.55055', 'ZWINT.11130',
       'ZYG11A.440590', 'ZYG11B.79699', 'ZZEF1.23140', 'ZZZ3.26009',
       'psiTPTE22.387590', 'tAKR.389932'],
      dtype='object', name='gene', length=19363)

In [91]:
# sample gene data
dataframes['gene_data'].head()

Unnamed: 0,29,30,31,32,33,34,35,36,37,38,...,20517,20518,20519,20520,20524,20525,20527,20528,20529,20530
gene,A1BG.1,A1CF.29974,A2BP1.54715,A2LD1.87769,A2ML1.144568,A2M.2,A4GALT.53947,A4GNT.51146,AAA1.404744,AAAS.8086,...,ZUFSP.221302,ZW10.9183,ZWILCH.55055,ZWINT.11130,ZYG11A.440590,ZYG11B.79699,ZZEF1.23140,ZZZ3.26009,psiTPTE22.387590,tAKR.389932
TCGA.3N.A9WB.06,0.705057,-0.280646,-0.413046,1.75705,0.221882,-2.15541,-1.05048,-0.991263,-0.150562,0.1785,...,-0.146231,0.116896,0.28618,1.52303,0.973086,-0.253298,-2.23982,-0.0997414,-0.878129,-0.284935
TCGA.3N.A9WC.06,-0.271497,-0.280646,-0.413046,1.03622,-0.711574,1.85513,0.141679,0.669905,-0.150562,-0.649788,...,0.0501261,-0.973146,-0.0479013,-0.5587,-0.440471,-0.120323,-1.23257,0.702474,-0.521455,2.52572
TCGA.3N.A9WD.06,0.625527,1.58431,2.57894,0.218885,-0.711574,-0.203402,0.818677,-0.991263,-0.150562,-0.869765,...,0.0990477,0.575522,-1.3043,-0.60213,1.6199,0.284513,-0.58714,0.0140356,2.48323,-0.284935
TCGA.BF.A1PU.01,-0.418943,-0.280646,0.832644,1.06113,0.248948,-1.12785,0.451753,0.0603083,-0.150562,-0.0433683,...,0.0852827,-0.980146,-0.0663148,-0.091513,-0.987508,-1.01398,0.810425,-1.72589,-0.930842,-0.284935


In [70]:
# sample mirna data
dataframes['micro_rna_data'].head()

Unnamed: 0,micro_rna,TCGA.3N.A9WC.06,TCGA.3N.A9WD.06,TCGA.BF.A1PU.01,TCGA.BF.A1PV.01,TCGA.BF.A1PX.01,TCGA.BF.A1PZ.01,TCGA.BF.A1Q0.01,TCGA.BF.A3DJ.01,TCGA.BF.A3DL.01,...,TCGA.XV.AB01.06,TCGA.YD.A89C.06,TCGA.YD.A9TA.06,TCGA.YD.A9TB.06,TCGA.YG.AA3N.01,TCGA.YG.AA3O.06,TCGA.YG.AA3P.06,TCGA.Z2.A8RT.06,TCGA.Z2.AA3S.06,TCGA.Z2.AA3V.06
0,hsa.let.7a.1,0.2373,1.413978,1.265548,0.185478,0.094122,0.451664,-0.571239,0.519428,2.149726,...,-0.52808,-2.190976,-0.647503,-1.113845,-2.1e-05,-0.112561,1.255562,-0.427824,-0.161147,1.307661
1,hsa.let.7a.2,0.222888,1.409208,1.264947,0.179001,0.110415,0.445268,-0.566822,0.54578,2.146487,...,-0.53588,-2.168277,-0.645517,-1.087783,0.004459,-0.119957,1.251081,-0.437552,-0.1592,1.300205
2,hsa.let.7a.3,0.224394,1.418898,1.26564,0.156424,0.112991,0.443217,-0.583,0.563999,2.142916,...,-0.559339,-2.178939,-0.656756,-1.098432,-0.006949,-0.150725,1.272062,-0.431599,-0.102585,1.309069
3,hsa.let.7b,-0.225533,0.833381,1.430398,-0.259191,1.310078,-0.237814,-0.933001,0.950472,1.836045,...,-0.496495,-3.250218,-0.958898,-0.349975,-0.638938,-0.291756,-0.757145,-1.016035,-3.003734,0.182476
4,hsa.let.7c,-1.571869,2.179965,0.263552,0.220736,0.877972,-0.19087,-0.491599,-1.763443,0.910929,...,-1.462939,-2.789086,-1.046594,0.552762,0.43909,-0.992765,2.218663,-1.160461,0.211481,-0.196613


In [71]:
dataframes['gtex_data'].head()

Unnamed: 0,ensemble_id,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
0,"""ENSG00000227232""",-0.169609,0.054703,0.009834,0.062832,-0.111598,0.013749,0.059378,-0.103492,-0.056906,...,-0.135461,0.031812,-0.033919,-0.083926,0.057796,-0.026684,0.07655,0.074503,-0.024322,0.008076
1,"""ENSG00000238009""",-0.268123,0.129532,-0.226318,-0.594374,-0.247985,0.44075,0.012203,0.023756,0.02301,...,-0.233336,-0.041431,0.150914,0.142757,0.181792,-0.067478,0.196848,0.086229,-0.274816,-0.030805
2,"""ENSG00000233750""",-0.168988,-0.183577,-0.164673,-0.587763,-0.170246,0.177095,0.092399,0.186742,0.050464,...,-0.647793,-0.179416,-0.039154,-0.157434,-0.072724,0.008101,0.326528,-0.103061,-0.018962,-0.09904
3,"""ENSG00000237683""",-0.015901,-0.114264,0.273116,-0.442626,-0.249558,0.040346,0.092957,0.060168,0.024587,...,-0.200647,-0.036321,0.093315,-0.097231,0.055875,-0.226099,-0.198103,-0.15189,-0.091329,-0.023483
4,"""ENSG00000268903""",0.033084,-0.136116,0.35769,-0.209781,-1.040812,0.290923,-0.224477,-0.395459,0.08035,...,-0.395148,0.01998,0.15177,0.050996,0.102515,-0.360847,-0.076868,0.020539,-0.0673,0.040908


In [73]:
# sample survival data
dataframes['survival_data'].head()

Unnamed: 0,PatientID,Survival,isDead
0,TCGA-3N-A9WB-06,518.0,1.0
1,TCGA-3N-A9WC-06,2022.0,0.0
2,TCGA-3N-A9WD-06,395.0,1.0
3,TCGA-BF-A1PU-01,387.0,0.0
4,TCGA-BF-A1PV-01,14.0,0.0


In [74]:
# load clinical data data

"""
the data has a problem in line 38, 39:
they are series with 3 elements (total values: 105, unlike total 103 in other rows)
we discluded this data
"""

clinical_titles = pd.read_csv('melanoma.txt', header=None, nrows=1) 
clinical_data_titles = clinical_titles.iloc[0, 0].split('\t')

clinical_data = pd.read_csv('melanoma.txt', header=None, names=clinical_data_titles, skiprows=[0, 38,39], sep='\t') 

dataframes['clinical_data'] = clinical_data

In [75]:
#sample clinical data
dataframes['clinical_data'].head()

Unnamed: 0,sampleID,_EVENT,_INTEGRATION,_OS,_OS_IND,_OS_UNIT,_PATIENT,_RFS,_RFS_IND,_RFS_UNIT,...,_GENOMIC_ID_data/public/TCGA/SKCM/miRNA_HiSeq_gene,_GENOMIC_ID_TCGA_SKCM_RPPA,_GENOMIC_ID_TCGA_SKCM_mutation_bcm_gene,_GENOMIC_ID_TCGA_SKCM_mutation_broad_gene,_GENOMIC_ID_TCGA_SKCM_gistic2,_GENOMIC_ID_TCGA_SKCM_mutation,_GENOMIC_ID_TCGA_SKCM_exp_HiSeqV2_exon,_GENOMIC_ID_TCGA_SKCM_PDMRNAseqCNV,_GENOMIC_ID_TCGA_SKCM_PDMRNAseq,_GENOMIC_ID_TCGA_SKCM_exp_HiSeqV2_percentile
0,TCGA-3N-A9WB-06,1.0,TCGA-3N-A9WB-06,518.0,1.0,days,TCGA-3N-A9WB,,,days,...,,,,TCGA-3N-A9WB-06A-11D-A38G-08,TCGA-3N-A9WB-06A-11D-A38F-01,,4c243ea9-dfe1-42f0-a887-3c901fb38542,,,4c243ea9-dfe1-42f0-a887-3c901fb38542
1,TCGA-3N-A9WC-06,0.0,TCGA-3N-A9WC-06,2022.0,0.0,days,TCGA-3N-A9WC,2022.0,0.0,days,...,TCGA-3N-A9WC-06,F9B0B31B-24AB-4F30-8EF8-4A4ED282EF04,,TCGA-3N-A9WC-06A-11D-A38G-08,TCGA-3N-A9WC-06A-11D-A38F-01,,a64ae1f5-a189-4173-be13-903bd7637869,,,a64ae1f5-a189-4173-be13-903bd7637869
2,TCGA-3N-A9WD-06,1.0,TCGA-3N-A9WD-06,395.0,1.0,days,TCGA-3N-A9WD,,,days,...,TCGA-3N-A9WD-06,6AFC3678-9499-49A6-AD7A-30F3D185B4BD,,TCGA-3N-A9WD-06A-11D-A38G-08,TCGA-3N-A9WD-06A-11D-A38F-01,,ac19f7cf-670b-4dcc-a26b-db0f56377231,,,ac19f7cf-670b-4dcc-a26b-db0f56377231
3,TCGA-BF-A1PU-01,0.0,TCGA-BF-A1PU-01,387.0,0.0,days,TCGA-BF-A1PU,,,days,...,TCGA-BF-A1PU-01,7E74CAFD-C47B-4F98-A349-BBD2AC699D2D,TCGA-BF-A1PU-01A-11D-A19A-08,TCGA-BF-A1PU-01A-11D-A19A-08,,TCGA-BF-A1PU-01A-11D-A19A-08,9ff4d588-da73-4e26-975d-2b098fe74984,,,9ff4d588-da73-4e26-975d-2b098fe74984
4,TCGA-BF-A1PV-01,0.0,TCGA-BF-A1PV-01,14.0,0.0,days,TCGA-BF-A1PV,14.0,0.0,days,...,TCGA-BF-A1PV-01,,TCGA-BF-A1PV-01A-11D-A19A-08,TCGA-BF-A1PV-01A-11D-A19A-08,,TCGA-BF-A1PV-01A-11D-A19A-08,a57841bb-fa68-4d84-85e0-2d5e4f3ce828,,,a57841bb-fa68-4d84-85e0-2d5e4f3ce828


In [None]:
#Option 1: cluster all features

In [82]:
# dealing with nulls
#dataframes['micro_rna_data']['TCGA.BF.A1PV.01'].isnull

dataframes['micro_rna_data'].loc[dataframes['micro_rna_data']['TCGA.BF.A1PV.01'].isnull()].shape

(145, 453)

In [77]:
# spectral clustering
from sklearn.cluster import SpectralClustering

X = dataframes['micro_rna_data'].iloc[:, 1:]
clustering = SpectralClustering(n_clusters=2, assign_labels="discretize", random_state=0).fit(X)
clustering

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').