# PROCESSING

In [1]:
import pandas as pd
import numpy as np

In [63]:
gene_df = pd.read_csv('data/gene_list.txt',header=None)
gene_list = gene_df[0].tolist()

In [64]:
clin_patient = pd.read_csv('data/data_clinical_patient.txt', sep='\t', comment='#')
clin_sample = pd.read_csv('data/data_clinical_sample.txt', sep='\t', comment='#')
mut = pd.read_csv('data/data_mutations.txt', sep='\t', comment='#')
cna = pd.read_csv('data/data_cna.txt', sep='\t')
rna = pd.read_csv('data/data_mrna_illumina_microarray_zscores_ref_diploid_samples.txt', sep='\t', comment='#')
meth = pd.read_csv('data/data_methylation_promoters_rrbs.txt', sep='\t')

## mut 

In [65]:
print('number of genes: ',len(mut.Hugo_Symbol.unique()))
print('number of patients: ',len(mut.Tumor_Sample_Barcode.unique()))

number of genes:  173
number of patients:  2369


Select mutations involving genes that are present in the breast cancer gene list

In [66]:
mut_filter = mut[['Hugo_Symbol','Tumor_Sample_Barcode']].copy()
mut_filter = mut_filter[mut_filter['Hugo_Symbol'].isin(gene_list)]
mut_filter = mut_filter.drop_duplicates()
print('Number of mutated genes that are present in breast cancer gene list: ',len(mut_filter.Hugo_Symbol.unique()))
print('Number of patients with small mutations in the selected genes: ',len(mut_filter.Tumor_Sample_Barcode.unique()))
mut_filter

Number of mutated genes that are present in breast cancer gene list:  28
Number of patients with small mutations in the selected genes:  2135


Unnamed: 0,Hugo_Symbol,Tumor_Sample_Barcode
0,TP53,MTS-T0058
10,TP53,MTS-T0059
11,ERBB2,MTS-T0059
14,EP300,MTS-T0059
23,PIK3CA,MTS-T0061
...,...,...
17257,PIK3CA,MB-0899
17261,TP53,MB-0901
17265,PIK3CA,MB-0904
17267,PIK3CA,MB-0906


From the filtered table, create a new binary-valued matrix with patients as rows and genes as columns. Entries are 0 if no mutation is present in the selected gene for the corresponding patient, and 1 if mutations are present. No difference is reported if more than one mutation is present in the same gene.

In [67]:
mut_matrix = pd.crosstab(mut_filter['Tumor_Sample_Barcode'], mut_filter['Hugo_Symbol'])
#mut_matrix = pd.pivot(mut_filter,index='Tumor_Sample_Barcode',values='Tumor_Sample_Barcode',columns='Hugo_Symbol').notnull()
mut_matrix.index.name = None
mut_matrix.columns.name = None
mut_matrix

Unnamed: 0,AKT1,ARID1A,ARID1B,BAP1,BRCA1,BRCA2,BRIP1,CASP8,CDH1,CDKN1B,...,NCOR1,NOTCH1,PBRM1,PIK3CA,PTEN,RB1,SMARCD1,STK11,TBX3,TP53
MB-0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
MB-0005,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
MB-0006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
MB-0008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
MB-0010,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTS-T2428,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
MTS-T2429,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
MTS-T2430,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
MTS-T2431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


Check presence of missing data, change name to columns and add 'mut' column

In [68]:
# change column names
mut_matrix.columns = 'mut_' + mut_matrix.columns
# add 'mut' column indicating which patients have small mutations in the selected genes
mut_matrix['mut'] = True
mut_matrix

Unnamed: 0,mut_AKT1,mut_ARID1A,mut_ARID1B,mut_BAP1,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_CASP8,mut_CDH1,mut_CDKN1B,...,mut_NOTCH1,mut_PBRM1,mut_PIK3CA,mut_PTEN,mut_RB1,mut_SMARCD1,mut_STK11,mut_TBX3,mut_TP53,mut
MB-0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,True
MB-0005,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,True
MB-0006,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,True
MB-0008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,True
MB-0010,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTS-T2428,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,True
MTS-T2429,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,True
MTS-T2430,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,True
MTS-T2431,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,True


## cna

In [69]:
print('number of genes: ',len(cna.Hugo_Symbol.unique()))
print('number of patients: ',len(cna.columns)-2)

number of genes:  22542
number of patients:  2173


Filtering cna table according to genes present in breast cancer genes

In [70]:
# delete 'Entrez_Gene_Id' column
cna_filter = cna.drop('Entrez_Gene_Id', axis=1).copy()
# filter rows corresponding to selected genes 
cna_filter = cna_filter[cna_filter['Hugo_Symbol'].isin(gene_list)]
print('Number of genes with copy numebr aberrations that are present in breast cancer gene list: ',len(cna_filter.Hugo_Symbol.unique()))
print('Number of patients with copy number aberrations in the selected genes: ',len(cna_filter.columns.unique())-1)

Number of genes with copy numebr aberrations that are present in breast cancer gene list:  63
Number of patients with copy number aberrations in the selected genes:  2173


Create matrix with patients as rows and genes as columns and check presence of missing data

In [71]:
# transpose index and columns
cna_filter = cna_filter.T
cna_filter.columns = cna_filter.iloc[0]
cna_filter = cna_filter.drop('Hugo_Symbol',axis=0)
cna_filter.columns.name = None
cna_filter = cna_filter.astype(int)
# check presence of missing data
nan = cna_filter.isnull().values.any()
print('Presence of missing data: ',nan)

cna_filter

Presence of missing data:  False


Unnamed: 0,AKT1,AR,ARID1A,ARID1B,ASPM,ATM,ATRIP,BAP1,BARD1,BRCA1,...,RAD54L,RB1,RRAS2,SALL4,SMARCD1,STK11,TBX3,TP53,XRCC2,ZMYM3
MB-0000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MB-0039,0,0,0,-1,0,-1,0,0,0,0,...,0,-1,0,0,0,0,0,0,-1,0
MB-0045,1,0,-1,-1,0,-1,-1,-1,0,0,...,-1,-1,0,0,-1,0,0,-1,0,0
MB-0046,0,-1,0,0,2,0,0,0,0,-1,...,0,-1,0,1,0,-1,1,-1,0,-1
MB-0048,0,1,0,0,1,-1,0,0,0,-1,...,0,0,0,0,0,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-6020,0,0,0,0,1,-1,1,1,0,-1,...,0,0,0,1,0,0,0,-1,0,0
MB-6213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MB-6230,-1,0,0,-1,1,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
MB-7148,0,0,0,0,1,0,-1,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


Create new matrix with patients as rows and information about deletions and duplications stored in two different columns for each gene. Add 'cna' column.

In [72]:
cna_matrix = pd.DataFrame()
for i in cna_filter.columns:
    
    label_dup = 'cna_dup_'+i
    cna_dup = pd.DataFrame(np.where(cna_filter[i]>0,1,0))
    cna_dup.columns = [label_dup]

    label_del = 'cna_del_'+i
    cna_del = pd.DataFrame(np.where(cna_filter[i]<0,1,0))
    cna_del.columns = [label_del]
    
    cna_matrix = pd.concat([cna_matrix,cna_dup,cna_del],axis=1)
cna_matrix.index = cna_filter.index

# add 'cna' column indicating which patients have copy number variations in the selected genes
cna_matrix['cna'] = True
cna_matrix

  cna_matrix['cna'] = True


Unnamed: 0,cna_dup_AKT1,cna_del_AKT1,cna_dup_AR,cna_del_AR,cna_dup_ARID1A,cna_del_ARID1A,cna_dup_ARID1B,cna_del_ARID1B,cna_dup_ASPM,cna_del_ASPM,...,cna_del_STK11,cna_dup_TBX3,cna_del_TBX3,cna_dup_TP53,cna_del_TP53,cna_dup_XRCC2,cna_del_XRCC2,cna_dup_ZMYM3,cna_del_ZMYM3,cna
MB-0000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
MB-0039,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,True
MB-0045,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,True
MB-0046,0,0,0,1,0,0,0,0,1,0,...,1,1,0,0,1,0,0,0,1,True
MB-0048,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,1,0,1,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-6020,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,True
MB-6213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
MB-6230,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,True
MB-7148,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,True


## rna

In [73]:
print('number of genes: ',len(rna.Hugo_Symbol.unique()))
print('number of patients: ',len(rna.columns.unique())-2)

number of genes:  20387
number of patients:  1980


Filter rna table according to brast cancer gene list

In [74]:
# delete 'Entrez_Gene_Id' column
rna_filter = rna.drop('Entrez_Gene_Id', axis=1).copy()
# filter rows corresponding to selected genes 
rna_filter = rna_filter[rna_filter['Hugo_Symbol'].isin(gene_list)]
print('Number of breast cancer genes with rna expression data: ',len(rna_filter.Hugo_Symbol.unique()))
print('Number of patients with rna expression data: ',len(rna_filter.columns.unique())-1)

Number of breast cancer genes with rna expression data:  63
Number of patients with rna expression data:  1980


Verify presence of dupicated genes with different espression values and remove them. Check presence of missing data.

In [75]:
# verify presence of duplicated genes
rna_filter = rna_filter.drop_duplicates()
dupl_gene_list = []
for i in rna_filter['Hugo_Symbol'].unique():
    rna_single_gene = rna_filter.query("Hugo_Symbol=='"+i+"'")
    if (len(rna_single_gene)>1) and (len(rna_single_gene.drop_duplicates())>1):
        dupl_gene_list.append(i)
print('Genes with different expression values for each patients: ',dupl_gene_list)
# delete genes 
rna_filter.index = rna_filter.Hugo_Symbol
if len(dupl_gene_list) > 0:
    for i in dupl_gene_list:
        rna_filter = rna_filter.drop(i,axis=0)
        
# check presence of missing data
nan = rna_filter.isnull().values.any()
print('Presence of missing data: ',nan)

Genes with different expression values for each patients:  ['ARID1B']
Presence of missing data:  False


Create new matrix with patients as rows and genes as columns. Add 'rna' column.

In [76]:
rna_matrix = rna_filter.T.copy()
rna_matrix.columns = rna_matrix.iloc[0]
rna_matrix.columns.name = None
rna_matrix = rna_matrix.drop('Hugo_Symbol')
rna_matrix.columns = 'rna_' + rna_matrix.columns
rna_matrix

Unnamed: 0,rna_NCOR1,rna_HGF,rna_FOXA1,rna_MSH6,rna_CDKN1B,rna_CDH1,rna_FADD,rna_RAD50,rna_BRCA2,rna_IRS4,...,rna_ARID1A,rna_RB1,rna_MSH2,rna_RRAS2,rna_FBLN2,rna_ATM,rna_IKZF3,rna_TBX3,rna_CHEK2,rna_EP300
MB-0362,1.0688,-0.615,0.6157,-1.1732,-0.9085,0.2545,2.0512,0.8194,0.656,-0.6345,...,0.9674,0.0136,-0.7184,-0.8378,0.2111,-1.8378,-0.6141,-0.6923,-0.4167,0.6118
MB-0346,0.7793,0.0869,0.0284,-0.8897,-1.6337,1.797,0.3542,0.4059,1.234,-0.5464,...,-0.4781,-0.5372,-1.1661,0.0306,-1.3844,-2.8885,1.2891,-0.1955,1.8244,0.7471
MB-0386,0.1734,2.4248,0.3589,-2.4632,-1.063,-0.3024,-0.4019,0.2079,-0.6626,1.0924,...,0.2096,0.6206,-1.3387,-0.1813,1.1368,-1.2912,0.1893,0.2539,-0.7337,0.5146
MB-0574,0.7201,-0.6548,0.2738,-2.1186,-1.4174,1.513,1.7116,1.3754,-1.0709,-1.9681,...,-0.7116,1.8941,-0.7967,-0.6906,-0.2951,-1.2406,-0.7358,0.7687,0.7877,0.5379
MB-0185,-0.2742,1.684,0.4365,0.1175,-0.7677,0.4657,0.4333,-1.1464,-0.0708,-0.218,...,0.0249,0.3192,-0.546,1.3484,-1.4457,-0.7969,1.1604,0.6501,4.2078,0.7796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-5453,0.8373,-2.453,-0.3096,0.5525,1.4565,-0.1619,0.2785,0.8496,-0.5676,-0.34,...,0.8886,-0.0685,0.5833,0.3933,-1.3792,1.6544,-1.188,0.6562,-0.5058,-0.5387
MB-5471,-0.406,-1.2909,0.3128,0.5762,0.6839,0.5722,-0.7621,-0.4317,0.7238,0.44,...,0.0488,0.1549,-0.3421,-0.544,0.5705,0.7298,-0.5398,-0.0435,2.5491,1.2823
MB-5127,1.7295,0.9662,0.5625,0.8419,2.1601,0.7309,1.6882,-0.0815,0.3955,-0.5292,...,-0.5412,0.734,0.9264,-1.9107,-1.6665,0.4067,-0.1244,-0.8301,0.2351,-0.9663
MB-4313,-2.9275,-0.5509,-0.7498,-0.6392,-0.8716,-0.8985,-1.9611,-3.0446,0.4947,3.9558,...,-5.0725,-1.0741,-1.3687,-1.3555,-0.2023,-1.1483,0.3596,0.0385,-2.3226,-4.377


In [77]:
# scaling to [0-1]
rna_matrix = (rna_matrix-rna_matrix.min().min())/(rna_matrix.max().max()-rna_matrix.min().min())
# add 'rna' column indicating which patients have rna expression data
rna_matrix['rna'] = True
rna_matrix

Unnamed: 0,rna_NCOR1,rna_HGF,rna_FOXA1,rna_MSH6,rna_CDKN1B,rna_CDH1,rna_FADD,rna_RAD50,rna_BRCA2,rna_IRS4,...,rna_RB1,rna_MSH2,rna_RRAS2,rna_FBLN2,rna_ATM,rna_IKZF3,rna_TBX3,rna_CHEK2,rna_EP300,rna
MB-0362,0.493942,0.393584,0.466936,0.360315,0.376091,0.445408,0.552494,0.479077,0.469338,0.392422,...,0.43105,0.387422,0.380305,0.442821,0.320704,0.393638,0.388977,0.405403,0.466704,True
MB-0346,0.476687,0.435419,0.431932,0.377212,0.332868,0.537343,0.45135,0.454432,0.503788,0.397673,...,0.398221,0.360738,0.432063,0.347727,0.25808,0.507072,0.418587,0.538976,0.474768,True
MB-0386,0.440574,0.574761,0.45163,0.283429,0.366883,0.412216,0.406286,0.442631,0.390747,0.495348,...,0.467228,0.350451,0.419434,0.497994,0.353282,0.441522,0.445372,0.38651,0.46091,True
MB-0574,0.473158,0.391212,0.446558,0.303968,0.34576,0.520416,0.532253,0.512215,0.366412,0.312938,...,0.543131,0.382755,0.389079,0.412651,0.356298,0.386385,0.476055,0.477188,0.462299,True
MB-0185,0.413897,0.530608,0.456255,0.437243,0.384483,0.457996,0.456065,0.361912,0.42602,0.417246,...,0.449264,0.397697,0.510606,0.344074,0.382743,0.499401,0.468986,0.681031,0.476705,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-5453,0.480144,0.284037,0.411787,0.463169,0.517049,0.42059,0.446838,0.480877,0.39641,0.409975,...,0.426157,0.465005,0.453681,0.348037,0.528844,0.359433,0.46935,0.400093,0.398132,True
MB-5471,0.406041,0.3533,0.448883,0.464582,0.471001,0.464343,0.384817,0.404509,0.473379,0.456464,...,0.439472,0.40985,0.397816,0.464242,0.473737,0.398067,0.427647,0.58217,0.506666,True
MB-5127,0.53332,0.487826,0.463765,0.480418,0.558985,0.473802,0.530859,0.425382,0.453812,0.398698,...,0.473987,0.485454,0.316359,0.330914,0.454479,0.422825,0.380764,0.444252,0.372646,True
MB-4313,0.255756,0.397405,0.38555,0.392142,0.378291,0.376687,0.313355,0.248777,0.459724,0.666011,...,0.366221,0.348663,0.34945,0.418182,0.361799,0.451672,0.432534,0.291809,0.169364,True


In [78]:
rna_matrix.max().max()
#rna_matrix.min().min()

1.0

## meth 

In [79]:
print('number of genes: ',len(meth.Hugo_Symbol.unique()))
print('number of patients: ',len(meth.columns.unique())-2)

number of genes:  13184
number of patients:  1417


Filter methylation data using the breast cancer gene list.

In [80]:
meth_filter = meth[meth['Hugo_Symbol'].isin(gene_list)].copy()
meth_filter = meth_filter.T.copy()
meth_filter.columns = meth_filter.iloc[0]
meth_filter.columns.name = None
meth_filter = meth_filter.drop('Hugo_Symbol')
meth_filter.columns = 'meth_' + meth_filter.columns
print('Number of breast cancer genes with methylation data: ',len(meth_filter.columns.unique()))
print('Number of patients with methylation data: ',len(meth_filter.index.unique()))

Number of breast cancer genes with methylation data:  54
Number of patients with methylation data:  1418


Check the presence of missing data and perform imputation by substituting missing data of a selected gene with the mean value over patients of the other data corresponding to the same gene. 

In [81]:
# check presence of missing data
nan = meth_filter.isnull().values.any()
print('Presence of missing data: ',nan)
nan_tot = meth_filter.isnull().sum().sum()
perc = nan_tot*100/(len(meth_filter.columns)*len(meth_filter))
print('Percentage of nan over the total amount of data: ',perc)

Presence of missing data:  True
Percentage of nan over the total amount of data:  4.347542182521026


In [82]:
# imputation with mean value over patients
meth_matrix = meth_filter.copy()
for i in meth_matrix.columns:
    if meth_matrix[i].isnull().values.any() == False:
        continue
    mean = meth_matrix[i][meth_matrix[i].notna()].mean()
    meth_matrix[i][meth_matrix[i].isna()] = mean
nan = meth_matrix.isnull().values.any()
print('Presence of missing data after imputation: ',nan)

Presence of missing data after imputation:  False


In [83]:
# scaling to [0-1]
meth_matrix = (meth_matrix-meth_matrix.min().min())/(meth_matrix.max().max()-meth_matrix.min().min())
# add 'meth' column indicating which patients have methylation data
meth_matrix['meth'] = True
meth_matrix

Unnamed: 0,meth_AKT1,meth_AR,meth_ARID1A,meth_ARID1B,meth_ASPM,meth_ATM,meth_ATRIP,meth_BAP1,meth_BARD1,meth_BRCA1,...,meth_RB1,meth_RRAS2,meth_SALL4,meth_SMARCD1,meth_STK11,meth_TBX3,meth_TP53,meth_XRCC2,meth_ZMYM3,meth
MB-0006,0.012346,0.345133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014706,...,0.002094,0.007246,0.374549,0.0,0.0,0.079208,0.0,0.00365,0.348101,True
MB-0028,0.042254,0.095238,0.008048,0.002353,0.011111,0.0,0.005814,0.018692,0.002066,0.0,...,0.005792,0.0,0.737374,0.0,0.0,0.058824,0.0,0.007143,0.145455,True
MB-0035,0.005587,0.15,0.0,0.001468,0.0,0.0,0.003279,0.015933,0.0,0.047583,...,0.0,0.0,0.875949,0.016287,0.0,0.0,0.0,0.0,0.507246,True
MB-0046,0.0,0.214286,0.0,0.00108,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.379421,0.0,0.0,0.0,0.0,0.004902,0.239583,True
MB-0050,0.032615,0.263158,0.00141,0.0,0.004098,0.0,0.002632,0.0,0.0,0.047583,...,0.001374,0.005063,0.572327,0.0,0.0,0.0,0.0,0.0,0.098446,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-7289,0.025,0.314815,0.001196,0.003783,0.007067,0.0,0.001653,0.0,0.002601,0.063291,...,0.0,0.0,0.695364,0.005848,0.0,0.0,0.0,0.00607,0.242915,True
MB-7291,0.0,0.30303,0.001185,0.0,0.0,0.0,0.001435,0.015933,0.002861,0.0,...,0.001116,0.0,0.579268,0.002268,0.0,0.028571,0.0,0.001408,0.201754,True
MB-7292,0.032615,0.291393,0.006309,0.007782,0.0,0.0,0.005076,0.015933,0.003623,0.02439,...,0.0,0.02509,0.568627,0.0,0.00641,0.0,0.004141,0.00995,0.21374,True
MB-7293,0.017857,0.291393,0.001594,0.0,0.0,0.009901,0.0,0.0,0.001592,0.047583,...,0.0,0.0,0.468599,0.0,0.0,0.0,0.0,0.002878,0.39196,True


In [84]:
meth_matrix.max().max()

1.0

## Merge all genomic data

In [85]:
matrix = mut_matrix.merge(cna_matrix, left_index=True, right_index=True, how='outer')
matrix = matrix.merge(rna_matrix, left_index=True, right_index=True, how='outer')
matrix = matrix.merge(meth_matrix, left_index=True, right_index=True, how='outer')

matrix.loc[matrix['mut'] != True, 'mut'] = False
matrix.loc[matrix['cna'] != True, 'cna'] = False
matrix.loc[matrix['rna'] != True, 'rna'] = False
matrix.loc[matrix['meth'] != True, 'meth'] = False

# change order of the columns
cols = matrix.columns.tolist()
cols_type = ['mut','cna','rna','meth']
for i in cols_type:
    cols.remove(i)
cols = cols+cols_type

matrix = matrix[cols]
matrix

Unnamed: 0,mut_AKT1,mut_ARID1A,mut_ARID1B,mut_BAP1,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_CASP8,mut_CDH1,mut_CDKN1B,...,meth_SMARCD1,meth_STK11,meth_TBX3,meth_TP53,meth_XRCC2,meth_ZMYM3,mut,cna,rna,meth
MB-0000,,,,,,,,,,,...,,,,,,,False,True,True,False
MB-0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,True,True,False
MB-0005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,True,True,False
MB-0006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.079208,0.0,0.00365,0.348101,True,True,True,True
MB-0008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTS-T2428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,False,False,False
MTS-T2429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,True,False,False,False
MTS-T2430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,False,False,False
MTS-T2431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,True,False,False,False


Add label indicating the tumor classification

In [89]:
matrix_clin_sample = clin_sample[['TMB_NONSYNONYMOUS','GRADE','ER_STATUS','HER2_STATUS','PR_STATUS']].copy()
matrix_clin_sample.index = clin_sample.PATIENT_ID
matrix = matrix.merge(matrix_clin_sample, left_index=True, right_index=True, how='inner')

matrix_clin_patient = clin_patient[['COHORT','ER_IHC','HER2_SNP6','INTCLUST','CLAUDIN_SUBTYPE','THREEGENE','HISTOLOGICAL_SUBTYPE']].copy()
matrix_clin_patient.index = clin_patient.PATIENT_ID
matrix = matrix.merge(matrix_clin_patient, left_index=True, right_index=True, how='inner')

matrix

Unnamed: 0,mut_AKT1,mut_ARID1A,mut_ARID1B,mut_BAP1,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_CASP8,mut_CDH1,mut_CDKN1B,...,ER_STATUS,HER2_STATUS,PR_STATUS,COHORT,ER_IHC,HER2_SNP6,INTCLUST,CLAUDIN_SUBTYPE,THREEGENE,HISTOLOGICAL_SUBTYPE
MB-0000,,,,,,,,,,,...,Positive,Negative,Negative,1.0,Positve,NEUTRAL,4ER+,claudin-low,ER-/HER2-,Ductal/NST
MB-0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Positive,Negative,Positive,1.0,Positve,NEUTRAL,4ER+,LumA,ER+/HER2- High Prolif,Ductal/NST
MB-0005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Positive,Negative,Positive,1.0,Positve,NEUTRAL,3,LumB,,Ductal/NST
MB-0006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Positive,Negative,Positive,1.0,Positve,NEUTRAL,9,LumB,,Mixed
MB-0008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Positive,Negative,Positive,1.0,Positve,NEUTRAL,9,LumB,ER+/HER2- High Prolif,Mixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTS-T2428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Positive,,,1.0,Positve,,,,,
MTS-T2429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,Positive,,,1.0,Positve,,,,,
MTS-T2430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
MTS-T2431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


Save in .txt file

In [90]:
# add column with patient id
matrix['PATIENT_ID'] = matrix.index
matrix.to_csv('data/data_processed.txt', sep='\t', index=False)
#pd.read_csv('data/data_final.txt', sep='\t')

  matrix['PATIENT_ID'] = matrix.index


In [None]:
count=0
for i in cna.Hugo_Symbol.unique():
    c = cna[cna['Hugo_Symbol']==i].Hugo_Symbol
    if len(c) != 1:
        print(cna[cna['Hugo_Symbol']==i])
        count+=1
count
        #lista_duplicati.append(i)
#len(lista_duplicati)


list_rna_duplicates = []
for i in rna.Hugo_Symbol.unique():
    c = rna[rna['Hugo_Symbol']==i].Hugo_Symbol
    if len(c) != 1:
        list_rna_duplicates.append(i)
len(list_rna_duplicates)


**TMB_NONSYNONYMOUS**
total number of somatic/acquired mutations per coding area of a tumor genome (Mut/Mb)

**COHORT**
Cohort is a group of subjects who share a defining characteristic (It takes a value from 1 to 5)

**GRADE**
Determined by pathology by looking the nature of the cells, do they look aggressive or not (It takes a value from 1 to 3)

**ER_STATUS**
Cancer cells are positive or negative for estrogen receptors

**ER_IHC**
To assess if estrogen receptors are expressed on cancer cells by using immune-histochemistry (a dye used in pathology that targets specific antigen, if it is there, it will give a color, it is not there, the tissue on the slide will be colored) (positive/negative)

**HER2_STATUS**
Whether the cancer is positive or negative for HER2

**HER2_SNP6**
To assess if the cancer positive for HER2 or not by using advance molecular techniques (Type of next generation sequencing)

**PR_STATUS**
Cancer cells are positive or negative for progesterone receptors

**INTCLUST**
Molecular subtype of the cancer based on some gene expression (It takes a value from '4ER+', '3', '9', '7', '4ER-', '5', '8', '10', '1', '2', '6')

**CLAUDIN_SUBTYPE**
Tumor profiling test that helps show whether some estrogen receptor-positive (ER-positive), HER2-negative breast cancers are likely to metastasize (when breast cancer spreads to other organs). The claudin-low breast cancer subtype is defined by gene expression characteristics, most prominently: Low expression of cell–cell adhesion genes, high expression of epithelial–mesenchymal transition (EMT) genes, and stem cell-like/less differentiated gene expression patterns

**THREEGENE**
Three Gene classifier subtype It takes a value from 'ER-/HER2-', 'ER+/HER2- High Prolif', nan, 'ER+/HER2- Low Prolif','HER2+'

**HISTOLOGICAL_SUBTYPE**
Type of the cancer based on microscopic examination of the cancer tissue (It takes a value of 'Ductal/NST', 'Mixed', 'Lobular', 'Tubular/ cribriform', 'Mucinous', 'Medullary', 'Other', 'Metaplastic' )