In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif

In this study, we aim to replicate the work of the paper https://doi.org/10.1016/j.ygeno.2020.06.035, which analysed an mRNA expression dataset to discover a validated set of genes that are differentially expressed in prostate cancer (PCa) cases with spread to adjacent lymph nodes (N1) compared with the local type (N0).

In [2]:
# After downloading the data, point `file_path` at your own copy.
# NOTE: this is an absolute path on the original author's machine and will not exist elsewhere.
file_path = '/home/DAVIDSON/jiqian/Workspace/IS/data2/data_curated_trans.csv'
# Read the whole CSV into a DataFrame with pandas' default parsing options
data = pd.read_csv(file_path)

A TCGA dataset of Prostate Adenocarcinoma (TCGA, PanCancer Atlas) was provided by the paper at https://data.mendeley.com/datasets/fdb8f5hjyd/1. According to the paper, it was retrieved from cBioPortal; however, we were unable to find the same source of data.

We inferred that RNA expression values had been standardized against the gene's expression distribution in a reference population and had been reported as log2 values.

CNA data has been reported as +2, +1, 0, −1, or −2. 


In [3]:
# Display the freshly loaded DataFrame as the cell output
data

Unnamed: 0,PATIENT_ID,PATH_N_STAGE,NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT,PATH_T_STAGE,PRIOR_DX,RADIATION_THERAPY,AGE,M_UBE2Q2P2,M_HMGB1P1,M_LOC155060,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,TCGA-2A-A8VL,2.0,2.0,1.0,2.0,2.0,51.0,-0.0361,,0.3014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-2A-A8VO,3.0,2.0,2.0,2.0,2.0,57.0,-0.3004,,0.6841,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-2A-A8VT,1.0,2.0,2.0,2.0,1.0,47.0,3.1758,,2.8191,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,TCGA-2A-A8VV,2.0,2.0,1.0,2.0,2.0,52.0,-0.1102,,-0.1719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-2A-A8VX,2.0,2.0,2.0,2.0,2.0,70.0,-0.7052,,0.3580,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,TCGA-ZG-A9M4,1.0,2.0,2.0,2.0,2.0,65.0,0.6768,,10.6747,...,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0
485,TCGA-ZG-A9MC,1.0,2.0,2.0,2.0,1.0,69.0,-0.2252,,-0.0288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486,TCGA-ZG-A9N3,1.0,2.0,2.0,2.0,2.0,73.0,-0.8350,,-0.5767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,TCGA-ZG-A9ND,2.0,2.0,2.0,2.0,2.0,55.0,0.4274,,-0.0456,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [6]:
# Remove the identifier and clinical columns that are irrelevant to this analysis
data = data.drop(
    columns=[
        'PATIENT_ID',
        'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT',
        'PATH_T_STAGE',
        'PRIOR_DX',
        'RADIATION_THERAPY',
        'AGE',
    ]
)

In [7]:
# Display the DataFrame after the column removal
data

Unnamed: 0,PATH_N_STAGE,M_UBE2Q2P2,M_HMGB1P1,M_LOC155060,M_RNU12-2P,M_SSX9,M_CXORF67,M_EFCAB8,M_SRP14P1,M_LOC391343,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,2.0,-0.0361,,0.3014,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,-0.1102,,-0.1719,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,-0.7052,,0.3580,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,1.4828,,-0.0582,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,-0.1719,,0.1259,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,1.0,-0.5763,,-0.5543,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
482,1.0,0.5183,,-0.0826,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
484,1.0,0.6768,,10.6747,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0
485,1.0,-0.2252,,-0.0288,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a table with mRNA expression of different genes (features) and stage of cancer (labels)

We know that the samples had been assigned as either N1 or N0 groups. The N1 group included the samples from the patients with PCa 
with the involvement of lymph nodes whereas N0 group included the samples 
from the patients with PCa without the involvement of any lymph nodes. 
The NA samples were removed from the study.

In [8]:
# Count how many samples carry each PATH_N_STAGE code
data['PATH_N_STAGE'].value_counts()

2.0    340
1.0     77
Name: PATH_N_STAGE, dtype: int64

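According to the paper, there should be 347 N0 patients and 77 N1 patients. Therefore, we conclude that the code 1.0 (77 samples) corresponds to the N1 group and the code 2.0 (340 samples here, versus 347 in the paper) corresponds to the N0 group.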

In [9]:
# PATH_N_STAGE code 2.0 is treated as the N0 group and code 1.0 as the N1 group
N0 = data.loc[data['PATH_N_STAGE'].eq(2.0)]
N1 = data.loc[data['PATH_N_STAGE'].eq(1.0)]
# Stack the N0 rows first, then the N1 rows; rows with any other code are left out.
# Note: concat keeps the original row labels, so the index is no longer in sorted order.
data = pd.concat([N0, N1])
data

Unnamed: 0,PATH_N_STAGE,M_UBE2Q2P2,M_HMGB1P1,M_LOC155060,M_RNU12-2P,M_SSX9,M_CXORF67,M_EFCAB8,M_SRP14P1,M_LOC391343,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,2.0,-0.0361,,0.3014,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,-0.1102,,-0.1719,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,-0.7052,,0.3580,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,1.4828,,-0.0582,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,-0.1719,,0.1259,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,1.0,-0.5763,,-0.5543,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
482,1.0,0.5183,,-0.0826,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
484,1.0,0.6768,,10.6747,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0
485,1.0,-0.2252,,-0.0288,,-0.0491,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We wish to select the mRNA expression data, which are the columns whose names start with "M_".

In [10]:
# Keep the stage labels aside so they can be re-attached after the column selection
PATH_N_STAGE = data['PATH_N_STAGE']
# Select the columns whose name STARTS WITH 'M_' (mRNA expression).
# `filter(like='M_')` matches 'M_' anywhere in the name, so it could also keep
# non-mRNA columns; the anchored regex enforces a true prefix match.
data = data.filter(regex=r'^M_')
# Put the labels back as the first column, followed by the mRNA features
data.insert(loc=0, column='PATH_N_STAGE', value=PATH_N_STAGE)
data

Unnamed: 0,PATH_N_STAGE,M_UBE2Q2P2,M_HMGB1P1,M_LOC155060,M_RNU12-2P,M_SSX9,M_CXORF67,M_EFCAB8,M_SRP14P1,M_LOC391343,...,M_ZWILCH,M_ZWINT,M_ZXDA,M_ZXDB,M_ZXDC,M_ZYG11A,M_ZYG11B,M_ZYX,M_ZZEF1,M_ZZZ3
0,2.0,-0.0361,,0.3014,,-0.0491,,,,,...,-0.3588,-0.6952,-0.6728,-1.3532,-0.0299,-0.5507,-0.9856,-0.3259,0.5299,-1.6666
3,2.0,-0.1102,,-0.1719,,-0.0491,,,,,...,-0.1447,-0.4123,-0.8120,-0.1139,-0.4753,-0.2675,-1.3015,0.2438,-0.7655,0.9903
4,2.0,-0.7052,,0.3580,,-0.0491,,,,,...,1.5699,0.5423,-1.7109,-0.6793,-0.5877,0.7986,-0.9206,-0.3501,-1.3085,-0.4572
5,2.0,1.4828,,-0.0582,,-0.0491,,,,,...,2.9688,0.1896,0.2491,0.4027,-0.2384,0.9772,0.0215,-1.3029,-1.0051,1.6254
6,2.0,-0.1719,,0.1259,,-0.0491,,,,,...,0.7023,0.3496,0.2206,0.9659,1.0893,-0.7623,1.0353,-0.4055,0.2360,0.6438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,1.0,-0.5763,,-0.5543,,-0.0491,,,,,...,-0.4111,-0.3825,-0.2148,-0.6769,-0.1111,0.4151,-0.3884,1.3184,-0.3554,-0.2667
482,1.0,0.5183,,-0.0826,,-0.0491,,,,,...,-0.9603,-0.1455,-0.0074,-0.2516,-0.3009,0.7224,-0.4032,0.2512,0.1328,-0.0146
484,1.0,0.6768,,10.6747,,-0.0491,,,,,...,-1.1159,-0.3622,0.1483,-0.2998,0.8452,0.0677,-1.1697,-0.9595,-0.2514,-1.4445
485,1.0,-0.2252,,-0.0288,,-0.0491,,,,,...,-0.7841,-0.6561,0.1232,0.2636,1.9797,-0.1537,0.2651,1.1690,0.4163,-1.1685


Therefore, we have a table containing 20472 features (mRNA expression data) and 417 patients.