## BBC - Project
### Data Exploration

In [1]:
DATA_PATH = './data/'

In [2]:
import numpy as np
from Bio import Geo

def load_geo(myfile):
    handle = open(myfile)
    records = Geo.parse(handle)
    return records
    
records = load_geo(DATA_PATH + 'GSE21510_family.soft')

The following cell should be executed multiple times to run through the data

In [3]:
nr = next(records)
print(nr)
print('\n---------------------- ENTITY_ATTRIBUTES:------------------------------\n')
print(nr.entity_attributes)

GEO Type: DATABASE
GEO Id: GeoMiame
Database_email: geo@ncbi.nlm.nih.gov

Database_institute: NCBI NLM NIH

Database_name: Gene Expression Omnibus (GEO)

Database_web_link: http://www.ncbi.nlm.nih.gov/geo

Column Header Definitions


---------------------- ENTITY_ATTRIBUTES:------------------------------

{'Database_name': 'Gene Expression Omnibus (GEO)', 'Database_institute': 'NCBI NLM NIH', 'Database_web_link': 'http://www.ncbi.nlm.nih.gov/geo', 'Database_email': 'geo@ncbi.nlm.nih.gov'}


In [4]:
# re-loading the data
records = load_geo(DATA_PATH + 'GSE21510_family.soft')
series_sample_id = []
sample_titles = []
genes = []
nb_cols = 0
nb_rows = 0
data = []
for r in records:
    rea = r.entity_attributes
    if 'Series_geo_accession' in rea:
        if rea['Series_geo_accession'] == 'GSE21510':
            series_sample_id = rea['Series_sample_id']
            nb_cols = len(series_sample_id)
    if 'Sample_title' in rea:
        sample_titles.append(rea['Sample_title'])
        if 'sample_table_begin' in rea:
            nb_rows = rea['Sample_data_row_count'] 
            data.append(r.table_rows)
                    

data will become a 2d numpy array of shape (54676, 149) == (NB_GENES+1, NB_SAMPLES+1).  
First row will contain the sample ids, first column will contain the ID_REFs of the genes.

In [5]:
all_data = np.ndarray((int(nb_rows)+1, int(nb_cols)+1), dtype=object)
# labels
all_data[0, 0] = 'ID_REF'
all_data[0, 1:] = np.array(series_sample_id)

for i, d in enumerate(data):
    values = np.array(d[1:])
    if (i == 0):
        all_data[1:, 0] = values[:, 0]
    all_data[1:, i+1] = values[:, 1]
data = np.array(all_data)

In [6]:
data.shape

(54676, 149)

In [7]:
print(series_sample_id)
print('')
print(sample_titles)
print('')
print(len(data), len(data[0]))
print(data[0:10, 0])

['GSM537330', 'GSM537331', 'GSM537332', 'GSM537333', 'GSM537334', 'GSM537335', 'GSM537336', 'GSM537337', 'GSM537338', 'GSM537339', 'GSM537340', 'GSM537341', 'GSM537342', 'GSM537343', 'GSM537344', 'GSM537345', 'GSM537346', 'GSM537347', 'GSM537348', 'GSM537349', 'GSM537350', 'GSM537351', 'GSM537352', 'GSM537353', 'GSM537354', 'GSM537355', 'GSM537356', 'GSM537357', 'GSM537358', 'GSM537359', 'GSM537360', 'GSM537361', 'GSM537362', 'GSM537363', 'GSM537364', 'GSM537365', 'GSM537366', 'GSM537367', 'GSM537368', 'GSM537369', 'GSM537370', 'GSM537371', 'GSM537372', 'GSM537373', 'GSM537374', 'GSM537375', 'GSM537376', 'GSM537377', 'GSM537378', 'GSM537379', 'GSM537380', 'GSM537381', 'GSM537382', 'GSM537383', 'GSM537384', 'GSM537385', 'GSM537386', 'GSM537387', 'GSM537388', 'GSM537389', 'GSM537390', 'GSM537391', 'GSM537392', 'GSM537393', 'GSM537394', 'GSM537395', 'GSM537396', 'GSM537397', 'GSM537398', 'GSM537399', 'GSM537400', 'GSM537401', 'GSM537402', 'GSM537403', 'GSM537404', 'GSM537405', 'GSM537406'

In [8]:
import re

samples = {}
patient_names = []
for i_t, title in enumerate(sample_titles):
    split = title.split(',')
    if(re.search('cancer', title)):
        samples[series_sample_id[i_t]] = 'cancer'
    else:
        samples[series_sample_id[i_t]] = 'normal'
        
    patient_names.append(split[0])
        
samples

{'GSM537330': 'cancer',
 'GSM537331': 'cancer',
 'GSM537332': 'cancer',
 'GSM537333': 'cancer',
 'GSM537334': 'cancer',
 'GSM537335': 'cancer',
 'GSM537336': 'cancer',
 'GSM537337': 'cancer',
 'GSM537338': 'cancer',
 'GSM537339': 'cancer',
 'GSM537340': 'cancer',
 'GSM537341': 'cancer',
 'GSM537342': 'cancer',
 'GSM537343': 'cancer',
 'GSM537344': 'cancer',
 'GSM537345': 'cancer',
 'GSM537346': 'cancer',
 'GSM537347': 'cancer',
 'GSM537348': 'cancer',
 'GSM537349': 'cancer',
 'GSM537350': 'cancer',
 'GSM537351': 'cancer',
 'GSM537352': 'cancer',
 'GSM537353': 'cancer',
 'GSM537354': 'cancer',
 'GSM537355': 'cancer',
 'GSM537356': 'cancer',
 'GSM537357': 'cancer',
 'GSM537358': 'cancer',
 'GSM537359': 'cancer',
 'GSM537360': 'cancer',
 'GSM537361': 'cancer',
 'GSM537362': 'cancer',
 'GSM537363': 'cancer',
 'GSM537364': 'cancer',
 'GSM537365': 'cancer',
 'GSM537366': 'cancer',
 'GSM537367': 'cancer',
 'GSM537368': 'cancer',
 'GSM537369': 'cancer',
 'GSM537370': 'cancer',
 'GSM537371': 'c

In [9]:
print(sample_titles[104])

patient 107, normal, homogenized [OPG study]


### Prepare the data structure to run the SAM tests

Creating a dummy dataset in the Excel file for the SAM analysis

In [10]:
import pandas as pd

In [11]:
ar = np.ndarray((7, 5), dtype=object)
# ar[0][0] = 'N/A'
# ar[0][1] = 'N/A'
ar[0, 2:] = np.array([1, 2, 1], dtype='int32')
ar[1:, 0] = np.array(['g1', 'g2', 'g3', 'g4', 'g5', 'g6'])
ar[1:, 1] = np.array(['1001_at', '1002_at', '1003_at', '1004_at', '1005_at', '1006_at'])
ar[1:, 2:] = np.random.rand(6, 3)
ar

array([[None, None, 1, 2, 1],
       ['g1', '1001_at', 0.023971953109292454, 0.20594139793929878,
        0.34941609751566827],
       ['g2', '1002_at', 0.754147848194236, 0.38297658765075804,
        0.03337512156943967],
       ['g3', '1003_at', 0.9490814321324719, 0.16152569302383213,
        0.08172325437723582],
       ['g4', '1004_at', 0.4926470576851303, 0.7187075590320299,
        0.8814058488325429],
       ['g5', '1005_at', 0.13515786069993374, 0.989909819521922,
        0.9774488424916804],
       ['g6', '1006_at', 0.4770191332620297, 0.6940144964056626,
        0.6300210284333895]], dtype=object)

In [12]:
ar_frame = pd.DataFrame(ar)

In [13]:
ar_frame

Unnamed: 0,0,1,2,3,4
0,,,1.0,2.0,1.0
1,g1,1001_at,0.023972,0.205941,0.349416
2,g2,1002_at,0.754148,0.382977,0.0333751
3,g3,1003_at,0.949081,0.161526,0.0817233
4,g4,1004_at,0.492647,0.718708,0.881406
5,g5,1005_at,0.135158,0.98991,0.977449
6,g6,1006_at,0.477019,0.694014,0.630021


In [14]:
writer = pd.ExcelWriter('ar1.xlsx')
ar_frame.to_excel(writer, sheet_name='ar', index=False, header=False)
writer.save()

Now format the real dataset.  
The Excel files containing data are already generated -> commented

In [15]:
def to_sam_data(data):
    sam_data = np.ndarray((int(nb_rows)+1, int(nb_cols)+2), dtype=object)
    sam_data[:, 1:] = data
    # create the first column: gene IDs
    for i in np.arange(1, sam_data.shape[0]):
        sam_data[i, 0] = 'g' + str(i)
    sam_data[0, 1] = None
    for i_c, c in enumerate(sam_data[0, 2:]):
        if(samples[c] == 'cancer'):
            sam_data[0, i_c+2] = 2
        else:
            sam_data[0, i_c+2] = 1
    return sam_data

In [16]:
# sam_data = to_sam_data(data)

In [17]:
# sam_frame = pd.DataFrame(sam_data)
# writer = pd.ExcelWriter('colorectal_cancer.xlsx')
# sam_frame.to_excel(writer, sheet_name='cancer', index=False, header=False)
# writer.save()

We have extracted the significant genes from the original dataset so that we have around 500 genes left.  
Then we saved the significant genes in 2 Excel files: sig_genes and neg_sig_genes

In [18]:
# positive significant
sig_genes_frame = pd.read_excel(open('sig_genes.xlsx', 'rb'))
sig_genes_frame

Unnamed: 0,Row,Gene ID,Gene Name,Score(d),Numerator(r),Denominator(s+s0),Fold Change,q-value(%)
0,31830,222549_at,g31829,23.393728,4.944231,0.211349,1.858560,0
1,35065,225806_at,g35064,21.681580,4.087929,0.188544,1.882985,0
2,22247,212942_s_at,g22246,21.375612,5.388269,0.252076,2.127137,0
3,40131,230875_s_at,g40130,21.293030,4.201300,0.197309,1.689584,0
4,14149,204700_x_at,g14148,21.119135,2.629669,0.124516,1.407738,0
5,10929,201479_at,g10928,21.023521,2.496684,0.118757,1.252292,0
6,18849,209434_s_at,g18848,20.878641,3.156227,0.151170,1.521188,0
7,27539,218252_at,g27538,20.576550,3.394665,0.164977,1.511293,0
8,21468,212160_at,g21467,20.440679,2.333815,0.114175,1.277153,0
9,28463,219177_at,g28462,20.363365,2.332021,0.114520,1.360867,0


In [19]:
def get_sig_genes(sig_genes_frame, data):
    '''
    This function extracts the significant genes from the dataset
    sig_genes_frame: pandas dataframe containing values of significant genes (SAM result)
    data: the original dataset
    '''
    # extract the indices of all significant genes
    idx_sig_genes = sig_genes_frame.values[:, 0]
    # create a numpy array that will contain the data of all significant genes
    # nb of rows: nb of significant genes
    # nb of cols: nb of samples in the original dataset
    sig_genes = np.ndarray((idx_sig_genes.shape[0]+1, data.shape[1]), dtype=object)
    # add the header line
    sig_genes[0, :] = data[0, :]
    # fill the rest with the ID_REF of the gene (first col) and the gene expression values
    for i, sig_index in enumerate(idx_sig_genes):
        sig_genes[i+1, :] = data[sig_index-1, :]
    return sig_genes

In [20]:
# positive significant genes
sig_genes = get_sig_genes(sig_genes_frame, data)
sig_genes.shape

(194, 149)

In [21]:
# negative significant genes
neg_sig_genes_frame = pd.read_excel(open('neg_sig_genes.xlsx', 'rb'))
neg_sig_genes_frame

Unnamed: 0,Row,Gene ID,Gene Name,Score(d),Numerator(r),Denominator(s+s0),Fold Change,q-value(%)
0,16946,207502_at,g16945,-24.587828,-5.722159,0.232723,0.444672,0
1,28955,219669_at,g28954,-24.587407,-5.148021,0.209376,0.534745,0
2,19024,209612_s_at,g19023,-24.536851,-4.935035,0.201127,0.444100,0
3,19025,209613_s_at,g19024,-24.232185,-4.462410,0.184152,0.435183,0
4,40044,230788_at,g40043,-23.905884,-4.586701,0.191865,0.511890,0
5,14649,205200_at,g14648,-23.846603,-3.081920,0.129239,0.635384,0
6,16451,207003_at,g16450,-23.608192,-5.196195,0.220101,0.573003,0
7,29195,219909_at,g29194,-23.101830,-4.547841,0.196861,0.484035,0
8,20823,211494_s_at,g20822,-23.050639,-3.979604,0.172646,0.550064,0
9,41,1552296_at,g40,-22.970081,-4.821557,0.209906,0.469868,0


In [22]:
neg_sig_genes = get_sig_genes(neg_sig_genes_frame, data)
neg_sig_genes.shape

(250, 149)

So far we have extracted the gene expression for all positive and negative significant genes. Now we will merge the two and create a data structure in input format for the __Trefle Classifier__

In [38]:
def merge_sigenes(pos_sig_genes, neg_sig_genes, sample_labels):
    '''
    This function merges arrays containing data of the positive and negative significant genes 
    and creates the array for the Trefle classifier
    '''
    # the shape is the sum of number of lines of arrays containing negative and positive significant genes -1 
    # (because both contain the header with the sample names)
    sig_genes_all = np.ndarray((pos_sig_genes.shape[0]+neg_sig_genes.shape[0]-1, pos_sig_genes.shape[1]), dtype=object)
    sig_genes_all[0:pos_sig_genes.shape[0], :] = pos_sig_genes
    sig_genes_all[pos_sig_genes.shape[0]:, :] = neg_sig_genes[1:, :] # without header line

    # Transpose, so every line becomes a vector of features
    # Reserve one column for the labels
    sig_genes_t = np.ndarray((pos_sig_genes.shape[1], pos_sig_genes.shape[0]+neg_sig_genes.shape[0]), dtype=object)
    sig_genes_t[:, 0:sig_genes_t.shape[1]-1] = sig_genes_all.T

    # Label the samples: Malign = 1, Benign = 0
    b_cnt = 0
    for i, sample in enumerate(sig_genes_t):
        if (i == 0):
            sig_genes_t[i, sig_genes_t.shape[1]-1] = 'label'
        else:
            if(sample_labels[sig_genes_t[i, 0]] == 'cancer'):
                sig_genes_t[i, sig_genes_t.shape[1]-1] = 1
            else:
                sig_genes_t[i, sig_genes_t.shape[1]-1] = 0
                b_cnt += 1
    print(b_cnt)
    return sig_genes_t

In [39]:
sig_genes_t = merge_sigenes(sig_genes, neg_sig_genes, samples)
sig_genes_df = pd.DataFrame(sig_genes_t)
sig_genes_df

25


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,434,435,436,437,438,439,440,441,442,443
0,ID_REF,222549_at,225806_at,212942_s_at,230875_s_at,204700_x_at,201479_at,209434_s_at,218252_at,212160_at,...,209763_at,209791_at,213030_s_at,219722_s_at,1554691_a_at,211848_s_at,210301_at,241910_x_at,207251_at,label
1,GSM537330,11.18204744,8.654007946,11.5958136,10.78885867,9.970058164,13.26515873,10.30946487,10.79228985,10.9790911,...,4.385058586,7.163496669,6.133836087,6.409016661,6.471690366,5.407069999,5.771060145,6.579975696,4.020783328,1
2,GSM537331,12.05889929,8.570927803,8.078051196,10.84165751,8.926294779,12.31549486,8.541125393,10.28592502,10.30147795,...,4.608451234,7.179010927,7.209484805,6.449631771,6.180758558,5.519771773,5.899706441,6.402625768,4.200473671,1
3,GSM537332,12.20760134,8.88127098,12.00316315,11.15447693,9.328659717,12.53940326,9.705434261,10.1458872,10.97528834,...,4.159437788,7.990475281,7.373144375,8.83281199,6.113779816,5.249316012,5.638621009,6.233672531,4.106327029,1
4,GSM537333,10.10776161,9.669961284,11.52090695,9.931037961,8.992218311,12.54182663,9.969552761,9.541108789,11.04544616,...,4.548334474,9.561013955,6.443398102,5.710409597,6.459583864,5.136545827,5.580786637,6.154057045,4.047665357,1
5,GSM537334,11.83347823,8.54325762,9.102151499,10.32935868,9.553853044,12.25560094,9.499728856,10.19866325,11.00491197,...,4.409519906,7.440873925,5.652310431,6.246096399,6.404186702,6.214226432,5.731559804,6.186046895,4.234884863,1
6,GSM537335,12.05951236,9.114890763,9.414644981,11.48184062,9.314749297,12.5085425,10.09920405,11.13265214,11.25041237,...,4.539115542,6.781416751,6.264596756,7.092256264,5.785716109,5.06562676,5.445177045,6.685326309,4.421340238,1
7,GSM537336,7.525914339,8.076110964,12.36997219,10.06114487,7.938799139,11.93880691,9.617497437,8.953276983,10.84142395,...,3.801821721,8.502481353,6.869098396,8.307073043,7.160047396,4.618498621,5.836586746,6.101565231,4.248786199,1
8,GSM537337,10.95448265,8.840053413,9.808921861,10.42909422,8.758957342,12.68416821,9.05050804,10.16450644,10.5476683,...,4.96024753,8.517111873,6.526638779,6.837988838,6.495719325,8.847287283,5.6873279,5.990007493,4.135739028,1
9,GSM537338,12.30607122,8.421929106,8.836538943,10.6527966,8.953626302,12.70577132,9.639116871,10.42768544,10.64716972,...,4.966710005,6.989519686,7.01579861,6.365624674,6.524275118,6.739127153,5.806423441,6.088321325,4.317008762,1


In [25]:
# sig_genes_all.csv will contain the data of 444 significant genes according to SAM extraction method
sig_genes_df.to_csv('sig_genes_all.csv', index=False, header=False)

Now we will prepare the dataset containing ~22k significant genes extracted with SAM. This dataset will be later used for the RFE feature extraction method

In [26]:
# Read from previously generated Excel file to the pandas dataframe
pos_sig_genes_22k_df = pd.read_excel(open('pos_sigenes_22k.xlsx', 'rb'))
neg_sig_genes_22k_df = pd.read_excel(open('neg_sigenes_22k.xlsx', 'rb'))

# Extract the values corresponding to the significant genes from the original dataset
pos_sig_genes_22k = get_sig_genes(pos_sig_genes_22k_df, data)
neg_sig_genes_22k = get_sig_genes(neg_sig_genes_22k_df, data)

# Merge positive and negative significant genes into single array
sig_genes_all_22k = merge_sigenes(pos_sig_genes_22k, neg_sig_genes_22k, samples)

# RFE method can be applied with this array (with the use of functions below) 
sig_genes_all_22k = sig_genes_all_22k.T # transpose
sig_genes_all_22k = sig_genes_all_22k[0:sig_genes_all_22k.shape[0]-1, :] # drop the last line containing labels

In [27]:
sig_genes_all_22k.shape

(22212, 149)

### Support Vector Machines + Recursive Feature Elimination

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4175386/

In [28]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

Transform the data to feed it to the RFE selector

X is an input for RFE: every line of X represents a sample (patient). Every column represents the feature.
y contains the labels for every sample: 1 if the patient has cancer, 0 otherwise 

In [29]:
def rfe_data(data, sample_labels):
    '''
    data: numpy array every column of which represents a sample and the row represents a feature
        first column contains the feature id (ID_REF of the gene in our case)
        first row contains the sample id 
    sample_labels: dictionnary having sample ids as keys and 'cancer'/'no cancer' as values
    returns (X, y) tuple where X contains the values of the feature set
        every line of X contains values corresponding to one sample (sample in our case)
        columns of X represent the features (genes in our case)
        y contains the labels (1-Malign, 0-Benign)
    '''
    X = data[1:, 1:].T
    y = np.zeros(len(sample_labels))
    
    for i, y_i in enumerate(y):
        if(sample_labels[data[0, i+1]] == 'cancer'):
            y[i] = 1
        else:
            y[i] = 0
    return (X, y)

In [30]:
X, y = rfe_data(sig_genes_all_22k, samples)

In [31]:
X.shape

(148, 22211)

In [32]:
y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [33]:
# Create the RFE object and rank each gene
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=400, step=100)
# The following function will filter the input so that only the significant {n_features_to_select} or less genes remain
rfe_selected = rfe.fit_transform(X, y)

In [34]:
rfe_selected.shape

(148, 400)

In [35]:
selected_indices = rfe.get_support(indices=True)
selected_indices

array([    0,     1,     2,     3,     4,     6,     7,    10,    11,
          12,    16,    17,    20,    22,    25,    28,    29,    30,
          31,    41,    43,    44,    45,    49,    50,    55,    64,
          80,    85,    94,   103,   109,   113,   121,   127,   129,
         132,   140,   150,   162,   163,   181,   191,   209,   212,
         214,   227,   233,   235,   243,   244,   272,   274,   319,
         324,   333,   354,   366,   373,   378,   401,   406,   424,
         445,   450,   467,   476,   481,   488,   490,   523,   542,
         567,   615,   627,   641,   642,   643,   649,   658,   669,
         675,   683,   694,   798,   831,   844,   865,   875,   965,
         971,   983,  1022,  1052,  1081,  1104,  1114,  1133,  1138,
        1141,  1182,  1198,  1209,  1322,  1333,  1342,  1410,  1420,
        1463,  1532,  1564,  1596,  1766,  1799,  1842,  1864,  1871,
        1938,  2016,  2050,  2070,  2163,  2191,  2334,  2369,  2416,
        2461,  2528,

In [40]:
idx_selected = selected_indices+1 # maka a shift because the first col contains the sample names
nrows = X.shape[0]+1 # +1 line for the sample ids
ncols = selected_indices.shape[0]+2 # +2 columns for the gene id and the label
rfe_selected_data = np.ndarray((nrows, ncols), dtype=object)
rfe_selected_data[:, 0] = data[0,:ncols] # sample ids

# Label the samples: Malign = 1, Benign = 0
b_cnt = 0
for i, sample in enumerate(rfe_selected_data):
    if (i == 0):
        rfe_selected_data[i, ncols-1] = 'label'
    else:
        if(samples[rfe_selected_data[i, 0]] == 'cancer'):
            rfe_selected_data[i, ncols-1] = 1
        else:
            b_cnt += 1
            rfe_selected_data[i, ncols-1] = 0
print(b_cnt)

# gene ids
rfe_selected_data[0, 1:ncols-1] = data[idx_selected,0]
# Now fill the array with the values corresponding to the selected genes
rfe_selected_data[1:, 1:ncols-1] = rfe_selected
rfe_selected_df = pd.DataFrame(rfe_selected_data)
rfe_selected_df

25


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,392,393,394,395,396,397,398,399,400,401
0,ID_REF,1007_s_at,1053_at,117_at,121_at,1255_g_at,1316_at,1320_at,1438_at,1487_at,...,210783_x_at,211377_x_at,211461_at,211581_x_at,211585_at,211602_s_at,211616_s_at,212043_at,212612_at,label
1,GSM537330,11.18204744,8.654007946,11.5958136,10.78885867,9.970058164,10.30946487,10.79228985,12.2049361,9.631743521,...,5.591213226,6.227322056,7.392751757,6.224789113,5.689202384,8.825054163,4.5774541,8.677586148,9.52411921,1
2,GSM537331,12.05889929,8.570927803,8.078051196,10.84165751,8.926294779,8.541125393,10.28592502,11.92396692,8.738657178,...,9.389268231,6.628290673,9.206933288,6.919133655,5.44737371,7.733068859,5.68360146,9.253717764,10.64291367,1
3,GSM537332,12.20760134,8.88127098,12.00316315,11.15447693,9.328659717,9.705434261,10.1458872,12.70236567,9.278909508,...,4.672494423,5.974763894,8.415521277,6.363705641,5.779687126,8.974140417,4.366214864,9.455260142,9.140912995,1
4,GSM537333,10.10776161,9.669961284,11.52090695,9.931037961,8.992218311,9.969552761,9.541108789,11.43061019,9.14319428,...,9.102809899,6.592707195,8.624147947,5.925591381,6.856468708,11.26411019,5.386560672,9.393032415,9.602141554,1
5,GSM537334,11.83347823,8.54325762,9.102151499,10.32935868,9.553853044,9.499728856,10.19866325,10.22286291,9.413488425,...,7.871904622,7.019065264,8.827480851,5.830366773,5.798082039,7.738825142,6.570926242,8.909785192,10.46381864,1
6,GSM537335,12.05951236,9.114890763,9.414644981,11.48184062,9.314749297,10.09920405,11.13265214,9.454139017,9.404364981,...,8.267092178,7.85044615,8.996063599,6.575039074,5.34535887,8.087454399,6.809127323,9.028036362,9.936006819,1
7,GSM537336,7.525914339,8.076110964,12.36997219,10.06114487,7.938799139,9.617497437,8.953276983,11.71047557,7.626420622,...,4.811251213,7.139248116,8.729923653,7.404631813,7.54427196,10.18273804,10.03320836,8.806878733,7.929835109,1
8,GSM537337,10.95448265,8.840053413,9.808921861,10.42909422,8.758957342,9.05050804,10.16450644,11.4751955,8.652624617,...,5.516149878,6.856876812,8.231140808,7.793435743,5.582625566,6.426627973,5.495323596,9.87955883,10.11734796,1
9,GSM537338,12.30607122,8.421929106,8.836538943,10.6527966,8.953626302,9.639116871,10.42768544,11.31486494,8.829740187,...,7.816269057,6.968128079,8.239821565,6.885398002,5.907605086,6.889094351,6.094550218,8.703073655,10.10424099,1


In [37]:
# sig_genes_all.csv will contain the data of 444 significant genes according to SAM extraction method
rfe_selected_df.to_csv('rfe_result.csv', index=False, header=False)