### Feature selection algorithms applied to determine the most important biomarkers for diagnosing colon cancer

 Demonstrative notebook

In [3]:
# packages to import
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet,Lasso,Lars,OrthogonalMatchingPursuit
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,recall_score,precision_score

In [27]:
# Preview of the first rows of the data set used to train the models
# (in the rendered output, each row is a patient and each column a biomarker).
dataFrame.head(n=5)

Unnamed: 0,TEX11,BHMT2,STC2,D21S2056E,GTF2H1,PSME3,RRAD,CCNH,ALDOA,SH3BP1,...,TMEM66,DOC-1R,MGC18216,SERPIND1,PIGQ,NRIP1,HIGD2A,RPL10,TGFBR2,RABL2B
PAT 7657,2.15557,1.55961,1.652982,1.579475,1.636618,1.797736,1.839806,1.131839,1.490569,1.126158,...,2.033001,2.928262,2.909495,2.178864,3.308695,3.906101,1.468596,4.230699,4.25367,1.717029
PAT 7938,2.594781,2.501282,3.43022,2.452831,2.455439,2.967438,3.251557,4.156617,3.286721,2.728267,...,2.96922,6.356543,6.265912,2.541806,5.18936,5.398594,2.420806,9.743717,6.829954,1.946595
PAT 7942,1.32109,1.125496,1.434565,1.193589,0.975665,1.390202,1.271447,1.82672,1.242207,1.033188,...,1.250513,2.633882,1.850188,1.053447,1.623159,2.051022,1.032258,2.590952,2.793577,0.68056
PAT 8014,1.199055,1.182967,1.628302,1.032696,1.000866,1.380063,1.312044,1.117691,1.476219,1.826212,...,2.153624,2.492472,2.52543,2.142806,3.295712,2.293355,2.907453,3.610939,3.752397,2.715629
PAT 8015,0.635057,0.427649,0.795654,0.438304,0.344568,0.485543,0.458305,0.29156,0.686847,1.27926,...,2.092384,1.88441,1.767004,2.147815,2.052154,1.083346,1.766682,1.208958,1.824611,1.039966


In [6]:
# Preview of the first rows of the target table.
#
# Important: the labels referring to the diagnosis phase of the disease
# are stored in column 0.
dataFrameTarget.head(n=5)

Unnamed: 0,0,1,2
PAT 7657,1.0,1.0,1.0
PAT 7938,1.0,1.0,1.0
PAT 7942,1.0,1.0,1.0
PAT 8014,1.0,1.0,1.0
PAT 8015,2.0,2.0,2.0


In [32]:
# Keep only the leading rows of both tables (per the original note, this removes
# the healthy patients) and pick the diagnosis-phase labels from column 0.
N_DIAGNOSED_PATIENTS = 51  # rows at positions 0..50 are kept; row 51 onward is discarded

# Positional slicing keeps exactly the first N rows regardless of the index labels;
# .copy() yields independent frames, as DataFrame.drop did.
pat = dataFrame.iloc[:N_DIAGNOSED_PATIENTS].copy()
patTarget = dataFrameTarget.iloc[:N_DIAGNOSED_PATIENTS].copy()

# Features and labels are passed to the models as separate arrays, so fail loudly
# here if their rows do not refer to the same patients in the same order.
assert pat.index.equals(patTarget.index), "feature rows and target rows are not aligned"

# Single-column DataFrame holding the diagnosis-phase labels (column 0).
diagnosticoPat = patTarget.iloc[:, 0].to_frame()

#### Testing the developed class and applying the chosen models to the data

In [182]:
# Sparse linear models compared in this notebook; all are fitted without an intercept.
lasso = Lasso(alpha=0.1, fit_intercept=False, max_iter=50000, tol=1e-4)

# NOTE: for OrthogonalMatchingPursuit a non-None `tol` overrides `n_nonzero_coefs`,
# so the previous `n_nonzero_coefs=1` had no effect (OMP selects 51 variables in the
# results below, not 1). The ignored argument is dropped; the fitted model is unchanged.
# To really cap the number of selected variables, pass `n_nonzero_coefs` and leave `tol=None`.
omp = OrthogonalMatchingPursuit(tol=1e-8, fit_intercept=False)

lar = Lars(fit_intercept=False, n_nonzero_coefs=5)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, fit_intercept=False, max_iter=50000, tol=1e-4)

In [183]:
# Helper object used below via `leave_one_out(...)` and `chosen_variables(...)`.
# NOTE(review): `Metrics` is neither imported nor defined in the visible cells —
# it must already be in scope for this cell to run on a fresh kernel.
metrics = Metrics()

Results of LASSO:

In [184]:
# Evaluate LASSO with `Metrics.leave_one_out`.
# Labels: the diagnosis column mapped element-wise through `transform`, flattened to 1-D.
metrics.leave_one_out(
    lasso,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Accuracy: 0.6862745098039216 
Precision: 0.7948717948717948
Recall: 0.7948717948717948
F1 Score: 0.7948717948717948


In [185]:
# Variables reported by `Metrics.chosen_variables` for LASSO
# (same features and transformed labels as in the evaluation cell).
metrics.chosen_variables(
    lasso,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Index(['VIL2', 'MAPK13', 'GJB2', 'NUP54', 'CXCL2', 'BATF', 'HSPA2', 'TYROBP',
       'TRIP10', 'CycE2', 'HEXIM1', 'TNFSF14', 'NPPB', 'IFI35', 'IL10RB',
       'RNP24', 'VTI2', 'LALBA', 'CSNK2A2', 'FGF13', 'CDC42', 'PRAME',
       'CDKN1A', 'MED8', 'CDH15', 'CITED1', 'BAG3'],
      dtype='object', name='Unnamed: 0')

Results of LARS:

In [186]:
# Evaluate LARS with `Metrics.leave_one_out`.
# Labels: the diagnosis column mapped element-wise through `transform`, flattened to 1-D.
metrics.leave_one_out(
    lar,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Accuracy: 0.7647058823529411 
Precision: 0.7647058823529411
Recall: 1.0
F1 Score: 0.8666666666666666


In [187]:
# Variables reported by `Metrics.chosen_variables` for LARS
# (same features and transformed labels as in the evaluation cell).
metrics.chosen_variables(
    lar,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Index(['TNFSF14', 'IFI35', 'IL10RB', 'VTI2', 'NOV'], dtype='object', name='Unnamed: 0')

Results of ElasticNet:

In [188]:
# Evaluate ElasticNet with `Metrics.leave_one_out`.
# Labels: the diagnosis column mapped element-wise through `transform`, flattened to 1-D.
metrics.leave_one_out(
    elastic_net,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Accuracy: 0.7450980392156863 
Precision: 0.8421052631578947
Recall: 0.8205128205128205
F1 Score: 0.8311688311688312


In [189]:
# Variables reported by `Metrics.chosen_variables` for ElasticNet
# (same features and transformed labels as in the evaluation cell).
metrics.chosen_variables(
    elastic_net,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Index(['VIL2', 'GMDS', 'GJB2', 'EIF2B2', 'NUP54', 'PDHB', 'UCHL1', 'SERPING1',
       'ANKRD1', 'ELF5', 'BATF', 'TYROBP', 'TRIP10', 'TMED1', 'CCL27',
       'TNFSF14', '76P', 'NPPB', 'CST3', 'IFI35', 'IL10RB', 'NFIC', 'CFB',
       'PABPC1', 'HBE1', 'VTI2', 'HSPA8', 'PIP', 'LALBA', 'CSNK2A2', 'BRE',
       'FGF13', 'CDC42', 'EDF1', 'KLRC1', 'CDKN1A', 'ACO2', 'MED8', 'CITED1',
       'BAG3', 'ETS2', 'ENSA'],
      dtype='object', name='Unnamed: 0')

Results of OMP:

In [190]:
# Evaluate OMP with `Metrics.leave_one_out`.
# Labels: the diagnosis column mapped element-wise through `transform`, flattened to 1-D.
metrics.leave_one_out(
    omp,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Accuracy: 0.7058823529411765 
Precision: 0.8529411764705882
Recall: 0.7435897435897436
F1 Score: 0.7945205479452054


In [191]:
# Variables reported by `Metrics.chosen_variables` for OMP
# (same features and transformed labels as in the evaluation cell).
metrics.chosen_variables(
    omp,
    pat,
    diagnosticoPat.applymap(transform).values.ravel(),
)

Index(['VIL2', 'GMDS', 'UGDH', 'STUB1', 'SCYB6', 'RUVBL1', 'MOSPD1', 'IL4',
       'C5orf15', 'TNA', 'STAR', 'UCHL1', 'p53', 'TCEA1', 'NMUR2', 'ANKRD1',
       'RPL32', 'CTCF', 'CCDC47', 'SESN2', 'GGT1', 'TYROBP', 'NPPB', 'TNF',
       'PFN1', 'CRIP2', 'PRL', 'IFI35', 'IL10RB', 'CCNI', 'ST3GAL1', 'VTI2',
       'GPC3', 'MRPL40', 'EPS8', 'S100A9', 'CLIC2', 'OPTN', 'NSUN5', 'CLIC1',
       'SNRPD2', 'ADH5', 'PPIH', 'INSL5', 'NOV', 'IL8RA', 'RNASEH1', 'PHYH',
       'WNT2', 'ENSA', 'ETV3'],
      dtype='object', name='Unnamed: 0')

##### For a deeper analysis, it is necessary to use the tools presented so far to run a search over the models' hyperparameters.