In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, ElasticNet

Load the data

In [2]:
bac = pd.read_csv('data/ecs171.dataset.txt', sep = '\s+' )
print(bac.shape)
bac.head()

(194, 4502)


Unnamed: 0,ID,Strain,Medium,Stress,GenePerturbed,GrowthRate,b3356,b2922,b2519,b3823,...,b4684,b4689,b4685,b4683,b4682,b4699,b4698,b4704,b4703,b4635
0,T8129,MG1655,MD001,none,oxyR_KO,0.637,0.30479,0.506615,0.289473,0.289473,...,0,0,0,0,0,0,0,0,0,
1,T8130,MG1655,MD001,none,oxyR_KO,0.637,0.361095,0.582885,0.259295,0.259295,...,0,0,0,0,0,0,0,0,0,8.0
2,T8131,MG1655,MD001,none,oxyR_KO,0.637,0.29592,0.477482,0.304459,0.304459,...,0,0,0,0,0,0,0,0,0,
3,T8135,MG1655,MD001,none,soxS_KO,0.724,0.332041,0.554672,0.308533,0.308533,...,0,0,0,0,0,0,0,0,0,0.0
4,T8136,MG1655,MD001,none,soxS_KO,0.724,0.323373,0.442617,0.318442,0.318442,...,0,0,0,0,0,0,0,0,0,


Notice the last column does not always have any value, so I decided to drop it. And since there are more than 4000 attributes, and base on the basic knowledge of the gene, it should not make too much difference.

In [3]:
bac.drop(columns = 'b4635', axis = 1, inplace = True)

In [4]:
y = bac.GrowthRate
x = bac.iloc[:,6: ]
x.head()

Unnamed: 0,b3356,b2922,b2519,b3823,b3824,b3353,b1500,b2923,b2513,b2512,...,b4673,b4684,b4689,b4685,b4683,b4682,b4699,b4698,b4704,b4703
0,0.30479,0.506615,0.289473,0.289473,0.092213,0.109991,0.289473,0.289473,0.316641,0.248555,...,0,0,0,0,0,0,0,0,0,0
1,0.361095,0.582885,0.259295,0.259295,0.104293,0.1491,0.259295,0.259295,0.350822,0.256931,...,0,0,0,0,0,0,0,0,0,0
2,0.29592,0.477482,0.304459,0.304459,0.084021,0.121631,0.304459,0.304459,0.333197,0.251729,...,0,0,0,0,0,0,0,0,0,0
3,0.332041,0.554672,0.308533,0.308533,0.109976,0.146904,0.308533,0.308533,0.318897,0.267445,...,0,0,0,0,0,0,0,0,0,0
4,0.323373,0.442617,0.318442,0.318442,0.109609,0.128908,0.318442,0.318442,0.293445,0.253606,...,0,0,0,0,0,0,0,0,0,0


## 1. Elastic Net with Cross-Vlaidation:

In [225]:
class myEnet():
    ## define my elastic class, so it would be easier to extend any method or requirments.
    def __init__(self, **kw):
        '''Pass any keywords required in ElatisticNetCV.'''
        self._model = ElasticNetCV(**kw)
    
    def __call__(self, x, y):
        '''
        Call the firt method of the elastic model by calling the object itself.
        '''
        self._model.fit(x, y)
        return self._model

    def get_scores(self):
        parameters = self._model.get_params()
        l1_ratio = parameters['l1_ratio']
        alphas = parameters['alphas']
        mse = self._model.mse_path_
        mean_mse = np.mean(mse, axis = 2)
        frame = pd.DataFrame(mean_mse, index = l1_ratio, columns = alphas)
        frame.index.names = ["l1_ratio"]
        frame.columns.names = ['alpha']
        return frame
    def get_n_attributes(self):
        return sum(self._model.coef_ != 0)

    def get_index_attributes(self):
        return np.where(self._model.coef_ != 0)

    


In [226]:
# Tuning the parameters. 
# l1 ratio: .1, .25, .5, .75, .9, 1
# Numbers of alpha = 6
# There are 30 combinations in total.
enet = myEnet(cv = 5, n_jobs = -1, random_state = 256, l1_ratio = [ .1, .25, .5, .75, .9, 1], alphas = [0.5, 1, 2, 4, 8, 16], max_iter = 8000, verbose = True)
model = enet(x, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
....................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.7s finished


In [227]:
enet.get_scores()

alpha,0.5,1.0,2.0,4.0,8.0,16.0
l1_ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,0.055702,0.055702,0.055771,0.055743,0.053729,0.056487
0.25,0.055702,0.055702,0.055702,0.055779,0.055749,0.054794
0.5,0.055702,0.055702,0.055702,0.055702,0.055779,0.055749
0.75,0.055702,0.055702,0.055702,0.055702,0.055702,0.055768
0.9,0.055702,0.055702,0.055702,0.055702,0.055702,0.055782
1.0,0.055702,0.055702,0.055702,0.055702,0.055702,0.055779


As the score table shows, the minimal mse is obtained when alpha = 8 and l1_ratio = 0.1.

In [228]:
enet.get_n_attributes()

8

In [231]:
enet.get_index_attributes()

(array([ 152,  159,  400,  723, 2409, 2576, 2718, 3088]),)

Based on the output above, when set alpha = 8 and l1_ratio to be .1, there will be 8 attributes that are treated as informative in the model, and their indices are: 152,  159,  400,  723, 2409, 2576, 2718, 3088. And the five-fold cross validation error is 0.053729.

## 2. 

In [232]:
from sklearn.utils import resample

Assume the distribution of the error term is normal distribution. 

In [None]:
def bootStrap(x, y, n_samples):
    

In [238]:
x_sampled, y_sampled = resample(x,y, n_samples = 2000)

In [240]:
x_sampled

Unnamed: 0,b3356,b2922,b2519,b3823,b3824,b3353,b1500,b2923,b2513,b2512,...,b4673,b4684,b4689,b4685,b4683,b4682,b4699,b4698,b4704,b4703
29,0.371718,0.547738,0.114015,0.088037,0.160511,0.307009,1.139102,0.000000,0.407671,0.346156,...,0,0,0,0,0,0,0,0,0,0
21,2.576907,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.254210,0.000000,...,0,0,0,0,0,0,0,0,0,0
173,0.026878,0.032896,0.000000,0.000000,0.016990,0.013722,0.010435,0.012775,0.026635,0.030571,...,0,0,0,0,0,0,0,0,0,0
2,0.295920,0.477482,0.304459,0.304459,0.084021,0.121631,0.304459,0.304459,0.333197,0.251729,...,0,0,0,0,0,0,0,0,0,0
66,0.238176,0.247356,0.000000,0.000000,0.357283,0.902854,0.113167,0.113167,0.248575,0.271077,...,0,0,0,0,0,0,0,0,0,0
135,0.539999,0.000000,0.000000,2.243065,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
27,0.367334,0.445310,0.104875,0.052913,0.127384,0.335075,0.000000,0.000000,0.440232,0.382123,...,0,0,0,0,0,0,0,0,0,0
34,0.159939,0.140409,0.047102,0.047639,0.054442,0.074683,6.592642,0.109283,0.164604,0.127435,...,0,0,0,0,0,0,0,0,0,0
116,0.753685,0.000000,0.000000,0.000000,0.753688,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
142,0.160605,0.092519,0.000000,0.000000,0.011732,0.045165,0.393984,0.010732,0.132069,0.163293,...,0,0,0,0,0,0,0,0,0,0
