In [1]:
import numpy as np
import pandas as pd
import string
import sklearn.linear_model as sl
import scipy.stats as st

In [2]:
# All simulation files are hosted under the same directory.
BASE_URL = 'http://www4.stat.ncsu.edu/~boos/var.select/sim/'

url1 = BASE_URL + 'x.quad.0.txt'
url2 = BASE_URL + 'x.quad.70.txt'
url3 = BASE_URL + 'h0_0.rs35.txt'
url4 = BASE_URL + 'h1_0.rs35.txt'
url5 = BASE_URL + 'h2_0.rs35.txt'
url6 = BASE_URL + 'h3_0.rs35.txt'
url7 = BASE_URL + 'h4_0.rs35.txt'
url8 = BASE_URL + 'h0_70.rs35.txt'
url9 = BASE_URL + 'h1_70.rs35.txt'
url10 = BASE_URL + 'h2_70.rs35.txt'
url11 = BASE_URL + 'h3_70.rs35.txt'
url12 = BASE_URL + 'h4_70.rs35.txt'


def _read_whitespace_table(url, **kwargs):
    """Read a whitespace-delimited text table from `url` into a DataFrame.

    Uses ``sep=r'\\s+'``, the documented replacement for the deprecated
    ``delim_whitespace=True`` argument. Extra keyword arguments are passed
    straight to ``pd.read_csv``.
    """
    return pd.read_csv(url, sep=r'\s+', **kwargs)


# Design matrices (no header row, so columns are labelled 0, 1, 2, ...).
# 0 - 20: independent standard normal
# 21 - 41: squares
# 42 - 251: pairwise interactions
x1 = _read_whitespace_table(url1, header=None)
x2 = _read_whitespace_table(url2, header=None)

# response
# One file per model h0..h4; the first row of each file is read as the header.
# NOTE(review): the "_0" / "_70" suffixes presumably pair with x.quad.0 /
# x.quad.70 respectively - confirm against the data description.
y1_h0, y1_h1, y1_h2, y1_h3, y1_h4 = (
    _read_whitespace_table(u) for u in (url3, url4, url5, url6, url7)
)

y2_h0, y2_h1, y2_h2, y2_h3, y2_h4 = (
    _read_whitespace_table(u) for u in (url8, url9, url10, url11, url12)
)

In [3]:
def reg_subset(x, y):
    """Greedy forward selection of the columns of `x` by residual sum of squares.

    Starting from the intercept-only model, each step adds the not-yet-selected
    column whose inclusion gives the smallest RSS of a least-squares fit
    (``sl.LinearRegression``), until every column has been added.

    Parameters
    ----------
    x : pandas.DataFrame, shape (n, m)
        Candidate predictors. Columns are addressed by position (``.iloc``),
        so any column labels are accepted.
    y : array-like, length n
        Response values.

    Returns
    -------
    tuple ``(order, names, rss, pv_org)``
        order  : ndarray, column positions in the order they entered.
        names  : ndarray, the corresponding column labels.
        rss    : ndarray of length m + 1; ``rss[0]`` is the sum of squares
                 about the mean and ``rss[k]`` the RSS after k columns.
        pv_org : ndarray, p-value of the 1-df F statistic for each entry step.
                 Covers the first m steps, or the first n - 5 steps when
                 m >= n (empty if that count is not positive).
    """
    lm = sl.LinearRegression()
    y = np.asarray(y)
    n, m = x.shape

    selected = []
    remaining = list(range(m))
    rss = [np.sum((y - np.mean(y))**2)]

    for _ in range(m):
        # RSS obtained by adding each remaining candidate to the current model.
        step_rss = []
        for j in remaining:
            fit_X = x.iloc[:, selected + [j]]
            lm.fit(fit_X, y)
            step_rss.append(np.sum((lm.predict(fit_X) - y)**2))
        best = int(np.argmin(step_rss))      # first minimum, like list.index
        rss.append(step_rss[best])
        selected.append(remaining.pop(best))

    # Computed once after the loop, so it is defined even when m == 0.
    selected_names = [x.columns[j] for j in selected]
    rss = np.array(rss)

    # Number of entry steps that get a p-value; clipped at zero so the index
    # array stays a valid (integer, possibly empty) index.
    n_steps = max(n - 5 if m >= n else m, 0)
    vm = np.arange(n_steps)
    df_resid = n - (vm + 2)                  # n minus (k + 1 slopes + intercept)
    f_stat = (rss[vm] - rss[vm + 1]) * df_resid / rss[vm + 1]
    # Survival function instead of 1 - cdf: same quantity, no cancellation
    # for very small p-values.
    pv_org = st.f.sf(f_stat, 1, df_resid)
    return (np.array(selected), np.array(selected_names), rss, pv_org)

In [4]:
def gic(rss, m, n, z):
    """Size (intercept included) of the model minimising an information criterion.

    rss : ndarray of m + 1 residual sums of squares, the first for the
          intercept-only model and each following one for one more predictor.
    m   : largest number of predictors considered.
    n   : number of observations.
    z   : penalty per parameter (``np.log(n)`` gives BIC).

    Returns the 1-based position of the smallest criterion value, so 1 means
    the intercept-only model.
    """
    param_counts = np.arange(2, m + 3)
    penalty = z * param_counts
    scaled_log_rss = n * np.log(rss / n)
    criterion = penalty + scaled_log_rss + n + 2*n*np.log(np.sqrt(2*np.pi))
    best_position = np.argmin(criterion)
    return best_position + 1

In [5]:
def bic_sim(x, y):
    """Fit the model chosen by minimum BIC along the forward-selection sequence.

    The sequence comes from ``reg_subset`` (called "FAS" in the original
    docstring); ``gic`` with penalty ``log(n)`` picks the model size.

    Parameters
    ----------
    x : pandas.DataFrame, shape (n, m)
        Candidate predictors.
    y : array-like, length n
        Response values.

    Returns
    -------
    dict with keys
        'fitted'      : fitted values of the chosen model.
        'residual'    : its residual sum of squares.
        'df_residual' : n - size - 1.
        'size'        : number of selected predictors (intercept excluded).
        'index'       : selected column positions plus one (1-based), in entry
                        order; an empty integer array when nothing is selected.
    """
    lm = sl.LinearRegression()
    n, m = x.shape
    order, _, rss, _ = reg_subset(x, y)

    if m > n:
        # Only the first 60 steps are considered. gic needs m + 1 RSS values
        # (intercept-only model plus one per added column), so keep 61 entries.
        m = min(m, 60)
        rss = rss[0:m + 1]

    # model size including intercept
    bic = gic(rss, m, n, np.log(n))
    n_selected = bic - 1

    if n_selected > 0:
        x_ind = order[0:n_selected]
        design = x.iloc[:, x_ind]            # positional, like reg_subset's order
        lm.fit(design, y)
        fitted = lm.predict(design)
    else:
        # Intercept-only model: every fitted value is the sample mean and no
        # column is reported as selected.
        x_ind = np.array([], dtype=int)
        fitted = np.full(n, np.mean(y))

    residual = np.sum((y - fitted)**2)
    df_residual = n - n_selected - 1
    return {'fitted': fitted, 'residual': residual, 'df_residual': df_residual,
            'size': n_selected, 'index': x_ind + 1}

In [6]:
# bic simulation
# Keep the first 21 columns (positions 0-20) of each design matrix.
# `.ix` no longer exists in pandas; on these integer-labelled columns its slice
# 0:20 was label-based and end-inclusive, hence the exclusive stop of 21 here.
x1_matrix = x1.iloc[:, 0:21]
x2_matrix = x2.iloc[:, 0:21]
# One row per model h0..h4: the variable numbers listed for that model,
# right-padded with -1 to a common width of 14.
hbeta = np.array(
    [
        active + [-1] * (14 - len(active))
        for active in (
            [],
            [7, 14],
            [6, 7, 8, 13, 14, 15],
            [5, 6, 7, 8, 9, 12, 13, 14, 15, 16],
            [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
        )
    ],
    dtype=float,
)

In [20]:
N_REP, N_OBS = 100, 150        # replicate data sets per model, observations per data set
sqrt_reps = np.sqrt(N_REP)     # divisor turning a standard deviation into an se of the mean

res = np.zeros((5, 14))
me_out = np.zeros((N_REP, 4))
me_out1_21 = np.zeros((N_REP, 4, 5))
me_out2_21 = np.zeros((N_REP, 4, 5))
jac = np.zeros(N_REP)
data = [y1_h0, y1_h1, y1_h2, y1_h3, y1_h4]
for m in range(5):
    hdata = data[m]
    # Column position 2 holds the responses, stacked replicate after replicate.
    hdata_y = np.asarray(hdata.iloc[:, 2]).reshape([N_REP, N_OBS])
    # First N_OBS rows of column position 1. The old `.ix[0:149, 1]` row slice
    # was end-inclusive; this assumes the default 0..N-1 row index - TODO confirm.
    mu = np.asarray(hdata.iloc[0:N_OBS, 1])                    # true means
    beta = hbeta[m, :]
    model_size = np.count_nonzero(beta != -1)                  # true model size

    for i in range(N_REP):
        out_bic = bic_sim(x1_matrix, hdata_y[i, :])
        size = out_bic['size']                                 # number of fitted x's
        correct = len(np.intersect1d(out_bic['index'], beta))  # number correct
        false = size - correct                                 # number false
        me_out[i, 0] = np.mean((out_bic['fitted'] - mu)**2)    # model error
        me_out[i, 1] = size
        me_out[i, 2] = false
        me_out[i, 3] = out_bic['residual']/out_bic['df_residual']
    print('model h%d: %d replicates done' % (m, N_REP))

    # Store this model's per-replicate results in its own slice (the slice
    # index was previously hard-coded to 1, so every model overwrote it).
    me_out1_21[:, :, m] = np.round(me_out, 4)

    # Jaccard index: correct / (false + true model size); 1 when both the
    # selected and the true model are empty.
    n_correct = me_out[:, 1] - me_out[:, 2]
    union_size = me_out[:, 2] + model_size
    nonempty = union_size > 0
    jac[:] = 1.0
    jac[nonempty] = n_correct[nonempty] / union_size[nonempty]

    res[m, 0] = 0                                              # rho
    res[m, 1] = m                                              # model h0, h1
    res[m, 2] = np.mean(me_out[:, 1])                          # model size
    res[m, 3] = np.std(me_out[:, 1])/sqrt_reps                 # se of the mean
    res[m, 4] = np.mean(me_out[:, 0])                          # me
    res[m, 5] = np.std(me_out[:, 0])/sqrt_reps
    fsr = me_out[:, 2]/(1 + me_out[:, 1])                      # fsr
    res[m, 6] = np.mean(fsr)
    res[m, 7] = np.std(fsr)/sqrt_reps
    res[m, 8] = np.mean(me_out[:, 3])                          # mse of selected model
    res[m, 9] = np.std(me_out[:, 3])/sqrt_reps
    if model_size > 0:
        csr = n_correct/model_size
    else:
        csr = np.ones(N_REP)
    res[m, 10] = np.mean(csr)
    res[m, 11] = np.std(csr)/sqrt_reps
    res[m, 12] = np.mean(jac)
    res[m, 13] = np.std(jac)/sqrt_reps

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

In [23]:
# Summary table: one row per model, one named column per column of `res`.
# Point estimates come first, followed by their standard errors.
bic1 = pd.DataFrame({
    label: res[:, col]
    for label, col in (
        ('rho', 0), ('H', 1), ('size', 2), ('me', 4),
        ('fsr_mr', 6), ('mse', 8), ('csr', 10), ('jac', 12),
        ('se_size', 3), ('se_me', 5), ('se_fsr', 7), ('se_mse', 9),
        ('se_csr', 11), ('se_jac', 13),
    )
})
print(np.round(bic1, 3))

     H    csr  fsr_mr    jac     me    mse  rho  se_csr  se_fsr  se_jac  \
0  0.0  1.000   0.243  0.560  0.033  0.960  0.0   0.000   0.028   0.050   
1  1.0  1.000   0.132  0.829  0.045  0.989  0.0   0.000   0.015   0.020   
2  2.0  0.572   0.091  0.532  0.088  0.975  0.0   0.013   0.012   0.014   
3  3.0  0.449   0.065  0.431  0.123  1.005  0.0   0.010   0.009   0.010   
4  4.0  0.419   0.036  0.411  0.153  1.016  0.0   0.009   0.007   0.009   

   se_me  se_mse  se_size  size  
0  0.004   0.012    0.088  0.62  
1  0.003   0.013    0.072  2.58  
2  0.005   0.011    0.106  3.95  
3  0.006   0.011    0.128  4.94  
4  0.006   0.013    0.139  6.15  
