In [4]:
import numpy as np
import pandas as pd
import os
import shutil
from subprocess import Popen, PIPE
from IPython.display import clear_output
import time

In [999]:
# Interpreter and driver script that every generated SLURM job invokes.
PYPATH = '/home/shibal/anaconda3/envs/jasa/bin/python'
FILEPATH = '/home/shibal/Additive-Models-with-Structured-Interactions/SparseAMsWithInteractions/src/AMsWithInteractionsStrongHierarchy/AMsWithInteractionsStrongHierarchy-Synthetic.py'
# Root directory under which make_bash_file creates the bashes/ and logs/ trees.
PATH = "/pool001/shibal/results-synthetic"

# Experiment settings; make_bash_file forwards each one to the driver script
# as the command-line flag of the same name (--dataset, --version, --r, ...).
dataset = 'large-synthetic-correlated'
version = 31
r = 1.0
Ki = 10
Kij = 6
dist = 'normal'
correlation = 0.1
train_size = 1000
# Selects between the --use_sparse and --no-use_sparse flags.
use_sparse = True

In [1000]:
def make_bash_file(seed, version, dist, r):
    """Write the SLURM batch script for one seed and return its path.

    The script is written to
    ``{PATH}/bashes/{dataset}/{dist}/v{version}_r{r}/seed{seed}.sh`` and the
    matching log directory ``{PATH}/logs/.../seed{seed}`` is created as well.
    Besides its arguments, the function reads the notebook-level settings
    ``PATH``, ``PYPATH``, ``FILEPATH``, ``dataset``, ``correlation``,
    ``train_size``, ``Ki``, ``Kij`` and ``use_sparse``.

    Parameters
    ----------
    seed : int
        Seed forwarded as ``--seed``; also names the script and log files.
    version : int
        Forwarded as ``--version``; part of the output directory name.
    dist : str
        Forwarded as ``--dist``; part of the output directory name.
    r : float
        Forwarded as ``--r``; part of the output directory name.

    Returns
    -------
    str
        Path of the bash file that was written.
    """
    run_tag = f"{dataset}/{dist}/v{version}_r{r}"
    bash_folder_path = f"{PATH}/bashes/{run_tag}"
    bash_file_path = os.path.join(bash_folder_path, "seed{}.sh".format(seed))
    log_path = f"{PATH}/logs/{run_tag}/seed{seed}"
    os.makedirs(bash_folder_path, exist_ok=True)
    os.makedirs(log_path, exist_ok=True)

    # The two variants of the command differ only in this flag.
    sparse_flag = "--use_sparse" if use_sparse else "--no-use_sparse"
    # `|& tee -a` is bash syntax: stdout and stderr are both appended to the log.
    command = (
        f"{PYPATH} -u {FILEPATH}  --dataset {dataset} --correlation {correlation} "
        f"--dist {dist} --seed {seed} --train_size {train_size} --version {version} "
        f"--r {r} --Ki {Ki} --Kij {Kij} {sparse_flag} "
        f"|& tee -a {log_path}/output_{train_size}.txt"
    )

    script_lines = [
        "#!/bin/bash",
        "#SBATCH --cpus-per-task=4",
        "#SBATCH --time=1-00:00",
        "#SBATCH --mem=32G",
        # Alternative partition previously used: sched_mit_sloan_interactive
        "#SBATCH -p sched_mit_sloan_batch",
        "#SBATCH --mail-type=FAIL",
        "#SBATCH --mail-user=shibal@mit.edu",
        f"#SBATCH -o {log_path}/seed{seed}_%j.out",
        f"#SBATCH -e {log_path}/_seed{seed}_%j.err",
        "",
        "module load sloan/python/modules/python-3.6/gurobipy/9.0.1",
        "",
        command,
        "",
    ]
    with open(bash_file_path, "w") as f:
        f.write("\n".join(script_lines) + "\n")
    return bash_file_path


In [1001]:
# One batch script per seed (0..24), all sharing the current version/dist/r.
seeds = np.arange(25)
bash_files = [make_bash_file(seed, version, dist, r) for seed in seeds]

In [1002]:
# Seeds selected for submission; by default every generated seed.
torun = seeds
# torun = range(1,6)
# The submission cell appends the SLURM job id of each accepted job here.
submitted = []
print(len(torun))

25


In [1003]:
# Submit one SLURM job per selected seed, retrying until sbatch accepts it.
exit_code = 1
# NOTE(review): 10000 s (~2.8 h) between retries looks unintentionally long — confirm.
retry_wait_seconds = 10000
for i, seed in enumerate(torun):
    if i % 100 == 0:
        clear_output(wait=True)
    print(i)
    sh = make_bash_file(seed, version, dist, r)
    while True:
        # Pipe stderr too, so a rejected submission shows its reason below.
        process = Popen(["sbatch", sh], stdout=PIPE, stderr=PIPE)
        (output, err) = process.communicate()
        # communicate() has already waited for the process to exit.
        exit_code = process.returncode
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), output, err)
        if exit_code == 0:
            print(sh, "submitted!")
            # sbatch prints "Submitted batch job <id>"; take the numeric token
            # instead of a fixed-width slice so ids of any length are handled.
            numeric_tokens = [tok for tok in output.decode().split() if tok.isdigit()]
            tmp_id = numeric_tokens[0] if numeric_tokens else output.decode().strip()
            print("job id:", tmp_id)
            submitted.append(tmp_id)
            break
        time.sleep(retry_wait_seconds)

0
2023-09-04 00:18:58 b'Submitted batch job 51575669\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v35_r2.0/seed0.sh submitted!
job id: 51575669
1
2023-09-04 00:18:58 b'Submitted batch job 51575670\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v35_r2.0/seed1.sh submitted!
job id: 51575670
2
2023-09-04 00:18:59 b'Submitted batch job 51575671\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v35_r2.0/seed2.sh submitted!
job id: 51575671
3
2023-09-04 00:18:59 b'Submitted batch job 51575672\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v35_r2.0/seed3.sh submitted!
job id: 51575672
4
2023-09-04 00:18:59 b'Submitted batch job 51575673\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v35_r2.0/seed4.sh submitted!
job id: 51575673
5
2023-09-04 00:18:59 b'Submitted batch job 51575674\n' None
/pool001/shibal/results-synth

In [142]:
# command = """/home/shibal/anaconda3/envs/jasa/bin/python -u /home/shibal/Additive-Models-with-Structured-Interactions/SparseAMsWithInteractions/src/AMsWithInteractionsStrongHierarchy/AMsWithInteractionsStrongHierarchy-Synthetic.py  --dataset large-synthetic --dist normal --seed 0 --train_size 200 --version 19 --r 1.0 |& tee -a /pool001/shibal/results-synthetic/logs/large-synthetic/normal/v19_r1.0/seed0/output_200.txt"""

In [143]:
# !{command}

In [144]:
from subprocess import Popen, PIPE

In [71]:
# Cancel a contiguous range of SLURM job ids and report each one scancel accepted.
for job in range(51537984, 51537995):
    process = Popen(['scancel', str(job)], stdout=PIPE)
    output, err = process.communicate()
    exit_code = process.wait()
    if exit_code != 0:
        continue
    print(job, "deleted!")

51537984 deleted!
51537985 deleted!
51537986 deleted!
51537987 deleted!
51537988 deleted!
51537989 deleted!
51537990 deleted!
51537991 deleted!
51537992 deleted!
51537993 deleted!


In [33]:
# Delete the v13 / r1.0 result directory of every seed.
# Uses shutil.rmtree rather than a shell-interpolated `!rm -r`, so the cell is
# plain Python and no string is handed to a shell.
for seed in range(100):
    target = f"/pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed{seed}/AMsWithInteractionsStrongHierarchy/v13/r1.0"
    command = f"""rm -r {target}"""
    print(command)
    try:
        shutil.rmtree(target)
    except FileNotFoundError:
        # Nothing to delete for this seed; report it and continue with the next one.
        print(f"  not found, skipped: {target}")

rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed0/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed1/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed2/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed3/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed4/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed5/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed6/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed7/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed8/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic

rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed78/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed79/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed80/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed81/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed82/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed83/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed84/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed85/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed86/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/hetero

In [1]:
import numpy as np
from sklearn.metrics import f1_score

In [32]:
# Collect, for every (r, seed) pair, the metrics written to Results-HS.txt and
# the recovered supports stored in support_set.npy. Each container maps
# r -> {seed: value}.
rs = [1.0, 1.5, 2.0]
vals = {r: {} for r in rs}
MISE = {r: {} for r in rs}
mains = {r: {} for r in rs}
interactions = {r: {} for r in rs}
features = {r: {} for r in rs}
fprs_main = {r: {} for r in rs}
fnrs_main = {r: {} for r in rs}
f1s_main = {r: {} for r in rs}
fprs_interaction = {r: {} for r in rs}
fnrs_interaction = {r: {} for r in rs}
f1s_interaction = {r: {} for r in rs}
f1s_feature = {r: {} for r in rs}

data = 'large-synthetic-correlated'
train_size = 1000
use_sparse = False
version = 39
p = 500
# Number of truly active features (k) per dataset; the true support is the
# evenly spaced grid np.arange(p/(2k), p, p/k).
support_size_by_dataset = {
    'large-synthetic': 10,
    'large-synthetic-correlated': 10,
    'large-synthetic-correlated-aoas': 50,
}


def _metric(lines, marker):
    """Return the last space-separated token of the first line containing ``marker`` as a float."""
    first_match = [line for line in lines if marker in line][0]
    return float(first_match.split(" ")[-1].split("\n")[0])


for seed in np.arange(25):
    filename = f"/pool001/shibal/results-synthetic/{data}/normal/N_train_{train_size}/seed{seed}/AMsWithInteractionsStrongHierarchy/v{version}"

    for r in rs:
        print(f"====================r: {r}")
        result_dir = filename + f"/r{r}/use_sparse_{use_sparse}"
        try:
            with open(result_dir + "/Results-HS.txt") as file:
                lines = file.readlines()
            # The validation loss sits mid-line ("... val: <x>, ..."), unlike the
            # other metrics, which are the trailing token of their line.
            optimal_line = [line for line in lines if "val:" in line and "Optimal" in line][0]
            val = float(optimal_line.split("val: ")[-1].split(",")[0])
            mise = _metric(lines, "True")
            fpr_main = _metric(lines, "FPR (main)")
            fnr_main = _metric(lines, "FNR (main)")
            f1_main = _metric(lines, "F1 (main)")
            fpr_interaction = _metric(lines, "FPR (interactions)")
            fnr_interaction = _metric(lines, "FNR (interactions)")
            f1_interaction = _metric(lines, "F1 (interactions)")
            vals[r][seed] = val
            MISE[r][seed] = mise
            fprs_main[r][seed] = fpr_main
            fnrs_main[r][seed] = fnr_main
            f1s_main[r][seed] = f1_main
            fprs_interaction[r][seed] = fpr_interaction
            fnrs_interaction[r][seed] = fnr_interaction
            f1s_interaction[r][seed] = f1_interaction
            print("Seed: ", seed, " mise:", mise)

            # The file holds two arrays saved back to back: main effects, then interactions.
            with open(result_dir + "/support_set.npy", 'rb') as f:
                main_set = np.load(f)
                interaction_set = np.load(f)
            mains[r][seed] = main_set
            interactions[r][seed] = interaction_set
            # astype(int): np.unique of an empty list is float64, which cannot index an array.
            feature_set = np.unique(list(main_set) + list(np.unique(interaction_set))).astype(int)
            features[r][seed] = feature_set
            if data in support_size_by_dataset:
                k = support_size_by_dataset[data]
                feature_support_truth = np.zeros(p)
                true_support = np.arange((int)(p/(2*k)), p, (int)(p/k))
                feature_support_truth[true_support] = 1
                feature_support_recovered = np.zeros(p)
                feature_support_recovered[feature_set] = 1
                f1_feature = f1_score(feature_support_truth, feature_support_recovered)
                f1s_feature[r][seed] = f1_feature
        except Exception as err:
            # Still best-effort (missing or unfinished runs are skipped), but the
            # reason is reported instead of being silently discarded.
            print(f"Skipping seed {seed}, r={r}: {err!r}")

Seed:  0  mise: 0.44437926441753733
Seed:  0  mise: 1.5612746190002345
Seed:  0  mise: 1.5206132626236466
Seed:  1  mise: 0.846615004046383
Seed:  1  mise: 0.8487825614001229
Seed:  1  mise: 0.9529952395897495
Seed:  2  mise: 0.3616084699709767
Seed:  2  mise: 0.3655115972583775
Seed:  2  mise: 0.3381658377935022
Seed:  3  mise: 0.6659488505382662
Seed:  3  mise: 0.5865370433719594
Seed:  3  mise: 0.6158818908153421
Seed:  4  mise: 1.4997617221522679
Seed:  4  mise: 1.497891738297371
Seed:  4  mise: 0.3129095954399436
Seed:  5  mise: 1.117129436549082
Seed:  5  mise: 0.8091232404252668
Seed:  5  mise: 0.7808702653229347
Seed:  6  mise: 1.1005210745348442
Seed:  6  mise: 6.613192524075821
Seed:  6  mise: 1.5055842898680842
Seed:  7  mise: 0.7732127696989737
Seed:  7  mise: 0.7529293229640248
Seed:  7  mise: 0.3675942476999313
Seed:  8  mise: 0.6278658157174181
Seed:  8  mise: 1.351502791040904
Seed:  8  mise: 1.4314360383866815
Seed:  9  mise: 2.3298330686865465
Seed:  9  mise: 0.611427

In [33]:
# Turn every {r: {seed: value}} mapping into a seed-by-r DataFrame
# (rows: seeds, columns: r values), keeping the original names.
(
    vals,
    MISE,
    fprs_main,
    fnrs_main,
    f1s_main,
    fprs_interaction,
    fnrs_interaction,
    f1s_interaction,
    f1s_feature,
) = (
    pd.DataFrame(table)
    for table in (
        vals,
        MISE,
        fprs_main,
        fnrs_main,
        f1s_main,
        fprs_interaction,
        fnrs_interaction,
        f1s_interaction,
        f1s_feature,
    )
)
# Filled by the next cell with the selected r for each seed.
best_r = {}
vals

Unnamed: 0,1.0,1.5,2.0
0,0.459503,1.604827,1.579366
1,0.873653,0.8479,0.927063
2,0.719483,0.321864,0.857902
3,0.773646,0.80321,0.816922
4,1.725973,1.578403,0.275414
5,1.179742,0.964011,1.733369
6,1.695719,5.169105,1.082294
7,0.883162,0.837542,0.517783
8,0.679568,1.442084,1.379193
9,2.198216,0.928407,0.945768


In [34]:
# For every seed pick the r with the smallest validation loss, then reduce each
# metric table to an (n_seeds, 1) array holding the value at that seed's best r.

# Exclude a stale "best" column so it can never be chosen as the argmin.
r_columns = [col for col in vals.columns if col != "best"]
# idxmin skips NaNs and resolves ties to the first column.
best_r = vals[r_columns].idxmin(axis=1).to_dict()


def _best_per_seed(frame):
    """Return an (n_rows, 1) float array of ``frame`` values at each seed's best r.

    Rows follow ``frame.index``; a row with no entry in ``best_r`` becomes NaN.
    Raises KeyError when a selected seed is absent from ``frame``.
    """
    picked = pd.Series(
        {seed: frame.loc[seed, r_best] for seed, r_best in best_r.items()},
        dtype=float,
    )
    return picked.reindex(frame.index).to_numpy().reshape(-1, 1)


vals = _best_per_seed(vals)
MISE = _best_per_seed(MISE)
fprs_main = _best_per_seed(fprs_main)
fnrs_main = _best_per_seed(fnrs_main)
f1s_main = _best_per_seed(f1s_main)
fprs_interaction = _best_per_seed(fprs_interaction)
fnrs_interaction = _best_per_seed(fnrs_interaction)
f1s_interaction = _best_per_seed(f1s_interaction)
f1s_feature = _best_per_seed(f1s_feature)

In [35]:
len(MISE)

25

In [36]:
# Mean and standard error (population std / sqrt(n)) of each selected metric.
for label, values in [
    ("val:", vals),
    ("MISE:", MISE),
    ("FPR (main):", fprs_main),
    ("FNR (main):", fnrs_main),
    ("F1 (main):", f1s_main),
    ("FPR (interactions):", fprs_interaction),
    ("FNR (interactions):", fnrs_interaction),
    ("F1 (interactions):", f1s_interaction),
    ("F1 (feature):", f1s_feature),
]:
    std_err = np.std(values) / np.sqrt(len(values))
    print(label, np.mean(values), "std-err:", std_err)


val: 0.63750504 std-err: 0.049357379084079576
MISE: 0.6470681553933149 std-err: 0.056982979149712086
FPR (main): 0.019591836734693877 std-err: 0.0046523450934775525
FNR (main): 0.0 std-err: 0.0
F1 (main): 0.7407959941532567 std-err: 0.03614632061031767
FPR (interactions): 5.451251382853517e-05 std-err: 9.425494833624412e-06
FNR (interactions): 0.395 std-err: 0.03440930106817051
F1 (interactions): 0.5154270296618244 std-err: 0.02963238954486262
F1 (feature): 0.7407959941532567 std-err: 0.03614632061031767


In [37]:
# Average support size at each seed's selected r.
for title, table in (
    ("Number of features: ", features),
    ("Number of main effects: ", mains),
    ("Number of interaction effects: ", interactions),
):
    sizes = [len(table[r_best][int(seed_id)]) for seed_id, r_best in best_r.items()]
    print(title, np.mean(sizes))

Number of features:  19.6
Number of main effects:  19.6
Number of interaction effects:  11.64


In [38]:
best_r

{0: 1.0,
 1: 1.5,
 2: 1.5,
 3: 1.0,
 4: 2.0,
 5: 1.5,
 6: 2.0,
 7: 2.0,
 8: 1.0,
 9: 1.5,
 10: 1.5,
 11: 1.0,
 12: 2.0,
 13: 1.5,
 14: 1.5,
 15: 1.5,
 16: 1.0,
 17: 1.5,
 18: 2.0,
 19: 1.0,
 20: 2.0,
 21: 2.0,
 22: 1.0,
 23: 2.0,
 24: 1.0}

In [843]:
np.median(MISE)

0.09608121729274623

In [844]:
MISE

array([[0.46579411],
       [0.09449551],
       [0.11214856],
       [0.05670834],
       [0.11890736],
       [0.09488586],
       [0.05175347],
       [1.23398696],
       [0.45810677],
       [0.08987189],
       [0.09608122],
       [0.13033255],
       [0.11729706],
       [0.07865387],
       [0.07078335],
       [0.0501786 ],
       [0.11374408],
       [0.18979505],
       [0.04813312],
       [0.05256107],
       [0.0683961 ],
       [0.08928357],
       [0.38408993],
       [0.22460924],
       [0.14151893]])

In [91]:
interactions

{1.0: {1: array([[225, 485],
         [225, 395],
         [ 25, 485],
         [415, 485],
         [225, 285],
         [ 25, 325],
         [105, 375],
         [235, 275],
         [112, 225],
         [105, 265],
         [ 75, 425],
         [352, 415],
         [ 15, 335],
         [ 25, 285],
         [285, 425],
         [225, 305],
         [ 78, 485],
         [275, 285],
         [405, 425],
         [ 15, 315],
         [207, 425],
         [  5, 215],
         [125, 265],
         [175, 465],
         [180, 225],
         [135, 435],
         [ 25,  55],
         [ 93, 395]]),
  5: array([[225, 335],
         [225, 281],
         [175, 225],
         [122, 395],
         [ 35, 419],
         [ 25,  58],
         [225, 285],
         [ 65, 105],
         [225, 255],
         [275, 485],
         [125, 225],
         [150, 225],
         [ 25,  65],
         [175, 295],
         [265, 497],
         [ 25, 295],
         [ 25, 485],
         [ 85, 365],
         [155, 195],


In [32]:
best_r

[Float64Index([1.0], dtype='float64'),
 Index([1.0], dtype='object'),
 Index([1.0], dtype='object'),
 Index([1.5], dtype='object'),
 Index([1.0], dtype='object'),
 Index([1.0], dtype='object'),
 Index([1.0], dtype='object'),
 Index([1.0], dtype='object'),
 Index([1.5], dtype='object'),
 Index([1.0], dtype='object')]

In [570]:
np.mean([len(item) for key, item in features[1.5].items()])

19.08

In [126]:
# Count how often each feature index appears across the recovered main-effect sets.
# NOTE(review): the recorded output is a TypeError. np.concatenate needs a sequence
# of arrays, while the results-parsing cell builds `mains` as a dict keyed by r —
# confirm what `mains` is expected to hold here.
np.unique(np.concatenate(mains).ravel(), return_counts=True)

TypeError: The first input argument needs to be a sequence

In [287]:
# Drop empty interaction sets, then count how often each (i, j) pair (row) occurs.
# NOTE(review): assumes `interactions` is a flat sequence of (n, 2) arrays; the
# results-parsing cell builds a nested {r: {seed: array}} dict instead — confirm.
interactions = [inter for inter in interactions if len(inter)>0]
np.unique(np.concatenate(interactions, axis=0), return_counts=True, axis=0)

(array([[ 25,  75],
        [ 25, 125],
        [ 25, 475],
        [ 75, 125],
        [ 75, 175],
        [ 75, 275],
        [ 75, 425],
        [125, 175],
        [125, 325],
        [125, 425],
        [125, 475],
        [175, 275],
        [175, 375],
        [175, 475],
        [225, 325],
        [225, 425],
        [275, 375],
        [275, 425],
        [325, 375],
        [325, 425],
        [325, 475],
        [375, 425],
        [375, 475],
        [425, 475]]),
 array([24, 22,  1,  1,  1,  1,  1, 24,  1,  1,  2,  1,  6,  2,  2,  1,  3,
         1, 18,  4,  3,  2,  5, 20]))

In [675]:
import pandas as pd

In [744]:
# Load one run's training log, index it by the hyperparameters and the resulting
# effect counts, and keep only the validation-loss column.
df = pd.read_csv("/home/shibal/pool/results-synthetic/large-synthetic-correlated/normal/N_train_10000/seed21/AMsWithInteractionsStrongHierarchy/v24/r2.0/fold0/Training-HS.csv").set_index(['lambda_1','lambda_2','tau','Main-Effects','Interaction-Effects'])[['val']]

In [745]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,val
lambda_1,lambda_2,tau,Main-Effects,Interaction-Effects,Unnamed: 5_level_1
1e-07,100.00,1.000000,0,0,10.011642
1e-07,100.00,0.910298,2,0,6.758968
1e-07,100.00,0.828643,3,0,6.219394
1e-07,100.00,0.754312,3,0,6.219393
1e-07,100.00,0.686649,3,0,6.219393
1e-07,...,...,...,...,...
1e-07,0.01,0.014563,24,25,0.216434
1e-07,0.01,0.013257,27,29,0.219079
1e-07,0.01,0.012068,28,30,0.219108
1e-07,0.01,0.010985,30,33,0.218160


In [746]:
# Rename the single value column to 'val-0' (presumably the fold id — confirm),
# then move the index levels back into ordinary columns.
df.columns = ['val-{}'.format(0)]
dfr = df.reset_index()
dfr

Unnamed: 0,lambda_1,lambda_2,tau,Main-Effects,Interaction-Effects,val-0
0,1.000000e-07,100.00,1.000000,0,0,10.011642
1,1.000000e-07,100.00,0.910298,2,0,6.758968
2,1.000000e-07,100.00,0.828643,3,0,6.219394
3,1.000000e-07,100.00,0.754312,3,0,6.219393
4,1.000000e-07,100.00,0.686649,3,0,6.219393
...,...,...,...,...,...,...
495,1.000000e-07,0.01,0.014563,24,25,0.216434
496,1.000000e-07,0.01,0.013257,27,29,0.219079
497,1.000000e-07,0.01,0.012068,28,30,0.219108
498,1.000000e-07,0.01,0.010985,30,33,0.218160


In [747]:
# Order rows from largest to smallest hyperparameters, re-index by the
# hyperparameters and effect counts, then average across the value columns
# (a Series with one loss per configuration).
dfr = dfr.sort_values(by=['lambda_1','lambda_2','tau'], ascending=False)
dfr = dfr.set_index(['lambda_1','lambda_2','tau','Main-Effects','Interaction-Effects'])
dfr = dfr.mean(axis=1)
dfr

lambda_1      lambda_2  tau       Main-Effects  Interaction-Effects
1.000000e-07  100.00    1.000000  0             0                      10.011642
                        0.910298  2             0                       6.758968
                        0.828643  3             0                       6.219394
                        0.754312  3             0                       6.219393
                        0.686649  3             0                       6.219393
                                                                         ...    
              0.01      0.014563  24            25                      0.216434
                        0.013257  27            29                      0.219079
                        0.012068  28            30                      0.219108
                        0.010985  30            33                      0.218160
                        0.010000  33            36                      0.221670
Length: 500, dtype: float64

In [748]:
# --- Optimal configuration: smallest validation loss overall -----------------
df_opt = dfr[dfr == dfr.min()].reset_index()
display(df_opt)
val_opt = dfr.min()
L0_opt = df_opt['lambda_2'].values[0]
Smoothness_opt = df_opt['lambda_1'].values[0]
tau_opt = df_opt['tau'].values[0]
nnz_main_opt = df_opt['Main-Effects'].values[0]
nnz_interaction_opt = df_opt['Interaction-Effects'].values[0]

# --- Sparsest configuration whose loss is within 10% of the optimum ----------
relative_tolerance = 0.1
dfr = dfr[(dfr - val_opt) < relative_tolerance * val_opt].reset_index()
dfr["Components"] = dfr["Main-Effects"] + dfr['Interaction-Effects']
dfr = dfr[dfr["Components"] == dfr["Components"].min()]
display(dfr)
dfr = dfr.set_index(['lambda_1','lambda_2','tau','Main-Effects','Interaction-Effects','Components'])

# Several configurations can tie on component count. Select on the loss *Series*:
# comparing the DataFrame itself only masks cells to NaN and keeps every row, so
# `.values[0]` would return the first candidate rather than the best one.
sparse_losses = dfr.iloc[:, 0]
df_sp = sparse_losses[sparse_losses == sparse_losses.min()].reset_index()
val_sp = sparse_losses.min()
L0_sp = df_sp['lambda_2'].values[0]
Smoothness_sp = df_sp['lambda_1'].values[0]
tau_sp = df_sp['tau'].values[0]

Unnamed: 0,lambda_1,lambda_2,tau,Main-Effects,Interaction-Effects,0
0,1e-07,35.938137,0.030888,22,23,0.168494


Unnamed: 0,lambda_1,lambda_2,tau,Main-Effects,Interaction-Effects,0,Components
12,1e-07,0.077426,0.086851,10,8,0.176382,18
13,1e-07,0.027826,0.054287,10,8,0.176719,18


In [749]:
Smoothness_sp, L0_sp, tau_sp

(1e-07, 0.07742639999999999, 0.086851)

In [250]:
# True support: k evenly spaced feature indices out of p, centred in their strides.
p, k = 500, 50
increment = int(p / k)
start = int(p / (2 * k))
true_support = np.arange(start, p, increment)
print(true_support)

[  5  15  25  35  45  55  65  75  85  95 105 115 125 135 145 155 165 175
 185 195 205 215 225 235 245 255 265 275 285 295 305 315 325 335 345 355
 365 375 385 395 405 415 425 435 445 455 465 475 485 495]


In [251]:
from itertools import combinations

In [259]:
# Every unordered pair of truly active features is a candidate interaction.
possible_candidates = np.array(list(combinations(true_support, 2)))
# Sample without replacement so no pair is drawn twice; this matches the
# data-generation cell further down, which also uses replace=False.
true_interactions = possible_candidates[np.random.choice(len(possible_candidates), 100, replace=False)]

In [260]:
possible_candidates

array([[  5,  15],
       [  5,  25],
       [  5,  35],
       ...,
       [475, 485],
       [475, 495],
       [485, 495]])

In [272]:
# Draw correlated Gaussian features and noise for the train/validation/test
# splits, and mark the truly active features.
p = 500
k = 50
correlated = True
# sigma[i, j] = 0.5 ** |i - j|, built by broadcasting instead of a p*p Python loop.
feature_positions = np.arange(p)
sigma = 0.5 ** np.abs(feature_positions[:, None] - feature_positions[None, :])

seed = 0
train_size = 1000
test_size = 10000
np.random.seed(seed)

# The validation split is 10% of the training size.
Xtrain = np.random.multivariate_normal(np.zeros(p), sigma, (int)(train_size))
Xval = np.random.multivariate_normal(np.zeros(p), sigma, (int)(0.1*train_size))
Xtest = np.random.multivariate_normal(np.zeros(p), sigma, test_size)
feature_support_truth = np.zeros(p)

# Additive Gaussian noise with standard deviation 0.25, one draw per observation.
errortrain = np.random.normal(loc=0, scale=0.25, size=((int)(train_size),))
errorval = np.random.normal(loc=0, scale=0.25, size=((int)(0.1*train_size),))
errortest = np.random.normal(loc=0, scale=0.25, size=(test_size,))

# True support: k evenly spaced indices, centred in their strides.
increment = (int)(p/k)
start = (int)(p/(2*k))
true_support = np.arange(start,p,increment)
print("True Support: ", true_support)
feature_support_truth[true_support] = 1

def g0(t):
    """Linear basis: 0.5 * t."""
    scaled = 0.5 * t
    return scaled

def g1(t):
    """Sine basis: 1.25 * sin(t)."""
    wave = np.sin(t)
    return 1.25 * wave

def g2(t):
    """Exponential basis: 0.3 * exp(t)."""
    growth = np.exp(t)
    return 0.3 * growth

def g3(t):
    """Quadratic basis: 0.5 * t**2."""
    squared = t ** 2
    return 0.5 * squared

def g4(t):
    """Cosine basis: 0.9 * cos(t)."""
    wave = np.cos(t)
    return 0.9 * wave

# Basis functions, assigned cyclically to the active features by get_f.
bases = [g0, g1, g2, g3, g4]
# Candidate interactions: every unordered pair of truly active features.
possible_candidates = np.array([pair for pair in combinations(true_support, 2)])
# Re-seed so the same 100 true interactions are drawn whatever the data seed.
np.random.seed(42) # force same true interactions
true_interactions = possible_candidates[
    np.random.choice(len(possible_candidates), 100, replace=False)
]

def get_f(x, verbose=True):
    """Evaluate the noiseless response for every row of ``x``.

    The response is the sum of one main effect per feature in ``true_support``
    (the i-th active feature uses ``bases[i % len(bases)]``) plus, for every
    pair in ``true_interactions``, the product of the two features' main-effect
    bases.

    Parameters
    ----------
    x : np.ndarray
        2-D array indexed as ``x[:, feature]``.
    verbose : bool, default True
        When True (the original behaviour), print the basis index assigned to
        each active feature and the basis pair used for each interaction.

    Returns
    -------
    np.ndarray
        1-D array of length ``x.shape[0]`` with the dtype of ``x``.
    """
    f = np.zeros((x.shape[0],), dtype=x.dtype)

    # main effects
    corresponding_bases = {}
    for i, index in enumerate(true_support):
        basis_id = i % len(bases)
        basis = bases[basis_id]
        f += basis(x[:, index])
        if verbose:
            print(basis_id, index)
        # Remembered so interaction terms reuse each feature's main-effect basis.
        corresponding_bases[index] = basis

    # interaction effects
    for term in true_interactions:
        first_basis = corresponding_bases[term[0]]
        second_basis = corresponding_bases[term[1]]
        f += first_basis(x[:, term[0]]) * second_basis(x[:, term[1]])
        if verbose:
            print(term, (first_basis, second_basis))

    return f

# Noiseless signal for each split.
ftrain = get_f(Xtrain)
fval = get_f(Xval)
ftest = get_f(Xtest)


# Observed responses = signal + noise, stored as column vectors.
ytrain = ftrain+errortrain
yval = fval+errorval
ytest = ftest+errortest
ytrain = ytrain.reshape(-1,1)
yval = yval.reshape(-1,1)
ytest = ytest.reshape(-1,1)
num_of_folds = 1

# 0/1 indicator of the true main effects.
main_support_true = np.zeros(p)
main_support_true[true_support] = 1

# All feature pairs (m, n) with m < n. combinations() yields them in the same
# lexicographic order as the nested m/n loops it replaces, without 250k Python
# iterations.
interaction_terms_all = np.array(list(combinations(range(p), 2)))

# 0/1 indicator over interaction_terms_all marking the true interactions.
interaction_support_true = np.zeros((len(interaction_terms_all)))
for term in true_interactions:
    interaction_support_true[(term.reshape(1,-1)==interaction_terms_all).all(axis=1)] = 1


True Support:  [  5  15  25  35  45  55  65  75  85  95 105 115 125 135 145 155 165 175
 185 195 205 215 225 235 245 255 265 275 285 295 305 315 325 335 345 355
 365 375 385 395 405 415 425 435 445 455 465 475 485 495]
0 5
1 15
2 25
3 35
4 45
0 55
1 65
2 75
3 85
4 95
0 105
1 115
2 125
3 135
4 145
0 155
1 165
2 175
3 185
4 195
0 205
1 215
2 225
3 235
4 245
0 255
1 265
2 275
3 285
4 295
0 305
1 315
2 325
3 335
4 345
0 355
1 365
2 375
3 385
4 395
0 405
1 415
2 425
3 435
4 445
0 455
1 465
2 475
3 485
4 495
[145 175] (<function g4 at 0x2aaaec97ec80>, <function g2 at 0x2aaaec97eb70>)
[175 365] (<function g2 at 0x2aaaec97eb70>, <function g1 at 0x2aaaec97eae8>)
[245 465] (<function g4 at 0x2aaaec97ec80>, <function g1 at 0x2aaaec97eae8>)
[385 395] (<function g3 at 0x2aaaec97ebf8>, <function g4 at 0x2aaaec97ec80>)
[ 95 205] (<function g4 at 0x2aaaec97ec80>, <function g0 at 0x2aaaec97e510>)
[ 35 285] (<function g3 at 0x2aaaec97ebf8>, <function g3 at 0x2aaaec97ebf8>)
[405 495] (<function g0 at 0x2

In [277]:
interaction_terms_all[np.where(interaction_support_true)[0]]

array([[  5, 245],
       [  5, 445],
       [  5, 455],
       [ 15,  25],
       [ 15,  45],
       [ 15, 115],
       [ 15, 235],
       [ 15, 295],
       [ 15, 315],
       [ 15, 345],
       [ 25,  75],
       [ 25, 135],
       [ 25, 295],
       [ 25, 345],
       [ 35, 155],
       [ 35, 285],
       [ 35, 345],
       [ 45,  75],
       [ 45, 485],
       [ 55, 115],
       [ 55, 145],
       [ 55, 185],
       [ 55, 305],
       [ 55, 415],
       [ 65, 145],
       [ 65, 205],
       [ 65, 225],
       [ 65, 315],
       [ 65, 375],
       [ 65, 495],
       [ 75, 195],
       [ 75, 305],
       [ 75, 365],
       [ 75, 385],
       [ 75, 415],
       [ 75, 495],
       [ 85, 125],
       [ 95, 105],
       [ 95, 165],
       [ 95, 205],
       [ 95, 405],
       [ 95, 475],
       [105, 135],
       [105, 275],
       [105, 335],
       [105, 445],
       [105, 475],
       [115, 185],
       [125, 195],
       [125, 335],
       [125, 405],
       [135, 225],
       [145,

In [240]:
# Show which basis index (position mod 5) goes with each truly active feature.
for position, feature in enumerate(true_support):
    basis_id = position % 5
    print(basis_id, feature)

0 5
1 15
2 25
3 35
4 45
0 55
1 65
2 75
3 85
4 95
0 105
1 115
2 125
3 135
4 145
0 155
1 165
2 175
3 185
4 195
0 205
1 215
2 225
3 235
4 245
0 255
1 265
2 275
3 285
4 295
0 305
1 315
2 325
3 335
4 345
0 355
1 365
2 375
3 385
4 395
0 405
1 415
2 425
3 435
4 445
0 455
1 465
2 475
3 485
4 495
