In [146]:
import numpy as np
import pandas as pd
import os
import shutil
from subprocess import Popen, PIPE
from IPython.display import clear_output
import time

In [290]:
PYPATH = '/home/shibal/anaconda3/envs/jasa/bin/python'
FILEPATH = '/home/shibal/Additive-Models-with-Structured-Interactions/SparseAMsWithInteractions/src/AMsWithInteractionsL0/AMsWithInteractionsL0-Synthetic.py'
PATH = "/pool001/shibal/results-synthetic"

dataset = 'large-synthetic-correlated'
version = 30
Ki = 10
Kij = 6
r = 1.0
dist = 'normal'
correlation = 0.1
train_size = 10000
num_seeds = 10
init_seed = 0

In [291]:
def make_bash_file(seed, version, dist, r):
    bash_folder_path = f"{PATH}/bashes/{dataset}/{dist}/v{version}_r{r}/train_size_{train_size}"
    bash_file_path = os.path.join(bash_folder_path, f"seed{seed}.sh")
    log_path = f"{PATH}/logs/{dataset}/{dist}/v{version}_r{r}/train_size_{train_size}/seed{seed}"
    os.makedirs(bash_folder_path,exist_ok=True)
    os.makedirs(log_path,exist_ok=True)
    with open(bash_file_path,"w") as f:
        f.write("#!/bin/bash\n")
        f.write("#SBATCH --cpus-per-task=2\n")
        f.write("#SBATCH --time=1-00:00\n")
        f.write("#SBATCH --mem=24G\n")
        f.write("#SBATCH -p sched_mit_sloan_batch\n")
#         f.write("#SBATCH -p sched_mit_sloan_interactive\n")
        f.write("#SBATCH --mail-type=FAIL\n")
        f.write("#SBATCH --mail-user=shibal@mit.edu\n")
        f.write(f"#SBATCH -o {log_path}/seed{seed}_%j.out\n")
        f.write(f"#SBATCH -e {log_path}/_seed{seed}_%j.err\n\n")
        f.write("module load sloan/python/modules/python-3.6/gurobipy/9.0.1\n\n")
        f.write(f"{PYPATH} -u {FILEPATH}  --dataset {dataset} --dist {dist} --correlation {correlation} --seed {seed} --train_size {train_size} --version {version} --r {r} --Ki {Ki} --Kij {Kij} |& tee -a {log_path}/output_{train_size}.txt\n\n") 
    return bash_file_path


In [292]:
bash_files = []
seeds = np.arange(init_seed, init_seed+num_seeds)
for seed in seeds:
    bash_files.append(make_bash_file(seed, version, dist, r))

In [293]:
torun = seeds
# torun = range(1,6)
submitted = []
print(len(torun))

1


In [294]:
exit_code = 1
for i, seed in enumerate(seeds):
    if i % 100 == 0:
        clear_output(wait=True)
    print(i)
    sh = make_bash_file(seed, version, dist, r)
    while True:
        process = Popen(["sbatch",sh], stdout=PIPE)
        (output, err) = process.communicate()
        exit_code = process.wait()
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),output,err)
        if exit_code == 0:
            print(sh,"submitted!")
            tmp_id = str(output)[-11:-3]
            print("job id:", tmp_id)
            submitted.append(tmp_id)
            break
        time.sleep(10000)

0
2023-08-31 22:58:13 b'Submitted batch job 51542942\n' None
/pool001/shibal/results-synthetic/bashes/large-synthetic-correlated/normal/v27_r1.0/train_size_10000/seed9.sh submitted!
job id: 51542942


In [142]:
# command = """/home/shibal/anaconda3/envs/jasa/bin/python -u /home/shibal/Additive-Models-with-Structured-Interactions/SparseAMsWithInteractions/src/AMsWithInteractionsL0/AMsWithInteractionsL0-Synthetic.py  --dataset large-synthetic --dist normal --seed 0 --train_size 200 --version 19 --r 1.0 |& tee -a /pool001/shibal/results-synthetic/logs/large-synthetic/normal/v19_r1.0/seed0/output_200.txt"""

In [143]:
# !{command}

In [7]:
from subprocess import Popen, PIPE

In [39]:
for job in range(51530768, 51530789):
    process = Popen(['scancel',str(job)], stdout=PIPE)
    (output, err) = process.communicate()
    exit_code = process.wait()
    if exit_code ==0:
        print(job, "deleted!")

51530768 deleted!
51530769 deleted!
51530770 deleted!
51530771 deleted!
51530772 deleted!
51530773 deleted!
51530774 deleted!
51530775 deleted!
51530776 deleted!
51530777 deleted!
51530778 deleted!
51530779 deleted!
51530780 deleted!
51530781 deleted!
51530782 deleted!
51530783 deleted!
51530784 deleted!
51530785 deleted!
51530786 deleted!
51530787 deleted!
51530788 deleted!


In [33]:
for seed in range(100):
    command = f"""rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed{seed}/AMsWithInteractionsL0/v13/r1.0"""
    print(command)
    !{command}

rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed0/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed1/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed2/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed3/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed4/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed5/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed6/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed7/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed8/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic

rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed78/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed79/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed80/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed81/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed82/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed83/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed84/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed85/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/heteroskedastic/N_train_400/seed86/AMsWithInteractionsL0/v13/r1.0
rm -r /pool001/shibal/results-synthetic/hetero

In [3]:
import numpy as np
from sklearn.metrics import f1_score

In [327]:
rs = [1.0, 1.5, 2.0]
vals = {1.0: {}, 1.5: {}, 2.0: {}}
MISE = {1.0: {}, 1.5: {}, 2.0: {}}
mains = {1.0: {}, 1.5: {}, 2.0: {}}
interactions = {1.0: {}, 1.5: {}, 2.0: {}}
features = {1.0: {}, 1.5: {}, 2.0: {}}
fprs_main = {1.0: {}, 1.5: {}, 2.0: {}}
fnrs_main = {1.0: {}, 1.5: {}, 2.0: {}}
f1s_main = {1.0: {}, 1.5: {}, 2.0: {}}
fprs_interaction = {1.0: {}, 1.5: {}, 2.0: {}}
fnrs_interaction = {1.0: {}, 1.5: {}, 2.0: {}}
f1s_interaction = {1.0: {}, 1.5: {}, 2.0: {}}
f1s_feature = {1.0: {}, 1.5: {}, 2.0: {}}

data = 'large-synthetic-correlated'
train_size = 1000
version = 26
for seed in np.arange(25):
    filename = f"/pool001/shibal/results-synthetic/{data}/normal/N_train_{train_size}/seed{seed}/AMsWithInteractionsL0/v{version}"
    
    for r in rs:
        print(f"====================r: {r}")
        try:
            with open(filename+f"/r{r}/Results.txt") as file:
                lines = file.readlines()
                val = float([line for line in lines if "val:" in line and "Optimal" in line][0].split("val: ")[-1].split(",")[0])
                mise = float([line for line in lines if "True" in line][0].split(" ")[-1].split("\n")[0])
                fpr_main = float([line for line in lines if "FPR (main)" in line][0].split(" ")[-1].split("\n")[0])
                fnr_main = float([line for line in lines if "FNR (main)" in line][0].split(" ")[-1].split("\n")[0])
                f1_main = float([line for line in lines if "F1 (main)" in line][0].split(" ")[-1].split("\n")[0])
                fpr_interaction = float([line for line in lines if "FPR (interactions)" in line][0].split(" ")[-1].split("\n")[0])
                fnr_interaction = float([line for line in lines if "FNR (interactions)" in line][0].split(" ")[-1].split("\n")[0])
                f1_interaction = float([line for line in lines if "F1 (interactions)" in line][0].split(" ")[-1].split("\n")[0])
                vals[r][seed] = val
                MISE[r][seed] = mise
                fprs_main[r][seed] = fpr_main
                fnrs_main[r][seed] = fnr_main
                f1s_main[r][seed] = f1_main
                fprs_interaction[r][seed] = fpr_interaction
                fnrs_interaction[r][seed] = fnr_interaction
                f1s_interaction[r][seed] = f1_interaction
                print("Seed: ", seed, " mise:", mise)

            with open(filename+f"/r{r}/support_set.npy", 'rb') as f:
                main_set = np.load(f)
                interaction_set = np.load(f)
                mains[r][seed] = main_set
                interactions[r][seed] = interaction_set
                feature_set = np.unique(list(main_set)+list(np.unique(interaction_set)))
                features[r][seed] = feature_set
                if data in ['large-synthetic', 'large-synthetic-correlated']:
                    p = 500
                    k = 10
                    feature_support_truth = np.zeros(p)
                    true_support = np.arange((int)(p/(2*k)),p,(int)(p/k))
                    feature_support_truth[true_support] = 1
                    feature_support_recovered = np.zeros(p)
                    feature_support_recovered[feature_set] = 1
                    f1_feature = f1_score(feature_support_truth, feature_support_recovered)
                    f1s_feature[r][seed] = f1_feature
                elif data in ['large-synthetic-correlated-aoas']:
                    p = 500
                    k = 50
                    feature_support_truth = np.zeros(p)
                    true_support = np.arange((int)(p/(2*k)),p,(int)(p/k))
                    feature_support_truth[true_support] = 1
                    feature_support_recovered = np.zeros(p)
                    feature_support_recovered[feature_set] = 1
                    f1_feature = f1_score(feature_support_truth, feature_support_recovered)
                    f1s_feature[r][seed] = f1_feature
        except:
            pass

Seed:  0  mise: 0.5393697160505195
Seed:  0  mise: 0.4319823835259618
Seed:  0  mise: 0.35456598252436805
Seed:  1  mise: 0.6414779469769012
Seed:  1  mise: 0.4378220214172392
Seed:  1  mise: 0.25183294371762016
Seed:  2  mise: 0.4789501975236292
Seed:  2  mise: 0.5023752947442534
Seed:  2  mise: 0.5288233853453764
Seed:  3  mise: 0.5442494873471212
Seed:  3  mise: 0.5000601841849852
Seed:  3  mise: 0.5416363932223489
Seed:  4  mise: 0.6520175903628073
Seed:  4  mise: 0.8512488542127546
Seed:  4  mise: 0.8056728460878708
Seed:  5  mise: 0.45440530053253586
Seed:  5  mise: 0.43475896247773355
Seed:  5  mise: 0.29168185484066694
Seed:  6  mise: 1.59165026801604
Seed:  6  mise: 0.6120744490832223
Seed:  6  mise: 0.5718768240246893
Seed:  7  mise: 0.3539534580081315
Seed:  7  mise: 0.47300210526696657
Seed:  7  mise: 0.302630282341569
Seed:  8  mise: 0.8004043275867982
Seed:  8  mise: 0.3570268316628754
Seed:  8  mise: 0.3647183587495415
Seed:  9  mise: 0.4786297833462445
Seed:  9  mise: 0

In [328]:
vals = pd.DataFrame(vals)
display(vals)
MISE = pd.DataFrame(MISE)
fprs_main = pd.DataFrame(fprs_main)
fnrs_main = pd.DataFrame(fnrs_main)
f1s_main = pd.DataFrame(f1s_main)
fprs_interaction = pd.DataFrame(fprs_interaction)
fnrs_interaction = pd.DataFrame(fnrs_interaction)
f1s_interaction = pd.DataFrame(f1s_interaction)
f1s_feature = pd.DataFrame(f1s_feature)
best_r = {}

for index in vals.index:
    r = vals.loc[index][vals.loc[index]==vals.loc[index].min()].index
    best_r[index] = r[0]
    vals.loc[index, "best"] = np.array(vals.loc[index, r])[0]
    MISE.loc[index, "best"] = np.array(MISE.loc[index, r])[0]
    fprs_main.loc[index, "best"] = np.array(fprs_main.loc[index, r])[0]
    fnrs_main.loc[index, "best"] = np.array(fnrs_main.loc[index, r])[0]
    f1s_main.loc[index, "best"] = np.array(f1s_main.loc[index, r])[0]
    fprs_interaction.loc[index, "best"] = np.array(fprs_interaction.loc[index, r])[0]
    fnrs_interaction.loc[index, "best"] = np.array(fnrs_interaction.loc[index, r])[0]
    f1s_interaction.loc[index, "best"] = np.array(f1s_interaction.loc[index, r])[0]
    f1s_feature.loc[index, "best"] = np.array(f1s_feature.loc[index, r])[0]

vals = vals[["best"]].values
MISE = MISE[["best"]].values
fprs_main = fprs_main[["best"]].values
fnrs_main = fnrs_main[["best"]].values
f1s_main = f1s_main[["best"]].values
fprs_interaction = fprs_interaction[["best"]].values
fnrs_interaction = fnrs_interaction[["best"]].values
f1s_interaction = f1s_interaction[["best"]].values
f1s_feature = f1s_feature[["best"]].values

Unnamed: 0,1.0,1.5,2.0
0,0.412997,0.403774,0.348254
1,0.651774,0.670382,0.576583
2,0.444973,0.459593,0.443418
3,1.088797,1.072816,1.089403
4,0.315565,0.522892,0.440825
5,0.344888,0.337047,0.304718
6,2.800724,1.424542,1.346226
7,0.442155,0.571197,0.469855
8,1.323556,0.931974,1.063618
9,0.407547,0.332049,0.273934


In [329]:
len(MISE)

10

In [330]:
print("val:", np.mean(vals), "std-err:", np.std(vals)/np.sqrt(len(vals)))
print("MISE:", np.mean(MISE), "std-err:", np.std(MISE)/np.sqrt(len(MISE)))
print("FPR (main):", np.mean(fprs_main), "std-err:", np.std(fprs_main)/np.sqrt(len(fprs_main)))
print("FNR (main):", np.mean(fnrs_main), "std-err:", np.std(fnrs_main)/np.sqrt(len(fnrs_main)))
print("F1 (main):", np.mean(f1s_main), "std-err:", np.std(f1s_main)/np.sqrt(len(f1s_main)))
print("FPR (interactions):", np.mean(fprs_interaction), "std-err:", np.std(fprs_interaction)/np.sqrt(len(fprs_interaction)))
print("FNR (interactions):", np.mean(fnrs_interaction), "std-err:", np.std(fnrs_interaction)/np.sqrt(len(fnrs_interaction)))
print("F1 (interactions):", np.mean(f1s_interaction), "std-err:", np.std(f1s_interaction)/np.sqrt(len(f1s_interaction)))
print("F1 (feature):", np.mean(f1s_feature), "std-err:", np.std(f1s_feature)/np.sqrt(len(f1s_feature)))


val: 0.6055642999999999 std-err: 0.11305204427705408
MISE: 0.42600983804559195 std-err: 0.03906628355291451
FPR (main): 0.0 std-err: 0.0
FNR (main): 0.27999999999999997 std-err: 0.10178408519999578
F1 (main): 0.7847452856740783 std-err: 0.08950325360472151
FPR (interactions): 5.611582305875551e-06 std-err: 2.5476982222633378e-06
FNR (interactions): 0.3375 std-err: 0.05604127942864974
F1 (interactions): 0.7448107448107447 std-err: 0.04666511403331998
F1 (feature): 0.990909090909091 std-err: 0.008624393618641035


In [324]:
print("Number of features: ", np.mean([len(features[r][(int)(seed)]) for seed, r in best_r.items()]))
print("Number of main effects: ", np.mean([len(mains[r][(int)(seed)]) for seed, r in best_r.items()]))
print("Number of interaction effects: ", np.mean([len(interactions[r][(int)(seed)]) for seed, r in best_r.items()]))

Number of features:  10.1
Number of main effects:  7.3
Number of interaction effects:  8.3


In [325]:
MISE

array([[0.04917082],
       [0.06657216],
       [0.10749127],
       [0.09214344],
       [0.07691014],
       [0.11202801],
       [0.14267347],
       [0.11816495],
       [0.15066906],
       [0.13803617]])

In [113]:
np.median(MISE)

0.35089365276967033

In [11]:
best_r

[Int64Index([0], dtype='int64'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object'),
 Index([0], dtype='object')]

In [371]:
np.unique(np.concatenate(mains).ravel(), return_counts=True)

TypeError: The first input argument needs to be a sequence

In [29]:
interactions = [inter for inter in interactions if len(inter)>0]
np.unique(np.concatenate(interactions, axis=0), return_counts=True, axis=0)

(array([[ 25,  75],
        [ 25, 125],
        [ 75, 125],
        [ 75, 175],
        [125, 175],
        [125, 375],
        [125, 425],
        [175, 375],
        [175, 425],
        [325, 375],
        [375, 425],
        [425, 475]]),
 array([15, 15,  1,  1, 15,  1,  1,  1,  1, 12,  1, 10]))

In [436]:
from sklearn.metrics import f1_score

In [439]:
f1_score([1,1,1,1,0,0,0,0,0,0], [1,1,1,0,0,1,1,0,0,0])

0.6666666666666665