In [1]:
import os
import pickle
import random
import itertools
import numpy as np
from sklearn import metrics
from collections import OrderedDict
from sklearn.decomposition import PCA
import synthetic_data_generator as sdg
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Display NumPy arrays in fixed-point notation (no scientific notation),
# rounded to 2 decimals, with rows wrapped at 120 characters.
np.set_printoptions(suppress=True, linewidth=120, precision=2)

In [2]:
# --- Experiment configuration ------------------------------------------------
NUM_Repeats = 5          # number of datasets generated for every setting
size = (10000, 100, 2)   # forwarded to sdg.generate_Y as (N, V, K)
# NOTE(review): presumably N = samples, V = features, K = groups -- confirm
# against synthetic_data_generator.
N, V, K = size

# \epsilon OR \alpha in the paper.
# Kept as an ordered tuple rather than a set: iterating a set yields its
# elements in hash-table order, so the order of `settings_quant` (and of every
# dict keyed by it) would otherwise be an implementation accident.
pr_v = (0.3, 0.6, 0.9)
cardinalities = [9500, 500]  # rows per group, in the order the groups are sliced later

# Each setting is a 1-tuple (pr_v,); itertools.product keeps this shape easy to
# extend with further factors.
settings_quant = list(itertools.product(pr_v))
print("pr_v")
for setting in settings_quant:
    print(setting)


pr_v
(0.3,)
(0.6,)
(0.9,)


In [4]:
# Generate NUM_Repeats synthetic datasets per setting, split each one into
# train / validation / test, and pickle the whole collection.
name = 'B'
features_type = 'Q'
out_dir = 'matrices/'
SEED = 0  # arbitrary value; fixed so Restart & Run All regenerates the same data

repeats = list(range(NUM_Repeats))

# train_test_split draws from NumPy's global RNG when random_state is None, so
# seeding it once here makes every split below reproducible while successive
# repeats still differ from each other.
# NOTE(review): assumes sdg.generate_Y also draws from NumPy's global RNG or
# from the stdlib `random` module -- confirm, otherwise Y is still unseeded.
np.random.seed(SEED)
random.seed(SEED)


def _split_train_val_test(X, L, holdout_size=0.02):
    """Split one group into train / validation / test parts.

    A fraction ``holdout_size`` of the rows is held out and then halved, so
    with the default roughly 98% of the rows go to train and 1% each to
    validation and test.

    Returns
    -------
    tuple
        (X_train, X_val, X_test, L_train, L_val, L_test)
    """
    X_train, X_hold, L_train, L_hold = train_test_split(
        X, L, test_size=holdout_size, shuffle=True)
    X_test, X_val, L_test, L_val = train_test_split(
        X_hold, L_hold, test_size=0.5, shuffle=True)
    return X_train, X_val, X_test, L_train, L_val, L_test


data = {}  # data[setting][repeat] -> dict of the six split arrays
for setting in settings_quant:
    data[setting] = {}
    for repeat in repeats:

        # Yn is returned by the generator but not used in this cell.
        Y, Yn = sdg.generate_Y(N=N, V=V, K=K, pr_v=setting[-1],
                               cardinality=cardinalities,
                               features_type=features_type,
                               V_noise1=V // 2)

        # NOTE(review): assumes the first cardinalities[0] rows of Y form the
        # "good" group and the remaining rows the "bad" group -- confirm
        # against sdg.generate_Y.
        n_good = cardinalities[0]
        Xg = Y[:n_good, :]
        Xb = Y[n_good:, :]
        Lg = np.repeat(1, Xg.shape[0])  # Labels Good
        Lb = np.repeat(0, Xb.shape[0])  # Labels Bad

        # Each group is split separately so both are represented in every
        # part in the same proportion as in the full dataset.
        (Xg_train, Xg_val, Xg_test,
         Lg_train, Lg_val, Lg_test) = _split_train_val_test(Xg, Lg)
        (Xb_train, Xb_val, Xb_test,
         Lb_train, Lb_val, Lb_test) = _split_train_val_test(Xb, Lb)

        X_train = np.concatenate((Xg_train, Xb_train), axis=0)
        X_val = np.concatenate((Xg_val, Xb_val), axis=0)
        X_test = np.concatenate((Xg_test, Xb_test), axis=0)

        L_train = np.concatenate((Lg_train, Lb_train), axis=0)
        L_val = np.concatenate((Lg_val, Lb_val), axis=0)
        L_test = np.concatenate((Lg_test, Lb_test), axis=0)

        data[setting][repeat] = {
            'X_tr': X_train,
            'X_vl': X_val,
            'X_ts': X_test,
            'y_tr': L_train,
            'y_vl': L_val,
            'y_ts': L_test,
        }

# Create the output directory first: without it the open() below raises
# FileNotFoundError only after all datasets have been generated.
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, name + features_type + str(size) + '.pickle')
with open(out_path, 'wb') as fp:
    pickle.dump(data, fp)

In [14]:
# Top-level keys of `data`: one 1-tuple (pr_v,) per generated setting.
data.keys()

dict_keys([(0.3,), (0.6,), (0.9,)])

In [13]:
# Sanity check: for every setting and repeat, print the shapes of the feature
# matrices (X_*) and then of the label vectors (y_*), in train / test /
# validation order.
for setting_key, per_repeat in data.items():
    print(setting_key)
    for repeat_idx, split in per_repeat.items():
        for prefix in ('X', 'y'):
            shapes = [split[prefix + suffix].shape
                      for suffix in ('_tr', '_ts', '_vl')]
            print(repeat_idx, *shapes)

(0.3,)
0 (9800, 100) (100, 100) (100, 100)
0 (9800,) (100,) (100,)
1 (9800, 100) (100, 100) (100, 100)
1 (9800,) (100,) (100,)
2 (9800, 100) (100, 100) (100, 100)
2 (9800,) (100,) (100,)
3 (9800, 100) (100, 100) (100, 100)
3 (9800,) (100,) (100,)
4 (9800, 100) (100, 100) (100, 100)
4 (9800,) (100,) (100,)
(0.6,)
0 (9800, 100) (100, 100) (100, 100)
0 (9800,) (100,) (100,)
1 (9800, 100) (100, 100) (100, 100)
1 (9800,) (100,) (100,)
2 (9800, 100) (100, 100) (100, 100)
2 (9800,) (100,) (100,)
3 (9800, 100) (100, 100) (100, 100)
3 (9800,) (100,) (100,)
4 (9800, 100) (100, 100) (100, 100)
4 (9800,) (100,) (100,)
(0.9,)
0 (9800, 100) (100, 100) (100, 100)
0 (9800,) (100,) (100,)
1 (9800, 100) (100, 100) (100, 100)
1 (9800,) (100,) (100,)
2 (9800, 100) (100, 100) (100, 100)
2 (9800,) (100,) (100,)
3 (9800, 100) (100, 100) (100, 100)
3 (9800,) (100,) (100,)
4 (9800, 100) (100, 100) (100, 100)
4 (9800,) (100,) (100,)
