In [1]:
import os
import numpy as np
from os.path import join

In [None]:
# Specification of every synthetic dataset, keyed by dataset name.
# 'mean_val' / 'std_val' hold one list per class (two classes here), each with
# one entry per original dimension (30). Class means are random draws rounded
# to 3 decimals and scaled by +2 (first class) and -2 (second class).
dataset_spec = {
    'dataset_1': dict(
        original_dims=30,
        output_dims=2,
        max_additional_dims=50,
        mean_val=[list(scale_ * np.round(np.random.randn(30), decimals=3))
                  for scale_ in (2, -2)],
        std_val=[list(0.5 * np.ones(30)) for _ in range(2)],
        noise='gaussian',
        noise_mean=0.,
        noise_sigma=0.5,
        n_samples_per_class=5000,
    ),
}

In [2]:

class Hyperparameters:
    """Training hyper-parameters of one experiment.

    Every hyper-parameter is a keyword of ``__init__`` with a default value, so
    new ones should be added there: settings read back from a json are then
    brought up to date with the latest code.
    """

    def __init__(self,
                 learning_rate=5e-2,
                 architecture=None,
                 epochs=500,
                 batch_size=10,
                 loss='cross_entropy',
                 optimizer='sgd',
                 lr_at_plateau=True,
                 reduction_factor=None,
                 validation_check=True):
        """
        Store the given values as attributes of the same name.

        :param learning_rate: float, the initial value for the learning rate
        :param architecture: str, the architecture type
        :param epochs: int, the number of epochs we want to train
        :param batch_size: int, the size of a batch
        :param loss: str, loss type, cross entropy or square loss
        :param optimizer: str, the optimizer type
        :param lr_at_plateau: bool, protocol to decrease the learning rate
        :param reduction_factor: int, the factor used to reduce the learning rate
        :param validation_check: bool, whether the validation loss is tracked
            as a stopping criterion
        """
        # Model and optimisation settings.
        self.learning_rate, self.architecture = learning_rate, architecture
        self.epochs, self.batch_size = epochs, batch_size
        self.loss, self.optimizer = loss, optimizer
        # Learning-rate schedule and stopping criterion.
        self.lr_at_plateau, self.reduction_factor = lr_at_plateau, reduction_factor
        self.validation_check = validation_check


class Dataset:
    """Dataset-related settings of one experiment.

    Holds the scenario (how the original data is modified), the input and
    output dimensionalities, the class statistics and the noise settings.
    NOTE(review): the original note said this layout was written for the
    modified MNIST dataset and should be re-checked for other datasets.
    """

    def __init__(self,
                 scenario=1,
                 original_dims=30,
                 output_dims=2,
                 additional_dims=2,
                 mean_val=None,
                 std_val=None,
                 noise='gaussian',
                 noise_mean=0.,
                 noise_sigma=0.5,
                 n_training=10,
                 redundancy_amount=None):
        """
        Store the given values as attributes of the same name.

        :param scenario: int, the learning paradigm
        :param original_dims: int, number of original input dimensions
        :param output_dims: int, dimensionality of the output
        :param additional_dims: int, number of dimensions added on top of the
            original ones
        :param mean_val: mean values of the features (presumably one sequence
            per class -- confirm against the dataset specification)
        :param std_val: standard deviations of the features (presumably one
            sequence per class -- confirm against the dataset specification)
        :param noise: str or None
        :param noise_mean: int or np.array
        :param noise_sigma: int or np.array
        :param n_training: int, number of training examples
        :param redundancy_amount: share of redundant features among the
            additional ones, scenario 4 only
        """
        # Learning paradigm and dimensionalities.
        self.scenario, self.original_dims = scenario, original_dims
        self.output_dims, self.additional_dims = output_dims, additional_dims
        # Class statistics.
        self.mean_val, self.std_val = mean_val, std_val
        # Noise settings.
        self.noise, self.noise_mean, self.noise_sigma = noise, noise_mean, noise_sigma
        # Sample size and redundancy share.
        self.n_training, self.redundancy_amount = n_training, redundancy_amount


class Experiment:
    """
    One experiment: its index and output location, its training status, and
    the settings objects (hyper-parameters and dataset) that describe it.
    IF YOU ADD ANOTHER SETTINGS CLASS, MAKE SURE TO INCLUDE IT HERE.
    """

    def __init__(self,
                 id,
                 output_path,
                 train_completed=False,
                 hyper=None,
                 dataset=None):
        """
        :param id: index of output data folder
        :param output_path: output directory
        :param train_completed: bool, whether the experiment has already been trained
        :param hyper: instance of Hyperparameters class; a default one is
            created when None
        :param dataset: instance of Dataset class; a default one is created
            when None
        """
        self.id = id
        self.output_path = output_path
        self.train_completed = train_completed
        # Only None triggers the default: other falsy values are kept as given.
        self.hyper = hyper if hyper is not None else Hyperparameters()
        self.dataset = dataset if dataset is not None else Dataset()

In [3]:
# Experiment 0 with default Hyperparameters and Dataset (neither is passed);
# './exp_output' is its output directory.
exp = Experiment(id=0, output_path='./exp_output')

In [4]:
# The default Dataset uses additional_dims=2.
exp.dataset.additional_dims

2

In [22]:
class DatasetGenerator:
    """ Generate, store, load and augment a synthetic Gaussian classification dataset.

    The "minimal" dataset contains the original features only. For every split
    in ``splits_lst`` it is made of

    * ``X``: array (original_dims, n_samples), Gaussian features whose mean and
      standard deviation depend on the class; samples are ordered by class,
    * ``y``: array (output_dims, n_samples), one-hot labels,
    * ``N``: array (max_additional_dims, n_samples), standard normal noise rows,

    plus ONE matrix ``A`` of shape (max_additional_dims, original_dims) shared
    by all the splits. Sharing ``A`` avoids having three different linear
    transformations for the training, validation, and test splits.

    Layout on disk: ``<data_path>/A.npy`` and ``<data_path>/<split>/{X,y,N}.npy``.

    The experiment (``exp.dataset``) selects how ``additional_dims`` rows are
    appended to the minimal data, scenarios = [1, 2, 4]:

    * 1: noise rows (``add_gaussian_noise``),
    * 2: redundant rows ``A @ X`` (``add_redundancy``),
    * 4: both, shared according to ``redundancy_amount`` (``add_mixture``).
    """
    def __init__(self,
                 data_path=None,
                 load=False,
                 dct_dataset=None,  # dataset_spec['dataset_1']
                 exp=None):
        """
        Generate (and save) or load the minimal dataset for a supervised learning task.

        :param data_path: str, folder where the arrays are saved / loaded from
        :param load: bool, if True we load the data already generated
        :param dct_dataset: dict, containing the meaningful information to generate the
            dataset. Keys read here: 'original_dims', 'output_dims',
            'max_additional_dims', 'n_samples_per_class', 'mean_val', 'std_val'.
            When given and ``load`` is False, the data is generated and saved.
        :param exp: experiment object; ``exp.dataset`` provides scenario,
            additional_dims, n_training and redundancy_amount
        :raises ValueError: if data must be saved or loaded and ``data_path`` is None
        """
        self.data_path = data_path
        self.load = load
        self.dct_dataset = dct_dataset
        self.exp = exp

        self.minimal_dataset = False
        self.splits_lst = ['train', 'validation', 'test']

        # Both generating (which saves to disk) and loading need a folder.
        if self.data_path is None and (self.load or self.dct_dataset is not None):
            raise ValueError("You need to provide a path to the dataset")

        if self.dct_dataset is not None:
            self.p = self.dct_dataset['original_dims']
            self.K = self.dct_dataset['output_dims']
            self.max_add_p = self.dct_dataset['max_additional_dims']
            self.N_per_class = self.dct_dataset['n_samples_per_class']
            self.N = self.K * self.N_per_class
            self.mu_array = np.array(self.dct_dataset['mean_val'])
            self.sigma_array = np.array(self.dct_dataset['std_val'])

            if not self.load:
                # Sets minimal_dataset and the *_splits attributes.
                self.save_minimal_data()

        if self.load:
            self.load_minimal_data()
            self.minimal_dataset = True

    @property
    def minimal_data(self):
        """ Backward-compatible alias of ``minimal_dataset``. """
        return self.minimal_dataset

    @minimal_data.setter
    def minimal_data(self, value):
        self.minimal_dataset = value

    def _check_ready(self, need_exp=False):
        """ Raise ValueError unless the minimal data (and, if asked, the experiment) is available. """
        if not self.minimal_dataset:
            raise ValueError("Generate the dataset first")
        if need_exp and self.exp is None:
            raise ValueError("An experiment (exp) is required for this operation")

    @staticmethod
    def _first_rows(matrix, n_rows, what):
        """ Return the first n_rows rows of matrix, refusing requests it cannot satisfy. """
        if n_rows < 0 or n_rows > matrix.shape[0]:
            raise ValueError("Requested %d %s, but only %d are available"
                             % (n_rows, what, matrix.shape[0]))
        return matrix[:n_rows]

    def _generate_minimal_data(self):
        """ Here we generate the data by using the relevant features only.
        Each feature is Gaussian distributed. Mean and standard
        deviation for each variable vary depending on the user specification.

        The generic i-th feature is x_i
                    x_i = mean_i + N(0,1) * std_i, x_i in R^n_samples

        Samples are stored class after class (N_per_class columns each) and
        the labels are one-hot encoded. Results go to ``self.X`` (p, N) and
        ``self.y`` (K, N).

        :return: self
        :raises ValueError: if mean / std arrays do not match (K, p)
        """
        mu = np.asarray(self.mu_array, dtype=float)
        sigma = np.asarray(self.sigma_array, dtype=float)
        if mu.shape[:1] != (self.K,) or sigma.shape[:1] != (self.K,):
            raise ValueError("Arrays inconsistent with the number of classes")
        mu = mu.reshape(self.K, -1)
        sigma = sigma.reshape(self.K, -1)
        if mu.shape[1] != self.p or sigma.shape[1] != self.p:
            raise ValueError("Arrays inconsistent with the number of input dimensions")

        n_k = self.N_per_class
        X_ = np.zeros((self.p, self.N))
        y_ = np.zeros((self.K, self.N))
        for k_ in range(self.K):  # for each class
            first_, last_ = k_ * n_k, (k_ + 1) * n_k
            # One row of draws per feature, scaled and shifted feature-wise.
            X_[:, first_:last_] = (mu[k_][:, np.newaxis]
                                   + np.random.randn(self.p, n_k) * sigma[k_][:, np.newaxis])
            y_[k_, first_:last_] = 1

        self.y = y_
        self.X = X_
        self.minimal_dataset = True

        return self

    def load_minimal_data(self):
        """ Load A and, for every split, X, y and the noise rows from ``data_path``.

        Fills ``self.A``, ``self.X_splits``, ``self.y_splits`` and
        ``self.noise_splits`` (lists ordered as ``splits_lst``).
        """
        self.A = np.load(join(self.data_path, 'A.npy'))
        X_splits, y_splits, noise_splits = [], [], []
        for fold_ in self.splits_lst:
            X_splits.append(np.load(join(self.data_path, fold_, 'X.npy')))
            y_splits.append(np.load(join(self.data_path, fold_, 'y.npy')))
            noise_splits.append(np.load(join(self.data_path, fold_, 'N.npy')))
        self.X_splits = X_splits
        self.y_splits = y_splits
        self.noise_splits = noise_splits

    def save_minimal_data(self):
        """ Generate A and the minimal data of every split, and save them under ``data_path``.

        The generated arrays are also kept in memory (``self.A``,
        ``self.X_splits``, ``self.y_splits``, ``self.noise_splits``) so the
        object can be used without reloading. ``self.X``, ``self.y`` and
        ``self.noise`` hold the arrays of the last split.

        :raises ValueError: if ``data_path`` is None
        """
        if self.data_path is None:
            raise ValueError("You need to provide a path to the dataset")
        # The root folder must exist before A.npy is written into it.
        os.makedirs(self.data_path, exist_ok=True)

        self.A = np.random.randn(self.max_add_p, self.p)
        np.save(join(self.data_path, 'A.npy'), self.A)

        X_splits, y_splits, noise_splits = [], [], []
        for fold_ in self.splits_lst:
            fold_data = join(self.data_path, fold_)
            os.makedirs(fold_data, exist_ok=True)
            self._generate_minimal_data()  # fresh self.X / self.y arrays

            self.noise = np.random.randn(self.max_add_p, self.N)
            np.save(join(fold_data, 'X.npy'), self.X)
            np.save(join(fold_data, 'y.npy'), self.y)
            np.save(join(fold_data, 'N.npy'), self.noise)

            X_splits.append(self.X)
            y_splits.append(self.y)
            noise_splits.append(self.noise)

        self.X_splits = X_splits
        self.y_splits = y_splits
        self.noise_splits = noise_splits

    def add_redundancy(self):
        """ We add redundancy to the dataset.
        The first ``exp.dataset.additional_dims`` rows of A are used to append
        linear combinations of the input features to every split.
        ``self.A`` itself is left untouched.

        :return: (list of augmented X per split, self.y_splits)
        :raises ValueError: if data / experiment are missing or A has too few rows
        """
        self._check_ready(need_exp=True)
        A_ = self._first_rows(self.A, self.exp.dataset.additional_dims,
                              "redundant features")
        X_splits_ = [np.vstack((x_data_, np.dot(A_, x_data_)))
                     for x_data_ in self.X_splits]
        return X_splits_, self.y_splits

    def add_gaussian_noise(self):
        """ We add noisy features to the dataset.
        The first ``exp.dataset.additional_dims`` stored noise rows are
        appended to the original features of every split.

        :return: (list of augmented X per split, self.y_splits)
        :raises ValueError: if data / experiment are missing or too few noise rows exist
        """
        self._check_ready(need_exp=True)
        n_noise_feat = self.exp.dataset.additional_dims
        X_splits_ = []
        for x_data_, n_data_ in zip(self.X_splits, self.noise_splits):
            noise_rows_ = self._first_rows(n_data_, n_noise_feat, "noisy features")
            X_splits_.append(np.vstack((x_data_, noise_rows_)))
        return X_splits_, self.y_splits

    def add_mixture(self, n_noise_feat, n_rdndt_feat):
        """ With this call we add noisy features and redundant features.
        Row order of each augmented X: original features, then
        ``n_noise_feat`` noise rows, then ``n_rdndt_feat`` redundant rows.
        ``self.A`` itself is left untouched.

        :param n_noise_feat: int, number of noise rows to append
        :param n_rdndt_feat: int, number of redundant rows to append
        :return: (list of augmented X per split, self.y_splits)
        :raises ValueError: if data is missing or more rows are requested than stored
        """
        self._check_ready()
        A_ = self._first_rows(self.A, n_rdndt_feat, "redundant features")
        X_splits_ = []  # we have the three splits
        for x_data_, n_data_ in zip(self.X_splits, self.noise_splits):
            noise_rows_ = self._first_rows(n_data_, n_noise_feat, "noisy features")
            X_splits_.append(np.vstack((x_data_, noise_rows_, np.dot(A_, x_data_))))
        return X_splits_, self.y_splits

    def _get_n_train_elements_per_class(self):
        """ Consider a fixed amount of training data.
        Returns the column indices of the first ``exp.dataset.n_training``
        samples of every class in the training split (samples are stored
        class after class, with the same number of columns per class).

        :return: np.array of int indices
        :raises ValueError: if data / experiment are missing, or n_training
            exceeds the number of samples per class
        """
        if not self.minimal_dataset or self.exp is None:
            raise ValueError("Generate the dataset first")
        n_s_per_class = self.exp.dataset.n_training

        y_tr = self.y_splits[0]  # (k, n)
        n_classes, n_samples = y_tr.shape
        class_size = n_samples // n_classes if n_classes else 0
        if n_s_per_class > class_size:
            # Otherwise the indices would run into the next class.
            raise ValueError("n_training (%d) exceeds the samples per class (%d)"
                             % (n_s_per_class, class_size))

        blocks = [np.array([], dtype=int)]
        blocks.extend(np.arange(k * class_size, k * class_size + n_s_per_class)
                      for k in range(n_classes))
        return np.concatenate(blocks)

    def generate_input_experiment(self):
        """ Generate the dataset (X, y) for a specific experiment.
        Dispatches on ``exp.dataset.scenario`` (1: noise, 2: redundancy,
        4: mixture). In scenario 4 the redundant rows are
        round(additional_dims * redundancy_amount) and the noise rows are the
        remainder, so the two always sum to additional_dims.

        :return: (list of augmented X per split, self.y_splits)
        :raises ValueError: for a missing dataset / experiment, an unknown
            scenario, or a redundancy_amount outside [0, 1]
        """
        self._check_ready(need_exp=True)
        scenario = self.exp.dataset.scenario

        if scenario == 1:
            return self.add_gaussian_noise()

        if scenario == 2:
            return self.add_redundancy()

        if scenario == 4:
            r_ = self.exp.dataset.redundancy_amount
            if r_ is None or not 0 <= r_ <= 1:
                raise ValueError("redundancy_amount must be a number in [0, 1] for scenario 4")
            n_additional = self.exp.dataset.additional_dims
            # Round once and take the complement: truncating both shares
            # independently can lose a feature (e.g. 10 * (1 - 0.9) -> 0).
            n_rdndt_feat = int(round(n_additional * r_))
            n_noise_feat = n_additional - n_rdndt_feat
            return self.add_mixture(n_noise_feat, n_rdndt_feat)

        raise ValueError("Unknown scenario %r, expected 1, 2 or 4" % (scenario,))

In [23]:
# Name of the dataset; it is also used as the (relative) folder of its arrays.
key_dataset = 'dataset_1'

# load=True reads A.npy and the per-split X.npy / y.npy / N.npy files from
# that folder, so it must already exist. To generate and save the arrays
# first, pass dct_dataset (commented out below) with load=False.
data_generator = DatasetGenerator(data_path=key_dataset, 
                                  # dct_dataset=dataset_spec[key_dataset],
                                  load=True,
                                  exp=exp)

In [24]:
# Augmented inputs for the scenario set in exp.dataset: one X array and one y
# array per split, ordered as data_generator.splits_lst.
[X_splits, y_splits] = data_generator.generate_input_experiment()

I am here
I am here


array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9, 5000,
       5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009])

In [51]:
# NOTE(review): `idx` is not defined in any visible cell, so this fails on a
# fresh kernel. Presumably it came from
# data_generator._get_n_train_elements_per_class() -- confirm and define it here.
# Only the last expression of a cell is displayed, so the labels line shows nothing.
y_splits[0][:, idx]
X_splits[0][:, idx]

array([[-0.12227757,  0.71764277,  0.77043894, -0.4033331 ,  0.75041773,
         0.5774073 , -0.78842729, -0.36212822,  0.3605753 , -0.87174395,
         0.88089455,  0.62452585, -0.38670337, -0.37459276,  0.92215839,
        -0.01694338,  0.5577367 ,  0.10868146, -0.06597793,  0.2806442 ],
       [ 2.34519917,  2.64982012,  2.07982889,  1.62859002,  1.93802655,
         1.39984328,  1.73107201,  2.34039206,  2.27191495,  1.94834235,
        -0.91988174, -1.26135136, -1.27760393, -2.2726067 , -2.18425958,
        -1.87086354, -1.21286184, -1.51026562, -1.95748186, -1.52044892],
       [-1.07952103, -1.34703513, -1.63130437, -0.96195073, -1.4177742 ,
        -1.84969245, -1.20151081, -1.78456528, -1.86380838, -1.62562871,
        -2.47138789, -1.90623784, -3.15547536, -4.01583724, -2.66289881,
        -2.82092555, -2.95659019, -3.06084798, -1.74584754, -3.40076646],
       [ 2.94611934,  3.5126783 ,  4.13466964,  4.52262513,  3.66256933,
         3.60111639,  3.82307553,  3.81750755,  

In [28]:
# (n_features, n_samples) of the augmented first (train) split.
X_splits[0].shape

(32, 10000)

In [32]:
# NOTE(review): `self` is not defined at notebook level in any visible cell, so
# this cell raises NameError on a fresh kernel. It looks like leftover debugging;
# remove it or replace it with the expression that produced the shown output.
self

array([[0., 1.],
       [1., 0.]])

In [None]:
# Append one noise row and one redundant row (a linear combination of the
# original features) to every split. This overwrites X_splits / y_splits.
[X_splits, y_splits] = data_generator.add_mixture(n_noise_feat=1, n_rdndt_feat=1)

In [None]:
# Rank of the augmented train matrix. Presumably a check that the redundant row,
# being a linear combination of the original rows, does not raise the rank.
np.linalg.matrix_rank(X_splits[0])

In [None]:
# Share of the additional dims that are redundant; read by
# generate_input_experiment in scenario 4. data_generator holds this same `exp`
# object, so the change is visible to it without rebuilding the generator.
exp.dataset.redundancy_amount = 0.5

In [None]:
# Minimal (non-augmented) data of the first (train) split.
# NOTE(review): this displays the whole array; consider .shape or a small slice.
data_generator.X_splits[0]

In [None]:
# One-hot rows over 5 columns for the positions [0, 1, 1]: the last two rows
# are duplicates of each other.
y = np.eye(5, dtype=int)[[0, 1, 1]]

# Distinct rows of y (duplicates collapsed, rows returned in sorted order).
np.unique(y, axis=0)