In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def generate_XOR_labels(X, feats=[0, 1]):
    """Build two-column soft labels from the product of two feature columns.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, j]``.
    feats : sequence whose first two entries are the column indices to
        multiply; any further entries are ignored.

    Returns
    -------
    Array of shape ``(len(X), 2)``. With ``odds = exp(X[:, a] * X[:, b])``,
    column 0 is ``odds / (1 + odds)`` and column 1 is ``1 / (1 + odds)``,
    so every row sums to 1.
    """
    first, second = feats[0], feats[1]
    odds = np.exp(X[:, first] * X[:, second])
    denom = 1 + odds

    # Stack the two complementary probabilities side by side.
    return np.stack((odds / denom, 1 / denom), axis=1)

In [3]:
def generate_orange_labels(X, feats=list(range(4))):
    """Build two-column soft labels from the squared norm of selected columns.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, feats]``.
    feats : list of column indices whose squares are summed per row.

    Returns
    -------
    Array of shape ``(len(X), 2)``. With
    ``odds = exp(sum(X[:, feats] ** 2) - 4.0)``, column 0 is
    ``odds / (1 + odds)`` and column 1 is ``1 / (1 + odds)``.
    """
    selected = X[:, feats]
    squared_norm = np.sum(selected ** 2, axis=1)

    # Note: this is the exponentiated score (the odds), not the logit itself.
    odds = np.exp(squared_norm - 4.0)
    denom = 1 + odds

    return np.stack((odds / denom, 1 / denom), axis=1)

In [4]:
def generate_additive_labels(X, feats=[0, 1, 2, 3]):
    """Build two-column soft labels from an additive score over four columns.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, j]``.
    feats : sequence whose first four entries are the column indices used,
        in order, by the four additive terms.

    Returns
    -------
    Array of shape ``(len(X), 2)``. With ``a, b, c, d`` the four selected
    columns and
    ``odds = exp(-100*sin(0.2*a) + |b| + c + exp(-d) - 2.4)``,
    column 0 is ``odds / (1 + odds)`` and column 1 is ``1 / (1 + odds)``.
    """
    col_a, col_b, col_c, col_d = (X[:, feats[k]] for k in range(4))

    # Terms are added in the same left-to-right order as the formula above.
    score = -100 * np.sin(0.2 * col_a) + np.abs(col_b) + col_c + np.exp(-col_d) - 2.4
    odds = np.exp(score)
    denom = 1 + odds

    return np.stack((odds / denom, 1 / denom), axis=1)

In [5]:
def generate_data(n=10000, datatype='', feats=[], size=11, seed = 0):
    """Draw a seeded standard-normal feature matrix and synthetic labels for it.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    datatype : str
        One of 'orange_skin', 'XOR', 'nonlinear_additive' or 'switch'.
        Any other value leaves ``y`` as an empty list.
    feats : list
        For the three single-generator datatypes, the column indices passed
        to the label generator. For 'switch', a pair: ``feats[0]`` is passed
        to ``generate_orange_labels`` for the first ``n // 2`` rows and
        ``feats[1]`` to ``generate_additive_labels`` for the remaining rows.
    size : int
        Number of feature columns. 'switch' swaps column blocks 0-3 and 4-7,
        so it needs at least 8 columns.
    seed : int
        Passed to ``np.random.seed``; this reseeds NumPy's global generator.

    Returns
    -------
    X : ndarray of shape ``(n, size)``
    y : two-column label array from the chosen generator, or ``[]``
    datatypes : for 'switch', an array naming the generator used for each
        row (after shuffling); otherwise ``None``.

    Notes
    -----
    For 'switch', labels of the second half are computed *before* column
    blocks 0-3 and 4-7 of those rows are exchanged. In the returned ``X`` a
    column index ``j < 4`` referenced by ``feats[1]`` therefore sits at
    ``j + 4``, and an index ``4 <= j < 8`` sits at ``j - 4``.
    """
    np.random.seed(seed)

    # Honour `size`; the column count used to be fixed at 11 regardless of it.
    X = np.random.randn(n, size)

    datatypes = None

    y = []
    if datatype == 'orange_skin':
        y = generate_orange_labels(X, feats)

    elif datatype == 'XOR':
        y = generate_XOR_labels(X, feats)

    elif datatype == 'nonlinear_additive':
        y = generate_additive_labels(X, feats)

    elif datatype == 'switch':
        half = n // 2

        # Construct X as a mixture of two Gaussians: the last column is
        # shifted by +3 for the first half and by -3 for the second half.
        X[:half, -1] += 3
        X[half:, -1] += -3
        X1 = X[:half]; X2 = X[half:]

        y1 = generate_orange_labels(X1, feats[0])
        y2 = generate_additive_labels(X2, feats[1])

        # Exchange column blocks 0-3 and 4-7 of the second half.
        # A tuple assignment of two slices does NOT swap: both right-hand
        # sides are views, so the first store clobbers the data the second
        # one reads and the original columns 4-7 are lost. Fancy indexing on
        # the right-hand side makes a copy first, giving a true swap.
        X2[:, [0, 1, 2, 3, 4, 5, 6, 7]] = X2[:, [4, 5, 6, 7, 0, 1, 2, 3]]

        X = np.concatenate([X1, X2], axis = 0)
        y = np.concatenate([y1, y2], axis = 0)

        # Used for evaluation purposes.
        datatypes = np.array(['orange_skin'] * len(y1) + ['nonlinear_additive'] * len(y2))

        # Permute the instances randomly, keeping X, y and datatypes aligned.
        perm_inds = np.random.permutation(n)
        X, y = X[perm_inds], y[perm_inds]
        datatypes = datatypes[perm_inds]

    return X, y, datatypes

In [6]:
# One row per dataset to generate, laid out as [datatype, feats, seed];
# the export loop reads the three fields by position.
params = [
    ["XOR", [0, 1], 0],
    ["orange_skin", [0, 1, 2, 3], 1],
    ["nonlinear_additive", [0, 1, 2, 3], 2],
]
# The three 'switch' datasets share one feature layout and differ only by seed.
params += [["switch", [[0, 1], [2, 3, 4, 5]], seed] for seed in (3, 4, 5)]

In [7]:
# Column names for the 11 feature columns ("X0" .. "X10") and the label column.
featnames = list(map("X{}".format, range(11)))
target = "TARGET"

In [8]:
# Fractions of each dataset assigned to the train / validation / test files.
train_ratio, validation_ratio, test_ratio = 0.70, 0.15, 0.15

In [9]:
# Number of rows generated for every synthetic dataset.
n = 10 ** 4

In [10]:
# For every configured dataset: generate it, write the full table and the
# per-row generator names, then write a train / validation / test split of
# both under the directory "syn<k>/" (k = 1-based position in `params`).
for i, param in enumerate(params):
    X, y, d = generate_data(n, param[0], param[1], len(featnames), param[2])
    if d is None:
        # Single-generator datasets: every row comes from the same generator.
        d = np.array([param[0]] * len(X))

    data = pd.DataFrame(X, columns=featnames)
    # Hard label: 1 where the first label column exceeds 0.5, else 0.
    data[target] = (y[:, 0] > 0.5).astype(np.int64)
    datatypes = pd.DataFrame(d, columns=["type"])

    # Create the output directory so a fresh checkout can run top to bottom;
    # to_csv does not create missing parent directories.
    out_dir = "syn" + str(i + 1)
    os.makedirs(out_dir, exist_ok=True)

    data.to_csv(out_dir + "/data.csv", index=False)
    datatypes.to_csv(out_dir + "/types.csv", index=False)

    # Pass the training size as an integer row count. The fractional form
    # `test_size=1 - train_ratio` evaluates to 0.30000000000000004, and
    # scikit-learn takes the ceiling of test_size * n_samples, which moves
    # one extra row into the hold-out set (3001 instead of 3000 for 10000 rows).
    n_train = int(round(train_ratio * len(data)))
    data_train, data_test, datatypes_train, datatypes_test = train_test_split(
        data, datatypes, train_size=n_train, random_state=42, shuffle=True)

    # Split the hold-out rows into validation and test according to their
    # relative shares.
    data_val, data_test, datatypes_val, datatypes_test = train_test_split(
        data_test, datatypes_test,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=42, shuffle=True)

    data_train.to_csv(out_dir + "/train.csv", index=False)
    data_val.to_csv(out_dir + "/val.csv", index=False)
    data_test.to_csv(out_dir + "/test.csv", index=False)
    datatypes_train.to_csv(out_dir + "/train_types.csv", index=False)
    datatypes_val.to_csv(out_dir + "/val_types.csv", index=False)
    datatypes_test.to_csv(out_dir + "/test_types.csv", index=False)