## Create a dataset with heterogeneous features in 2D

In [1]:
import torch
import os
import sys
sys.path.append('../..')

In [2]:
torch.manual_seed(0)

# TRUE FEATURES
# Geometry of the synthetic dataset: `numclusters` Gaussian clusters whose
# centres sit at equally spaced angles on a ring, with the radius alternating
# between a large and a small value.

dimspace = 2 #prev 32
numpoints_perconc = 1000000*(dimspace//2)  # number of samples drawn per cluster

numclusters = 6

# every cluster has intrinsic dimension 2
intrinsic_dims = 2*torch.ones((numclusters,), dtype=int)

# allscales = torch.arange(10, 300, 30)
# allscales = torch.pow(torch.tensor([10]), torch.linspace(0.5, 2.5, 10))
scaler_alpha = 4.5  # the per-axis variance scales as 2**(-scaler_alpha)
K = 1
Kc = K/dimspace


# Alternative centre layouts that were tried:
# centers = Kc*torch.rand(numclusters, dimspace)
# rad = Kc*torch.tensor([1, 1, 1, 1, 1])
# rad = Kc*(1.0+ 0.3*torch.abs(torch.rand(numclusters)))

# Centres at `numclusters` equally spaced angles, radius alternating 3*Kc / 1*Kc.
# Built from numclusters so rad and angles always have matching lengths.
rad = Kc*torch.tensor([3.0 if k % 2 == 0 else 1.0 for k in range(numclusters)])
# Integer arange scaled by the step: a float-step arange up to 2*pi can pick up
# an extra element from rounding when comparing against the end point.
angles = (2*torch.pi/numclusters)*torch.arange(numclusters)
# NOTE: the (cos, sin) construction yields 2 columns, i.e. it assumes dimspace == 2.
centers = torch.stack((rad*torch.cos(angles), rad*torch.sin(angles)), dim=1)


# Qv = 1/200
Qv = 1/(2**scaler_alpha)
Kv = Qv*K/intrinsic_dims.float().max()
# Re-seed here: this is the seed in effect for whatever random draws come next.
torch.manual_seed(625)

# Same variance Kv along every intrinsic axis of every cluster.
variances = [Kv*torch.ones((int(intrinsic_dims[i]),)) for i in range(numclusters)]
# Diagonal covariance plus a small 1e-6 ridge on the diagonal.
Covmats = [1e-6*torch.eye(dimspace) + torch.diag(variances[i]) for i in range(numclusters)]


from torch.distributions.multivariate_normal import MultivariateNormal

# numpoints_perconc = 1000*(dimspace//2)
# Ground-truth generative parameters, kept together for later reference.
truefeatures = {'centers': centers, 'variances': variances}

# Draw the clusters one after another (cluster 0 first) so the global RNG is
# consumed in cluster order, then stack the draws row-wise: rows
# [k*numpoints_perconc, (k+1)*numpoints_perconc) belong to cluster k.
data_all = torch.cat(
    [
        MultivariateNormal(centers[k], Covmats[k]).sample((numpoints_perconc,))
        for k in range(numclusters)
    ],
    dim=0,
)
# Integer label of the generating cluster for every row of data_all.
class_id_all = torch.arange(numclusters).repeat_interleave(numpoints_perconc)

numpoints_total = data_all.shape[0]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Sanity check: fit logistic-regression probes on the generated points and
# record their held-out accuracy.
X_train, X_test, y_train, y_test = train_test_split(data_all, class_id_all, test_size=0.33, random_state=42)

# Multiclass probe over all clusters; `clf` / `score` hold this overall result.
clf = LogisticRegression(random_state=0, max_iter=1000, C=1e-1,penalty='l2').fit(X_train, y_train)
score = clf.score(X_test, y_test)

#get score separately for each concept (one vs all)
# The per-concept probes use their own names so that the multiclass `clf` and
# `score` above are not overwritten by the last one-vs-all fit.
scoresperconcept = []
for k in range(numclusters):
    y_train_concept = (y_train == k)
    y_test_concept = (y_test == k)
    clf_concept = LogisticRegression(random_state=0, max_iter=1000, C=1e-1,penalty='l2').fit(X_train, y_train_concept)
    score_concept = clf_concept.score(X_test, y_test_concept)
    scoresperconcept.append(score_concept)


# print(f"Scaler={scaler_alpha}, Score={score}")






In [3]:
# import matplotlib.pyplot as plt
# plt.scatter(data_all[:,0], data_all[:,1], c=class_id_all, cmap='tab10')
# plt.title(f"Data with different magnitudes")
# plt.show()

In [4]:
# scoresperconcept

In [5]:
# Shuffle samples and labels with one shared permutation so they stay aligned.
torch.manual_seed(41)
shuffle_indices = torch.randperm(numpoints_total)
datax, classidx = data_all[shuffle_indices], class_id_all[shuffle_indices]

# CREATE TRAIN, TEST SPLITS
# A second, independently seeded permutation decides which of the shuffled
# rows go to train and which to test; the two sets are saved separately.
torch.manual_seed(4)
frac_train = 0.7 #70% train, 30% test
total_points = numpoints_total
train_data_size = int(frac_train*total_points)
test_data_size = total_points - train_data_size
random_ordering = torch.randperm(total_points)
# first train_data_size entries -> train, the remainder -> test
train_indices, test_indices = random_ordering.split([train_data_size, test_data_size])

train_datax, train_classidx = datax[train_indices], classidx[train_indices]
test_datax, test_classidx = datax[test_indices], classidx[test_indices]


In [6]:
# SAVE DATA
# Output location: $USERDIR/data/2dgaussian_diffmag/
# (raises KeyError if the USERDIR environment variable is not set).
labdir = os.environ['USERDIR']
# The trailing '' keeps a trailing path separator, so `dataset_dir + filename`
# (as used in the commented-out save calls below) still yields a valid path,
# without the doubled separator that plain string concatenation produced.
data_loc = os.path.join(labdir, 'data', '')
dataset_dir = os.path.join(data_loc, '2dgaussian_diffmag', '')
dim = 2 #data dimension

# exist_ok avoids the check-then-create race of exists() followed by makedirs().
os.makedirs(dataset_dir, exist_ok=True)

# The save calls are currently disabled; uncomment to write the splits to disk.
# torch.save({'numclusters':numclusters,\
#             'dim':dim,\
#             'data':train_datax,\
#             'labels':train_classidx,\
#             'truefeatures':truefeatures}, dataset_dir+f'traindata.pt')

# torch.save({'numclusters':numclusters,\
#             'dim':dim,\
#             'data':test_datax,\
#             'labels':test_classidx,\
#             'truefeatures':truefeatures}, dataset_dir+f'testdata.pt')