In [27]:
import numpy as np

data = np.load('../dataset/raw/data.npy')
raw_label = np.load('../dataset/raw/label.npy')

print(data.shape, raw_label.shape)

(259062, 21, 5) (259062,)


In [28]:
## remove the empty values
data = data[:, :, :3]
print(data.shape, raw_label.shape)

## convert all 'two_up' into 'peace'
raw_label[raw_label == 'two_up'] = 'peace'

## figure out the number of classes
classes = np.unique(raw_label)
print(f"{len(classes)} classes")

(259062, 21, 3) (259062,)
11 classes


In [29]:
## print the number of samples in each classes
print("Class distribution")
for i, c in enumerate(classes):
    print(f'{i:<3}{c:<10}:  {np.sum(raw_label == c)}')

Class distribution
0  call      :  22195
1  dislike   :  20526
2  fist      :  21359
3  like      :  20093
4  ok        :  21879
5  one       :  20751
6  palm      :  22319
7  peace     :  45166
8  rock      :  20730
9  three     :  21179
10 three2    :  22865


In [30]:
## convert the label to one-hot

label = np.zeros((raw_label.shape[0], len(classes)))
for i, c in enumerate(classes):
    label[raw_label == c, i] = 1

print(label.shape)


(259062, 11)


In [31]:
## shuffle the data
idx = np.random.permutation(data.shape[0])

data, label = data[idx], label[idx]

print(data.shape, label.shape)
print(label[:10])

(259062, 21, 3) (259062, 11)
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [32]:
# train_size = 200000
# test_size = 50000
# 
# assert train_size + test_size <= data.shape[0]
# 
# ## split the data into train and test
# train_data, train_label = data[:train_size], label[:train_size]
# print(train_data.shape, train_label.shape)
# 
# test_data, test_label = data[train_size:train_size + test_size], label[train_size:train_size + test_size]
# print(test_data.shape, test_label.shape)


In [33]:
train_size = 100000
test_size = 50000

## train data should only have samples of the first 6 classes

train_data, train_label = [], []

idx = np.where(np.argmax(label, axis=1) < 8)[0]
train_data.append(data[idx[:train_size]])
train_label.append(label[idx[:train_size]])

train_data = np.concatenate(train_data, axis=0)
train_label = np.concatenate(train_label, axis=0)

print(train_data.shape, train_label.shape)
print(np.sum(train_label, axis=0))


## test data should have samples of all classes

test_data, test_label = [], []

idx = np.random.permutation(data.shape[0])
test_data.append(data[idx[:test_size]])
test_label.append(label[idx[:test_size]])

test_data = np.concatenate(test_data, axis=0)
test_label = np.concatenate(test_label, axis=0)

print(test_data.shape, test_label.shape)
print(np.sum(test_label, axis=0))

(100000, 21, 3) (100000, 11)
[11395. 10556. 10951. 10326. 11394. 10646. 11494. 23238.     0.     0.
     0.]
(50000, 21, 3) (50000, 11)
[4347. 4037. 4097. 3805. 4276. 4029. 4356. 8630. 4031. 4064. 4328.]


In [34]:
## save the data
np.savez_compressed('../dataset/8class_dataset_100k.npz', train_data=train_data, train_label=train_label,
                    test_data=test_data, test_label=test_label)