In [40]:
import numpy as np

# Loading Signal Data

In [41]:
# Load the raw signal data.
# 'swbd.npy' stores a pickled dict, so np.load must be told to allow pickles
# (NumPy >= 1.16.3 refuses object arrays by default). NOTE: unpickling can
# execute arbitrary code -- only load this file from a trusted source.
data = np.load('swbd.npy', allow_pickle=True).item()
train_x = data['train']['data']
train_y_raw = data['train']['labels']
# The word is the part of the raw label before the first underscore.
train_y = [label.split('_')[0] for label in train_y_raw]
dev_x = data['dev']['data']
dev_y_raw = data['dev']['labels']
dev_y = [label.split('_')[0] for label in dev_y_raw]
test_x = data['test']['data']
test_y_raw = data['test']['labels']
test_y = [label.split('_')[0] for label in test_y_raw]

# used in clustering: all three splits concatenated in train, dev, test order
signals = train_x+dev_x+test_x # tuple of numpy.ndarray  31961 * [time*39]
raw_labels = train_y_raw + dev_y_raw + test_y_raw  # tuple of string. Similar as label, e.g.'cadillacs_sw03453-A_027156-027212',
labels = train_y+dev_y+test_y  # labels. list of strings 31961. e.g. 'abandoned'

print('number of signals: {}'.format(len(signals)))

number of signals: 31961


In [42]:
# Count the distinct words and turn each word string into a numeric label.
wordSet = set(labels)
print('total number of words is {}'.format(len(wordSet)))

# Make a list; the list index is the numeric label of the word.
# Sorted so the numeric labels are the same on every run (the iteration order
# of a set of strings can differ between interpreter sessions).
wordList = sorted(wordSet)

# Word -> index lookup table, so each signal is labelled with one dict lookup
# instead of a linear wordList.index() scan.
wordToIndex = {word: index for index, word in enumerate(wordList)}

# numeric label of signals
numLabels = np.array([wordToIndex[word] for word in labels])

total number of words is 6204


# Small raw data set with the most frequent words

In [43]:
# Tally how many signals each word has.
wordBags = {}
for word in labels:
    wordBags[word] = wordBags.get(word, 0) + 1

# Order the (word, count) pairs from most to least frequent. The sort is
# stable, so words with equal counts keep their first-seen order.
sorted_by_value = sorted(wordBags.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_by_value[:20])
print('total number of series: {}'.format(sum(count for _, count in sorted_by_value[:20])))

[('because', 340), ('recycling', 315), ('benefits', 259), ('something', 230), ('exactly', 228), ('probably', 225), ('insurance', 196), ('punishment', 190), ('everything', 174), ('company', 149), ('sometimes', 144), ('interesting', 143), ('recycle', 143), ('situation', 141), ('problem', 139), ('anything', 131), ('plastic', 127), ('actually', 125), ('understand', 123), ('vacation', 123)]
total number of series: 3645


In [44]:
# Build the small data set: keep only the signals whose word is one of the
# 20 most frequent words.
small_index_list = [word for word, _ in sorted_by_value[:20]]
small_raw_signal = []
small_num_labels = []
small_labels = []
# (Embedding data is not collected here; that part was disabled.)
for i in range(len(labels)):
    if labels[i] not in small_index_list:
        continue
    small_raw_signal.append(signals[i])
    # numeric label = position of the word in small_index_list
    small_num_labels.append(small_index_list.index(labels[i]))
    # keep the full raw label string for this signal
    small_labels.append(raw_labels[i])

Small data summary: 
1. small_index_list: list of length 20 holding the most frequent words. A word's position in this list is its numeric label.

    ['because', 'recycling', 'benefits', 'something', 'exactly', 'probably', 
    'insurance', 'punishment', 'everything', 'company', 'sometimes',
    'interesting', 'recycle', 'situation', 'problem', 'anything', 
    'plastic', 'actually', 'understand', 'vacation']
                 
                 
2. small_raw_signal: # list of numpy.ndarray  3645 * [time*39]

3. small_emb_signal: # numpy.ndarray 3645 * 512 (embedding data; not built in this notebook, since the code that collects it is commented out)

4. small_num_labels: # array with length 3645. Numeric Label for each signal.

5. small_labels: # list of length 3645. Raw string label for each signal (the full label, in the same format as 'cadillacs_sw03453-A_027156-027212').

The embedding signals are index-aligned with the raw signals: entry i of each refers to the same signal.

In [45]:
# Show the 20 most frequent words; a word's position in this list is its numeric label.
print(small_index_list)

['because', 'recycling', 'benefits', 'something', 'exactly', 'probably', 'insurance', 'punishment', 'everything', 'company', 'sometimes', 'interesting', 'recycle', 'situation', 'problem', 'anything', 'plastic', 'actually', 'understand', 'vacation']


# Reorder data for neural network training

In [46]:
# Collect every signal whose word is NOT among the 20 most frequent words.
rest_raw_signal = []
rest_labels = []
for i in range(len(labels)):
    if labels[i] in small_index_list:
        continue
    rest_raw_signal.append(signals[i])
    rest_labels.append(raw_labels[i])

# Append the most-frequent-word data at the end, so that the later split
# places it in the test part.
rest_raw_signal.extend(small_raw_signal)
rest_labels.extend(small_labels)

In [47]:
# Boundaries that split the reordered data into thirds (train / dev / test).
totalLength = len(rest_labels)
cut1, cut2 = totalLength // 3, (2 * totalLength) // 3

In [48]:
# Slice the reordered signals into train / dev / test tuples at the two cut points.
rest_train_data, rest_dev_data, rest_test_data = (
    tuple(rest_raw_signal[start:stop])
    for start, stop in ((None, cut1), (cut1, cut2), (cut2, None))
)

In [49]:
# Slice the reordered raw labels at the same cut points as the signals.
rest_train_label, rest_dev_label, rest_test_label = (
    tuple(rest_labels[start:stop])
    for start, stop in ((None, cut1), (cut1, cut2), (cut2, None))
)

In [50]:
# Start the output dict with the train split.
data3 = dict(train=dict(data=rest_train_data, labels=rest_train_label))

In [51]:
# Add the dev split.
data3['dev'] = dict(data=rest_dev_data, labels=rest_dev_label)

In [52]:
# Add the test split.
data3['test'] = dict(data=rest_test_data, labels=rest_test_label)

In [53]:
# Write the reordered train/dev/test dict to 'swbd_rest.npy'.
np.save('swbd_rest.npy',data3)

# test the new dataset

In [54]:
# Reload the file that was just written to check the save/load round trip.
# The file holds a pickled dict, so allow_pickle=True is required on
# NumPy >= 1.16.3. Unpickling can run arbitrary code -- only load trusted files.
data2 = np.load('swbd_rest.npy', allow_pickle=True).item()
train_x = data2['train']['data']
train_y_raw = data2['train']['labels']
train_y = [label.split('_')[0] for label in train_y_raw]
dev_x = data2['dev']['data']
dev_y_raw = data2['dev']['labels']
dev_y = [label.split('_')[0] for label in dev_y_raw]
test_x = data2['test']['data']
test_y_raw = data2['test']['labels']
test_y = [label.split('_')[0] for label in test_y_raw]

# used in clustering
# NOTE: this rebinds signals/labels (and the train_*/dev_*/test_* names) to the
# reordered data, while raw_labels still holds the original order. Re-run the
# notebook from the top before re-running any earlier cell.
signals = train_x+dev_x+test_x # tuple of numpy.ndarray  31961 * [time*39]
labels = train_y+dev_y+test_y  # labels. list of strings 31961. e.g. 'abandoned'

# Every signal must still have exactly one label after the round trip.
assert len(signals) == len(labels)

print('number of signals: {}'.format(len(signals)))

number of signals: 31961


In [55]:
# The label count should equal the number of signals printed above.
print(len(labels))

31961


In [56]:
# Spot-check the first reloaded signal array.
print(signals[0])

[[ 0.93658495  0.579677   -0.9623296  ...  0.02585247 -0.02391806
  -0.02645081]
 [ 1.1146712   0.65270305 -1.306045   ... -0.04874871 -0.19383088
   0.05306163]
 [ 1.0553093   0.579677   -1.5284493  ... -0.06366371 -0.2684528
   0.03710639]
 ...
 [ 0.04257631  2.015521    0.23703027 ...  0.11086545  0.01841247
  -0.09021612]
 [-0.06178164  1.8798103   0.763125   ...  0.09939449  0.07989333
  -0.00784525]
 [ 0.19041657  0.445796    0.4618032  ...  0.00429505  0.07983831
   0.11513653]]


In [57]:
# Show only the first and last few labels instead of dumping every label
# into the cell output.
print(labels[:10])
print(labels[-10:])

