In [2]:
import numpy as np

# Loading Signal Data

In [3]:
# Load the raw signal data.
# The .npy file wraps a Python dict in a 0-d object array, so it has to be
# unpickled: allow_pickle=True is required on NumPy >= 1.16.3, where the
# default became False and the plain np.load(...) call raises ValueError.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
data = np.load('data/swbd.npy', allow_pickle=True).item()

# Each split stores its signals under 'data' and its raw string labels under
# 'labels'. The word is the part of the raw label before the first underscore.
train_x = data['train']['data']
train_y_raw = data['train']['labels']
train_y = [label.split('_')[0] for label in train_y_raw]
dev_x = data['dev']['data']
dev_y_raw = data['dev']['labels']
dev_y = [label.split('_')[0] for label in dev_y_raw]
test_x = data['test']['data']
test_y_raw = data['test']['labels']
test_y = [label.split('_')[0] for label in test_y_raw]

# used in clustering: all three splits joined in train, dev, test order
signals = train_x+dev_x+test_x # sequence of numpy.ndarray  31961 * [time*39] (per the original note; `+` assumes list/tuple splits)
labels = train_y+dev_y+test_y  # list of 31961 word strings, e.g. 'abandoned'

print('number of signals: {}'.format(len(signals)))

number of signals: 31961


In [4]:
# Count the distinct words and turn the string labels into numeric labels.
wordSet = set(labels)
print('total number of words is {}'.format(len(wordSet)))

# The list index of a word is its numeric label.
# Sorting makes the mapping reproducible: the iteration order of a set of
# strings changes between kernel sessions (string hash randomization), so
# list(wordSet) would assign different numbers on every restart.
wordList = sorted(wordSet)

# Dict lookup replaces a linear wordList.index() scan for every signal.
word_to_index = {word: index for index, word in enumerate(wordList)}

# numeric label of each signal, aligned index-for-index with `labels`
numLabels = np.array([word_to_index[word] for word in labels])

total number of words is 6204


# Loading Embedding Data

In [6]:
# Load the embedding data.
# Like the raw signal file, this .npy wraps a pickled dict, so allow_pickle=True
# is required on NumPy >= 1.16.3.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
emb_data = np.load('data/swbd_embeddings.npy', allow_pickle=True).item()

# Each split stores its embeddings under 'embs' and raw labels under 'labels'.
# The word labels are derived from the embedding file's own raw labels
# (previously they were taken from the raw-signal labels by mistake, leaving
# emb_*_y_raw unused).
emb_train_x = emb_data['train']['embs']
emb_train_y_raw = emb_data['train']['labels']
emb_train_y = [label.split('_')[0] for label in emb_train_y_raw]
emb_dev_x = emb_data['dev']['embs']
emb_dev_y_raw = emb_data['dev']['labels']
emb_dev_y = [label.split('_')[0] for label in emb_dev_y_raw]
emb_test_x = emb_data['test']['embs']
emb_test_y_raw = emb_data['test']['labels']
emb_test_y = [label.split('_')[0] for label in emb_test_y_raw]

# Per the original note the labels match those of the raw signals (TODO confirm),
# so only the embedding matrix is used in clustering.
# Splits are stacked row-wise in train, dev, test order.
emb_signals = np.concatenate((emb_train_x,emb_dev_x,emb_test_x), axis = 0) # np.ndarray 31961*512
print('shape of embeddings: {}'.format(emb_signals.shape))

shape of embeddings: (31961, 512)


# Small raw data and small labels

In [5]:
# Tally how many signals belong to each word.
wordBags = {}
for word in labels:
    wordBags[word] = wordBags.get(word, 0) + 1

# (word, count) pairs, most frequent first; sorted() is stable, so words with
# equal counts keep their first-seen order.
sorted_by_value = sorted(wordBags.items(), key=lambda kv: kv[1], reverse=True)

# Show the 20 most frequent words and how many signals they cover in total.
print(sorted_by_value[:20])
print('total number of series: {}'.format(sum(kv[1] for kv in sorted_by_value[:20])))

[('because', 340), ('recycling', 315), ('benefits', 259), ('something', 230), ('exactly', 228), ('probably', 225), ('insurance', 196), ('punishment', 190), ('everything', 174), ('company', 149), ('sometimes', 144), ('interesting', 143), ('recycle', 143), ('situation', 141), ('problem', 139), ('anything', 131), ('plastic', 127), ('actually', 125), ('understand', 123), ('vacation', 123)]
total number of series: 3645


In [7]:
# Build the small data set: keep only the signals whose word is one of the
# 20 most frequent words.
small_index_list = [entry[0] for entry in sorted_by_value[:20]]
small_raw_signal, small_emb_data, small_num_labels = [], [], []
for i, label in enumerate(labels):
    if label not in small_index_list:
        continue
    # raw signal and embedding row are taken at the same index
    small_raw_signal.append(signals[i])
    small_emb_data.append(emb_signals[i,:])
    # the word's position in small_index_list is its numeric label
    small_num_labels.append(small_index_list.index(label))

In [13]:
# Stack the selected embedding vectors into one data matrix:
# one row per kept signal, 512 columns.
small_emb_signal = np.zeros((len(small_emb_data), 512))
for i, embedding in enumerate(small_emb_data):
    small_emb_signal[i] = embedding

Small data summary: 
1. small_index_list: list of length 20. A word's position in this list is its numeric label.

    ['because', 'recycling', 'benefits', 'something', 'exactly', 'probably', 
    'insurance', 'punishment', 'everything', 'company', 'sometimes',
    'interesting', 'recycle', 'situation', 'problem', 'anything', 
    'plastic', 'actually', 'understand', 'vacation']
                 
                 
2. small_raw_signal: # list of numpy.ndarray  3645 * [time*39]

3. small_emb_signal: # numpy.ndarray 3645 * 512

4. small_num_labels: # list of length 3645. Numeric label for each signal (index into small_index_list).

The embedding signals are aligned index-for-index with the raw signals: row i of small_emb_signal corresponds to small_raw_signal[i] and small_num_labels[i].