# Preprocessing The Data #

There are some techniques we want to use to process the data, but first let's load in the values from the .mat files.

In [47]:
# Import list
import scipy.io as sio
import numpy as np

In [9]:
contents = sio.loadmat('data/data.mat') # This is the first file, I think it contains the labels?
for key in contents.keys():
    print "Key : ", key, "\n Values : ", contents[key]

Key :  __version__ 
 Values :  1.0
Key :  data 
 Values :  [[array([[152]], dtype=uint8) array([u'2b'], 
      dtype='<U2')
  array([[ nan]]) ..., array([[ nan]]) array([[5]], dtype=uint8)
  array([[ nan]])]
 [array([[189]], dtype=uint8) array([u'2b'], 
      dtype='<U2')
  array([[ nan]]) ..., array([[ nan]]) array([[4]], dtype=uint8)
  array([[20]], dtype=uint8)]
 [array([[137]], dtype=uint8) array([u'2a'], 
      dtype='<U2')
  array([[ nan]]) ..., array([[ nan]]) array([[5]], dtype=uint8)
  array([[19]], dtype=uint8)]
 ..., 
 [array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64) ...,
  array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64)]
 [array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64) ...,
  array([], shape=(1, 0), dtype=float64)
  array([], shape=(1, 0), dtype=float64)
  a

## What do we see ##
We see a couple different things inside, one strikes our attention:
- "__version__" is the version of the mat
- "__data__" is the data we want
- "__header__" and "__globals__" are empty things we don't really care about.

In [80]:
# Extract the data field
raw_data = contents['data']
# We see there is an array of arbitrary size, sometimes there's 152, nan, 189, etc...
# We see the label (ground truth) in the array with the dtype = unsigned 2 bit integer

raw_labels = [raw_data_element[1] for raw_data_element in raw_data]
# print "Some raw data labels : ", raw_labels[:30]

label = raw_labels
# print label
# We need to change these into classes, so let's map the labels to values:
# An issue is that for some reason the matrices are in 2d, so we can't get the value straight away, reshape it
# nan = -1 <-- What is "nan"? Ask fabien
# 0 = 0
# 1 = 1
# 2a = 2
# 2b = 3
# 3 = 4
mapping = {'na' : -1, '0' : 0, '1' : 1, '2a' : 2, '2b' : 3, '3' : 4}

labels = []
for label in raw_labels:
    unrolled_label = label.reshape(-1)
    if len(unrolled_label) > 0:
        # we turn the numbers into str, and then cut anything unnecessary out
        # There is one label with '0 (bilateral MCA)' and we can remove the second part by .split()
        # There is one label with '2a?' and we can remove the question mark by just thresholding str to len <= 2
        labels.append(str(unrolled_label[0])[:2].split(" ")[0]) 

# Now we turn them into y's:
raw_y = [mapping[label] for label in labels]
print "y's : " , raw_y

# Now let's count the number of each occurrence:
y_freq = {y:raw_y.count(y) for y in set(raw_y)}
print "y freq's : ", y_freq

# From this, we can see there's a lot of 0's, and most data tends to be in the middle(2 & 3)'s. 
# TODO: Data augmentation to add more samples in the cases of class 1 and class 4.

y's :  [3, 3, 2, 2, 2, 2, 2, 3, 4, 2, 3, 3, 3, 1, 2, 2, 2, 3, 0, 2, 3, 2, 3, 0, 3, 2, 3, 2, 3, 0, 3, 3, 0, 2, 3, 2, 0, 3, 0, 0, 0, 4, 4, 3, 2, 4, 3, 2, 2, 0, 2, 3, 2, 0, 2, 0, 0, 3, 3, 2, 3, 4, 0, 0, -1, 2, 0, 3, -1, 1, 3, 3, 3, 0, 2, 0, 0, 4, 0, 2, 2, 2, 3, 3, 2, 3, 4, 2, 3, 2, 3, 3, 3, 0, 0, 0, 3, 2, 4, 2, 3, 3, 3, 3, 1, 0, 0, 1, 1, 2, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 3, 4, 3, 3, 3, 4, 3, 0, 4, 3, 2, 4, 2, 2, 2, 3, 3, 2, 2, 1, 3, 3, 2, 0, 3, 3, 3, 2, 3, 0, 4, 1, 3, 3, 1, 3, 0, 2, 3, 2, 0, 4, 3, 3, 4, 3, 2, 2, 3, 3, 4, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 2, 3]
y freq's :  {0: 29, 1: 8, 2: 55, 3: 73, 4: 16, -1: 2}
