# Data Processing

In [None]:
import numpy as np
from numpy.random import randn

In [None]:
# Draw a 4x4 matrix of samples from the standard normal distribution.
# Rows are treated as samples and columns as features in the cells below.
X = np.random.randn(4, 4)
X

## Zero-centered

In [None]:
# Per-column mean: reducing along axis 0 collapses the rows and leaves
# one value for each column of X.
mean = X.mean(axis=0)
mean

In [None]:
# axis=0 reduces over the rows, i.e. it yields one mean per column.
# Verify the first entry by hand: average the four values of column 0.
r0, r1, r2, r3 = X[:, 0]
(r0 + r1 + r2 + r3) / 4

In [None]:
# Zero-center in place: subtract each column's mean from that column
# (mean is broadcast across the rows of X).
np.subtract(X, mean, out=X)
X

## Normalized

In [None]:
# Per-column standard deviation (NumPy's default ddof=0, i.e. divide by N).
std = X.std(axis=0)
std

In [None]:
# Verify the first entry of std by hand, using only column 0 of X.
c0, c1, c2, c3 = X[:, 0]

# Mean of the column.
x_mean = (c0 + c1 + c2 + c3) / 4

# Deviation of every entry from that mean.
x0 = c0 - x_mean
x1 = c1 - x_mean
x2 = c2 - x_mean
x3 = c3 - x_mean

# Square root of the mean squared deviation (divide by N, not N - 1).
np.sqrt((x0**2 + x1**2 + x2**2 + x3**2) / 4)

In [None]:
# Normalize in place: divide each column by its standard deviation
# (std is broadcast across the rows of X).
np.divide(X, std, out=X)
X

In [None]:
# The zero-centering and normalization steps above, in compact form:
# X -= np.mean(X, axis=0)
# X /= np.std(X, axis=0)

## PCA and Whitening

In [None]:
import numpy as np

X = np.random.randn(8, 4)         # input data matrix X of size [N x D]
X -= np.mean(X, axis=0)           # zero-center the data (important)

# **covariance matrix**
# The (i, j) element of the data covariance matrix contains the covariance
# between the i-th and j-th dimension of the data. In particular, the diagonal
# of this matrix contains the variances. Furthermore, the covariance matrix
# is symmetric and positive semi-definite. Dividing by N (rather than N - 1)
# gives the population covariance of the zero-centered data.
cov = np.dot(X.T, X) / X.shape[0]

# **SVD factorization**
# We can compute the SVD factorization of the data covariance matrix, where
# the columns of U are the eigenvectors and S is a 1-D array of the singular
# values. Because cov is symmetric positive semi-definite, these singular
# values are its eigenvalues, i.e. the variances along each eigenvector.
U, S, V = np.linalg.svd(cov)

# **Decorrelate the data**
# To decorrelate the data, we project the original (but zero-centered) data
# into the eigenbasis. Notice that the columns of U are a set of orthonormal
# vectors (norm of 1, and orthogonal to each other), so they can be regarded
# as basis vectors. The projection therefore corresponds to a rotation of
# the data in X so that the new axes are the eigenvectors. If we were to
# compute the covariance matrix of Xrot, we would see that it is now diagonal.
# A nice property of np.linalg.svd is that it returns the singular values in
# descending order, so the eigenvector columns of U are sorted by eigenvalue.
Xrot = np.dot(X, U)

# **reduce the dimensionality**
# We can use this to reduce the dimensionality of the data by only using the
# top few eigenvectors, and discarding the dimensions along which the data
# has the least variance. This is also sometimes referred to as Principal
# Component Analysis (PCA) dimensionality reduction.
# The data has D = 4 dimensions, so keeping 4 components would discard
# nothing; keep only the 2 directions of largest variance instead.
n_components = 2
Xrot_reduced = np.dot(X, U[:, :n_components])   # size [N x n_components]

# **whiten the data**
# The whitening operation takes the data in the eigenbasis and divides every
# dimension by the square root of its eigenvalue (the standard deviation along
# that axis) to normalize the scale. The geometric interpretation of this
# transformation is that if the input data is a multivariate gaussian, then
# the whitened data will be a gaussian with zero mean and identity covariance
# matrix.
Xwhite = Xrot / np.sqrt(S + 1e-5) # adding 1e-5 to prevent division by zero

In [None]:
# Sanity check: the columns of U are orthonormal, so U^T U should be
# (numerically close to) the identity matrix.
U.T.dot(U)

## Formulas

* Mean $$ \bar x = \frac {\sum_{i=1}^N x_i} {N} $$
* Standard Deviation $$ s = \sqrt {\frac{\sum_{i=1}^N (x_i - \bar x)^2} {N} } $$

## References
* [Standard deviation](https://en.wikipedia.org/wiki/Standard_deviation)
* [MathJax basic tutorial and quick reference](https://math.meta.stackexchange.com/questions/5020/mathjax-basic-tutorial-and-quick-reference)