In [69]:
import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import eigh
from scipy.stats import multivariate_normal as gaussian

## Data

In [70]:
# Training split: the image array and its label array, read from local .npy
# files (paths are relative to the notebook's working directory).
training_data = np.load('mnist_demo/mnist_data/mnist_train_images.npy')
training_labels = np.load('mnist_demo/mnist_data/mnist_train_labels.npy')

In [71]:
set(training_labels)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [72]:
# Held-out split: the image array and its label array, read from local .npy
# files (paths are relative to the notebook's working directory).
testing_data = np.load('mnist_demo/mnist_data/mnist_test_images.npy')
testing_labels = np.load('mnist_demo/mnist_data/mnist_test_labels.npy')

## Helpers

In [73]:
def calc_scatter_matrices(X, Y):
    """ Between-class and within-class scatter matrices of labelled data.

    See Equations (1) on p.532 of Ioffe 2006.

    ARGUMENTS
     X  (numpy.ndarray), shape=(n_data, n_dimensions)
       - Data in statistics format, i.e. row-wise.

     Y  (list or numpy.ndarray), length=X.shape[0]
       - Label of every row of `X`.

    RETURNS
     S_b  (numpy.ndarray), shape=(n_dimensions, n_dimensions)
       - Between-class scatter: outer products of
          (class mean - overall mean), weighted by n_k / N.

     S_w  (numpy.ndarray), shape=(n_dimensions, n_dimensions)
       - Within-class scatter: sum over classes of the outer products of
          the class-centered rows, divided by N.

    NOTES
     S_w is accumulated from the centered rows directly instead of via
     np.cov(...) * (n_k - 1) / N. The two are algebraically identical, but
     np.cov returns NaN for a class with a single sample (which would turn
     the whole of S_w into NaN) and a 0-d array for single-column data.
     A single-sample class now simply contributes zero scatter.
    """
    assert len(X.shape) == 2
    assert X.shape[0] == len(Y)

    labels = np.asarray(Y)
    unique_labels = np.unique(labels)

    m = X.mean(axis=0)
    N = X.shape[0]
    n_dims = X.shape[1]

    m_ks = []
    n_ks = []
    S_w = np.zeros((n_dims, n_dims))

    for k in unique_labels:
        bool_idxs = labels == k
        X_k = X[bool_idxs]

        m_ks.append(X_k.mean(axis=0))
        n_ks.append(bool_idxs.sum())

        # Center in float64 (as np.cov does) and add this class's scatter.
        centered = np.asarray(X_k, dtype=np.float64)
        centered = centered - centered.mean(axis=0)
        S_w += np.matmul(centered.T, centered)

    S_w /= N

    n_ks = np.asarray(n_ks)
    m_ks = np.asarray(m_ks)

    # Scaling the columns of (m_k - m).T by n_k / N weights each class's
    # outer product by its share of the data.
    m_ks_minus_m = m_ks - m
    S_b = np.matmul(m_ks_minus_m.T * (n_ks / N), m_ks_minus_m)

    return S_b, S_w


In [74]:
def calc_m(X):
    """ Mean of the row vectors of X; see Fig. 2 on p.537 of Ioffe 2006. """
    n_axes = len(X.shape)
    assert n_axes == 2

    row_mean = X.mean(axis=0)
    return row_mean

def calc_W(S_b, S_w):
    """ Eigenvectors of the generalized problem S_b w = lambda S_w w.

    See Fig. 2 on p.537 of Ioffe 2006. The eigenvalues are discarded;
    only the matrix whose columns are the eigenvectors is returned.
    """
    return eigh(S_b, S_w)[1]

def calc_Lambda_b(S_b, W):
    """ Computes W^T S_b W; see Fig. 2 on p.537 of Ioffe 2006. """
    projected = W.T @ S_b
    return projected @ W

def calc_Lambda_w(S_w, W):
    """ Computes W^T S_w W; see Fig. 2 on p.537 of Ioffe 2006. """
    projected = W.T @ S_w
    return projected @ W

def calc_n_avg(Y):
    """ Average number of samples per distinct label.

    This is the "hack" suggested in Fig 2 on p.537 of Ioffe 2006.
    """
    n_categories = np.unique(Y).shape[0]
    n_samples = len(Y)
    return n_samples / n_categories

def calc_A(n_avg, Lambda_w, W):
    """ Computes inv(W^T) with column j scaled by
    sqrt(n_avg / (n_avg - 1) * Lambda_w[j, j]).

    See Fig. 2 on p.537 of Ioffe 2006. Only the diagonal of `Lambda_w`
    is read; it is expected to be a diagonal matrix.
    """
    column_scale = (n_avg / (n_avg - 1) * np.diagonal(Lambda_w)) ** .5
    # Broadcasting a 1-D vector over the last axis scales the columns.
    return np.linalg.inv(W.T) * column_scale


def calc_Psi(Lambda_w, Lambda_b, n_avg):
    """ Diagonal matrix Psi of Fig. 2 on p.537 of Ioffe 2006.

    Per dimension: (n_avg - 1) / n_avg * Lambda_b / Lambda_w - 1 / n_avg,
    with every non-positive entry replaced by 0. Only the diagonals of
    `Lambda_w` and `Lambda_b` are read.
    """
    within = Lambda_w.diagonal()
    between = Lambda_b.diagonal()

    psi_diag = (n_avg - 1) / n_avg * between / within - 1 / n_avg
    psi_diag = np.where(psi_diag <= 0, 0, psi_diag)

    return np.diag(psi_diag)

def get_relevant_U_dims(Psi):
    """ Indices of the non-zero diagonal entries of Psi, as a 1-D array.

    See Fig. 2 on p.537 of Ioffe 2006.
    """
    is_relevant = Psi.diagonal() != 0
    return np.flatnonzero(is_relevant)

def optimize_maximum_likelihood(X, labels):
    """ Performs the optimization in Fig. 2 of p.537 of Ioffe 2006.

    DESCRIPTION
     - The main model parameters are `m`, `A`, and `Psi`.
     - `inv_A` and `relevant_U_dims` are derived from them and returned
        as well, so that callers do not have to recompute them.

    ADDITIONAL NOTES
     Be sure to test that np.cov(X.T) is full rank before running this:
     the generalized eigenproblem solved in calc_W uses the within-class
     scatter as its second matrix.

     Recall that there are 4 "spaces":
      'D' (data) <---> 'X' (preprocessed) <---> 'U' (latent) <---> 'U_model'

    ARGUMENTS
     X  (numpy.ndarray), shape=(n_data, n_dimensions)
       - Data in statistics format, i.e. row-wise.

     labels  (list or numpy.ndarray), length=X.shape[0]
       - Labels for the data in `X`.
       - Must be sorted in the same order as `X`.

    RETURNS
     m  (numpy.ndarray), shape=X.shape[-1]
       - The mean of the row vectors in X.
       - This is the prior mean fitted via maximum likelihood.

     A  (numpy.ndarray), shape=(X.shape[-1], X.shape[-1])
       - Transformation from the latent U space to the X space
          (its inverse is what is applied to X - m to reach U).

     Psi  (numpy.ndarray), shape=(X.shape[-1], X.shape[-1])
       - The covariance matrix of the prior distribution on
          the category means in U space.

     relevant_U_dims  (numpy.ndarray), 1-D
       - Indices of the "effective" latent dimensions,
          i.e. those whose diagonal entry in Psi is non-zero.
       - Its length is the number of such entries.

     inv_A  (numpy.ndarray), shape=A.shape
       - The inverse of the matrix A.
       - Transformation from the X space to the latent U space.
    """
    assert len(X.shape) == 2
    assert X.shape[0] == len(labels)

    m = calc_m(X)

    S_b, S_w = calc_scatter_matrices(X, labels)
    W = calc_W(S_b, S_w)

    Lambda_b = calc_Lambda_b(S_b, W)
    Lambda_w = calc_Lambda_w(S_w, W)
    n_avg = calc_n_avg(labels)

    A = calc_A(n_avg, Lambda_w, W)
    inv_A = np.linalg.inv(A)

    Psi = calc_Psi(Lambda_w, Lambda_b, n_avg)
    relevant_U_dims = get_relevant_U_dims(Psi)

    return m, A, Psi, relevant_U_dims, inv_A



## Calculate

In [75]:
# Inputs for the fitting walkthrough below.
data, labels = training_data, training_labels
n_principal_components = 5  # number of PCA components kept before fitting

#### Scatter matrices (not needed if `n_principal_components` is set)

In [76]:
# Scatter matrices of the raw (un-projected) data, and the numerical rank of
# the within-class scatter.
S_b, S_w = calc_scatter_matrices(data, labels)
matrix_rank = np.linalg.matrix_rank(S_w)

In [77]:
S_b

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
S_w

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
matrix_rank

190

Instead of using the rank computed above, set it to `n_principal_components`.

In [80]:
# NOTE: this overwrites the rank computed from S_w with the configured number
# of principal components; re-run the scatter-matrix cell to get the rank back.
matrix_rank=n_principal_components

#### PCA

In [81]:
# Fit PCA on the training data, keeping `matrix_rank` components.
pca = PCA(n_components=matrix_rank)
pca.fit(data)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

### transform

In [82]:
# D -> X: project the raw data onto the fitted principal components.
# (Presumably the inline equivalent of
#  `self.transform(data, from_space='D', to_space='X')` -- TODO confirm.)

X=pca.transform(data)

In [83]:
X.shape

(200, 5)

In [84]:
X[0]

array([ 1.447227  , -1.4779868 , -0.71444005,  0.561218  , -1.1190547 ],
      dtype=float32)

#### learn params

In [85]:
# Fit the model parameters on the PCA-projected training data.
m, A, Psi, relevant_U_dims, inv_A = optimize_maximum_likelihood(X, labels)

In [86]:
m

array([-1.6003847e-07, -5.4836274e-08, -7.1823600e-08, -8.9704990e-08,
        9.0003013e-08], dtype=float32)

In [87]:
A

array([[ 0.03050822,  0.94798993, -0.03549896, -0.26389018, -0.93577497],
       [-0.16054445,  0.26030023,  1.1934572 ,  0.13195544,  0.04582924],
       [-0.52330289, -0.28802894, -0.0927878 ,  0.85863152, -0.30746867],
       [-1.57719553, -0.16134386, -0.01523872, -0.32562985, -0.00387658],
       [ 0.33287221, -1.01493607,  0.2222635 , -0.2481869 , -0.35077958]])

In [88]:
Psi

array([[0.07600691, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.7413552 , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.65183813, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 2.49805864, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 3.43666356]])

In [89]:
relevant_U_dims

array([0, 1, 2, 3, 4], dtype=int64)

In [90]:
inv_A

array([[ 0.00576119, -0.04326186, -0.16272384, -0.54858509,  0.12767355],
       [ 0.28971625,  0.11351532, -0.14494619, -0.09082049, -0.62999222],
       [-0.01652167,  0.79258737, -0.07110832, -0.01306257,  0.21009907],
       [-0.16200631,  0.11559749,  0.86799574, -0.36820982, -0.30946757],
       [-0.72863434,  0.05092086, -0.39422216, -0.00555967, -0.55475306]])

#### Transform X from space X to U_model

In [91]:
# X -> U: center with m, then apply inv_A (rows are samples, hence the
# transpose on the right).
x_in_u = (X - m) @ inv_A.T

# U -> U_model: keep only the relevant latent dimensions.
U_model = x_in_u[:, relevant_U_dims]
np.round(U_model[0], 3)

array([-0.262,  1.009, -1.387, -0.886, -0.23 ])

#### get prior params

In [92]:
# Prior in U_model space: zero mean, diagonal covariance taken from the
# relevant entries of Psi.
mean = np.zeros(relevant_U_dims.shape)
cov_diag = np.diagonal(Psi)[relevant_U_dims]

prior_params = {'mean': mean, 'cov_diag': cov_diag}
mean

array([0., 0., 0., 0., 0.])

In [93]:
cov_diag

array([0.07600691, 0.7413552 , 1.65183813, 2.49805864, 3.43666356])

#### get posterior params

In [104]:
labels = np.asarray(labels)
prior_cov_diagonal = prior_params['cov_diag']

categories = []
means = []
cov_diags = []
posterior_params = dict()

# For every category k with n_k training rows (in U_model space):
#   cov_diag = prior_cov / (1 + n_k * prior_cov)
#   mean     = (sum of the category's rows) * cov_diag
for k in np.unique(labels):
    bool_idxs = labels == k
    n_k = bool_idxs.sum()
    U_model_k = U_model[bool_idxs]

    cov_diag = prior_cov_diagonal / (1 + n_k * prior_cov_diagonal)
    mean = U_model_k.sum(axis=0) * cov_diag

    categories.append(k)
    means.append(mean)
    cov_diags.append(cov_diag)

    # Keyed by category label.
    posterior_params[k] = {'mean': mean, 'cov_diag': cov_diag}

In [106]:
posterior_params

{0: {'mean': array([-0.02610668, -0.2015203 ,  0.94471578, -1.59135246, -4.46393953]),
  'cov_diag': array([0.02927683, 0.04474497, 0.04628475, 0.04672829, 0.04696825])},
 1: {'mean': array([ 0.12092588, -1.51226444, -1.42831989, -0.18379293,  1.28496565]),
  'cov_diag': array([0.02553841, 0.03656457, 0.03758637, 0.03787834, 0.03803586])},
 2: {'mean': array([ 0.05349063,  0.24555476, -0.99770902,  0.02936313, -1.10750526]),
  'cov_diag': array([0.03015982, 0.04684086, 0.048531  , 0.04901886, 0.04928298])},
 3: {'mean': array([ 0.24353772,  1.46986828, -1.55764483, -1.7578891 ,  0.56726187]),
  'cov_diag': array([0.02927683, 0.04474497, 0.04628475, 0.04672829, 0.04696825])},
 4: {'mean': array([0.09809288, 0.41656934, 1.39487302, 1.07606885, 1.04385877]),
  'cov_diag': array([0.02927683, 0.04474497, 0.04628475, 0.04672829, 0.04696825])},
 5: {'mean': array([ 0.00967344, -1.0328505 , -0.28146238, -0.22571765, -0.13078113]),
  'cov_diag': array([0.03823112, 0.06969185, 0.0735003 , 0.0746

#### get posterior_predictive_params

In [107]:
# Posterior predictive = posterior with 1 added to every diagonal covariance
# entry.
#
# Build brand-new dicts and arrays: `posterior_params.copy()` is only a
# shallow copy, so an in-place `+= 1` would also modify the arrays stored in
# `posterior_params`, and would add another 1 every time this cell is re-run.
posterior_predictive_params = {}

for k, k_params in posterior_params.items():
    posterior_predictive_params[k] = {
        'mean': k_params['mean'].copy(),
        'cov_diag': k_params['cov_diag'] + 1,
    }

In [108]:
posterior_predictive_params

{0: {'mean': array([-0.02610668, -0.2015203 ,  0.94471578, -1.59135246, -4.46393953]),
  'cov_diag': array([1.02927683, 1.04474497, 1.04628475, 1.04672829, 1.04696825])},
 1: {'mean': array([ 0.12092588, -1.51226444, -1.42831989, -0.18379293,  1.28496565]),
  'cov_diag': array([1.02553841, 1.03656457, 1.03758637, 1.03787834, 1.03803586])},
 2: {'mean': array([ 0.05349063,  0.24555476, -0.99770902,  0.02936313, -1.10750526]),
  'cov_diag': array([1.03015982, 1.04684086, 1.048531  , 1.04901886, 1.04928298])},
 3: {'mean': array([ 0.24353772,  1.46986828, -1.55764483, -1.7578891 ,  0.56726187]),
  'cov_diag': array([1.02927683, 1.04474497, 1.04628475, 1.04672829, 1.04696825])},
 4: {'mean': array([0.09809288, 0.41656934, 1.39487302, 1.07606885, 1.04385877]),
  'cov_diag': array([1.02927683, 1.04474497, 1.04628475, 1.04672829, 1.04696825])},
 5: {'mean': array([ 0.00967344, -1.0328505 , -0.28146238, -0.22571765, -0.13078113]),
  'cov_diag': array([1.03823112, 1.06969185, 1.0735003 , 1.0746

## Predict on new data

#### transform from D to U_model

In [109]:
# D -> X: project the test data with the PCA fitted on the training data.
data_temp = pca.transform(testing_data)
# X -> U: center with m and apply inv_A, exactly as for the training data.
data_temp = (data_temp - m) @ inv_A.T
# U -> U_model: keep only the relevant latent dimensions.
testing_data_transformed = data_temp[:, relevant_U_dims]

In [110]:
testing_data_transformed[0]

array([ 0.58580544,  0.23396322,  2.15497222, -1.21182056,  1.78022659])

In [111]:
testing_data_transformed[1]

array([ 0.87711084, -1.0831509 , -2.28571034,  0.56450825, -2.31310678])

#### calculate logprobs per category

In [112]:
def calc_logp_posterior_predictive(U_model, category):
    """ Log-density of `U_model` under one category's posterior predictive.

    Reads the notebook-level `posterior_predictive_params[category]` and
    evaluates a Gaussian with that mean and diagonal covariance.
    """
    params = posterior_predictive_params[category]
    predictive = gaussian(params['mean'], np.diag(params['cov_diag']))

    return predictive.logpdf(U_model)

In [113]:
# One column of log-probabilities per category, in the key order of
# posterior_params.
K = list(posterior_params)
logpps_by_category = []

for k in K:
    logpps_k = calc_logp_posterior_predictive(testing_data_transformed, k)
    logpps_by_category.append(logpps_k)

logps = logpps_by_category = np.stack(logpps_by_category, axis=-1)

# As an array so it can be indexed with the argmax positions later.
K = np.asarray(K)

In [114]:
logps[1]

array([-14.88410838, -11.90844957,  -7.49677593, -14.80564561,
       -18.05118594,  -9.49458779, -13.79777023, -20.78890266,
       -12.87057499, -20.49863311])

#### get highest logprob

In [117]:
# For every test row, pick the category whose log-probability is largest.
predictions = K[np.argmax(logps, axis=-1)]

In [118]:
predictions

array([7, 2, 1, 0, 4, 1, 9, 4, 2, 7, 0, 5, 4, 0, 1, 5, 4, 7, 2, 9, 7, 6,
       5, 2, 9, 0, 7, 4, 0, 1, 3, 1, 3, 5, 9, 2, 9, 1, 2, 1, 1, 9, 9, 5,
       1, 2, 1, 5, 9, 4, 6, 2, 4, 5, 5, 2, 4, 1, 4, 5, 4, 2, 9, 8, 9, 9,
       5, 4, 3, 0, 7, 0, 2, 8, 1, 9, 3, 9, 8, 7, 4, 6, 2, 9, 8, 9, 9, 2,
       4, 1, 3, 6, 9, 3, 1, 4, 1, 8, 6, 4], dtype=uint8)