In [1]:
import numpy as np

In [76]:
# Columns of X are the observations, rows are the features (hence the transpose).
X = np.array([[8, 4, 7], [2, 8, 1], [3, 1, 1], [9, 7, 4]]).T
n_obs = X.shape[1]

# Per-feature mean kept as a column vector so that barX @ barX.T is an outer product.
barX = X.mean(axis=1, keepdims=True)

# Covariance with 1/n normalisation: S = (1/n) X X^T - barX barX^T
S = (1 / n_obs) * (X @ X.T) - barX @ barX.T

for label, value in (('X:\n', X), ('\nbarX:\n', barX), ('\nS:\n', S)):
    print(label, value)


### Measuring the spread of a point cloud

In [61]:
# Each row of `points` is one 2-D point; `u` is a unit row vector along (1, 2).
points = np.array([[1, 2], [3, 4], [-1, 0]])
inv_norm = 1 / np.sqrt(5)
u = inv_norm * np.array([[1, 2]])

# Signed length of every point along u (shape 1 x n_points) ...
signed_distance = u @ points.T
# ... scaled back along u to get the projected points (shape n_points x 2).
projection = signed_distance.T @ u
print('projection of the points onto the unit vector u:\n', projection)

projection of the points onto the unit vector u:
 [[ 1.   2. ]
 [ 2.2  4.4]
 [-0.2 -0.4]]


In [94]:
# Direct route: variance (1/n normalisation, numpy's default) of the points' coordinates along u.
data = u @ points.T
var_u_data = np.var(data)

# Matrix route: S = (1/n) * points^T H points, with H = I - (1/n) * 1 1^T the centering matrix.
n = points.shape[0]
ones_col = np.ones(n).reshape((n, 1))
H = np.identity(n) - 1 / n * (ones_col @ ones_col.T)
S = (1 / n) * (points.T @ H @ points)

# Quadratic form u S u^T; var_u_data above is the quantity to compare it against.
u_S_u = u @ S @ u.T
print(u_S_u)

[[4.8]]


## Principal Component Analysis (PCA)

In [23]:
# PCA on a tiny 2-D data set: rows are observations, columns are features.
data = np.array([[0, 2], [1, -1], [-1, -1]])
barX = np.mean(data, axis=0)

# (1/(n-1)) * Xc^T Xc is the sample covariance only when Xc is centered,
# so subtract the column means before forming it.
centered = data - barX
sigma = 1 / (data.shape[0] - 1) * np.matmul(centered.T, centered)

evals, evecs = np.linalg.eig(sigma)  # /!\ eigenvectors are given in columns, not in rows (use evecs.T[0] to get the eigenvector corresponding to evals[0])
PC1 = evecs.T[np.argmax(evals)]  # direction associated with the largest eigenvalue

projection_onto_PC1 = np.matmul(centered, PC1)

# The sample variance of the data projected onto PC1 must equal lambda_1 (largest eigenvalue).
projected_variance = np.var(projection_onto_PC1, ddof=1)
np.testing.assert_allclose(projected_variance, np.max(evals))
projected_variance

[ 1.11803399 -1.11803399]


In [37]:
# PCA on a second 2-D data set: rows are observations, columns are features.
data = np.array([[0, 2], [0, -2], [1, 1], [-1, -1]])
barX = np.mean(data, axis=0)

# (1/(n-1)) * Xc^T Xc is the sample covariance only when Xc is centered,
# so subtract the column means before forming it.
centered = data - barX
sigma = 1 / (data.shape[0] - 1) * np.matmul(centered.T, centered)

evals, evecs = np.linalg.eig(sigma)  # eigenvectors are the columns of evecs

PC1 = evecs.T[np.argmax(evals)]  # direction associated with the largest eigenvalue
projection_onto_PC1 = np.matmul(centered, PC1)

# Coordinates of each observation along PC1 (the overall sign depends on the eigenvector's sign).
projection_onto_PC1


[-1.94649798  1.94649798 -1.20300191  1.20300191]
