In [0]:
# Directory of the PCA grid to analyse; 'spectra.h5' is read from this directory below.
# NOTE(review): absolute, site-specific path -- edit it to point at your own copy of the grid.
PCA_GRID_PATH = '/datascope/subaru/data/pfsspec/models/stellar/rbf/phoenix/phoenix_HiRes_GK/pca_none_02'

In [0]:
import os
import h5py
import numpy as np
import matplotlib.pyplot as plt

# Load data and run PCA

In [0]:
# Read the data matrix ('x') and the weight vector ('w') from the grid file.
# Both datasets are loaded fully into memory before the file is closed.
fn = os.path.join(PCA_GRID_PATH, 'spectra.h5')
with h5py.File(fn, mode='r') as f:
    X, W = (f[key][()] for key in ('x', 'w'))

In [0]:
# Sanity check: dimensions of the weight vector and of the data matrix
W.shape, X.shape

In [0]:
# Build the weighted data matrix: keep only the rows of X whose weight is
# positive and scale each kept row by the square root of its weight.

# TODO: normalization is wrong in the code

mask = W > 0
WX = X[mask] * np.sqrt(W[mask])[:, None]

# Normalization step intentionally left disabled (see the TODO above):
####WX /= np.sum(np.sqrt(W[mask]))

WX.shape

In [0]:
# Mean vector: column-wise average of the sqrt(W)-scaled rows kept in WX.

# TODO: weighted mean is wrong in the code, should be sqrt(W)

M = WX.mean(axis=0)
M.shape

In [0]:
# Visual check of the mean vector
plt.plot(M, linewidth=0.2)

In [0]:
# Thin SVD of the mean-subtracted weighted data matrix.
# S holds the singular values, the rows of Vh are the right singular vectors;
# the left singular vectors are not needed and land in `_`.
_, S, Vh = np.linalg.svd(
    np.subtract(WX, M),
    full_matrices=False,
)
S.shape, Vh.shape

In [0]:
# Shape of the matrix that went into the SVD, for comparison with S and Vh above
WX.shape

In [0]:
# For each 1-based component index k: log10 of the fraction of the total
# squared singular values carried by component k and all later components.
plt.plot(
    np.arange(1, S.size + 1),
    np.log10(np.cumsum((S**2)[::-1])[::-1] / np.sum(S**2)),
)
plt.semilogx()
plt.grid()

In [0]:
# The two leading right singular vectors (first two principal directions)
plt.plot(Vh[0], linewidth=0.2)
plt.plot(Vh[1], linewidth=0.2)

In [0]:
# Orthogonality check: the inner product of two different rows of Vh should be ~0
Vh[0].dot(Vh[1])

# Calculate the PCs

In [0]:
# Principal-component coefficients: project the mean-subtracted rows of X onto
# the right singular vectors. No weights here -- every row of X is projected,
# not only the rows selected by `mask`.
PC = (X - M) @ Vh.T
# Alternative kept for reference:
#PC = np.dot(X - M, Vh.T)
PC.shape

In [0]:
# Inspect the coefficient matrix (one row per row of X, one column per component)
PC

# Truncate and calculate residuals

In [0]:
# Number of leading principal components kept in the truncated reconstruction below
K = 2000

In [0]:
# Reconstruct every row of X from its first K coefficients, then add the mean back
Y = M + np.dot(PC[:, :K], Vh[:K])
Y.shape

In [0]:
# Overlay the first original row and its truncated reconstruction
plt.plot(X[0], linewidth=0.2)
plt.plot(Y[0], linewidth=0.2)

In [0]:
# Reconstruction error of the first row
plt.plot(np.subtract(X[0], Y[0]), linewidth=0.2)

In [0]:
# Residual matrix: original rows minus their K-component reconstruction
R = np.subtract(X, Y)
R.shape

In [0]:
# Disabled alternative: rebuild the residual from the discarded components
# (index K onwards) instead of subtracting the reconstruction from X.
# NOTE(review): this only matches R = X - Y when the rows of Vh span the whole
# feature space; otherwise the part of X - M outside that span is lost.
# R = np.dot(PC[:, K:], Vh[K:, :])
# R.shape

In [0]:
# Residual of the first row.
# This has a weight of 1

plt.plot(R[0], linewidth=0.2)

In [0]:
# Residual of the last row.
# This has a weight of 0

plt.plot(R[-1], linewidth=0.2)

In [0]:
# Distribution of the residual values of the first row of R.
hist, bins = np.histogram(R[0], bins=100)
# np.histogram returns len(hist) + 1 bin edges. Draw each count at the centre
# of its bin; plotting against the left edges (bins[:-1]) shifts the whole
# curve by half a bin width.
plt.plot(0.5 * (bins[:-1] + bins[1:]), hist)
plt.grid()

In [0]:
# Distribution of the residual in column 22000 across the rows selected by `mask`.
hist, bins = np.histogram(R[mask, 22000], bins=100)
# Draw each count at the centre of its bin; plotting against the left edges
# (bins[:-1]) shifts the whole curve by half a bin width.
plt.plot(0.5 * (bins[:-1] + bins[1:]), hist)

#hist, bins = np.histogram(R[~mask, 5000], bins=100)
#plt.plot(bins[:-1], hist)

plt.grid()

In [0]:
# Column-wise scatter of the residuals over the rows selected by `mask`
plt.plot(np.std(R[mask], axis=0), linewidth=0.2)

In [0]:
# Column-wise scatter of the residuals over the rows NOT selected by `mask`
plt.plot(np.std(R[np.logical_not(mask)], axis=0), linewidth=0.2)

# Calculate the residuals using projectors

In [0]:
# Projector onto the subspace spanned by the first K right singular vectors.
# It is a square matrix with one row/column per column of X.
P = Vh[:K].T @ Vh[:K]
P.shape

In [0]:
# Complementary projector: removes the part lying in the first K directions
Q = np.eye(len(P)) - P

In [0]:
# Residuals via the complementary projector. The result is transposed relative
# to R: each column of RR is the residual of one row of X.
RR = Q @ (X - M).T
RR.shape

In [0]:
# Projector-based residual of the first row of X (first column of RR)
plt.plot(RR[:, 0], linewidth=0.2)

# Error vector from the original full SVD

$ C = \frac{1}{n - 1} X^T X $

$ X = U S V^T $

$ C = \frac{1}{n - 1} V S^2 V^T = E \Lambda E^T $

$ E = V $

$ \Lambda = \frac{1}{n - 1} S^2 $

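Leftover variance in column $j$ after keeping only the first $K$ components:

$ \sigma_j^2 = \sum_{k > K} \Lambda_k E_{jk}^2 $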

In [0]:
# Number of kept components vs. number of rows in the weighted matrix and in the full data matrix
K, WX.shape[0], X.shape[0]

In [0]:
# Inspect the weight vector
W

In [0]:
# First two rows of the weighted data matrix
plt.plot(WX[0], linewidth=0.2)
plt.plot(WX[1], linewidth=0.2)

In [0]:
# Eigenvalues of the covariance matrix C, obtained from the singular values
# of the weighted data matrix: Lambda = S^2 / (n - 1), n = number of rows of WX.

L = (1 / (len(WX) - 1)) * np.square(S)

In [0]:
# Inspect the eigenvalues computed from the singular values
L

In [0]:
# Per-column error estimate from the discarded components (index K onwards):
# square root of the eigenvalue-weighted sum of the squared eigenvector entries.
# Earlier variant, normalised by the row count of X instead of WX:
# sigma = np.sqrt(np.sum((1 / (X.shape[0] - 1) * S[:, np.newaxis]**2 * Vh**2)[K:], axis=0))
sigma = np.sqrt(np.sum(L[K:, None] * Vh[K:]**2, axis=0))
sigma.shape

In [0]:
# Empirical residual scatter over the rows selected by `mask`.
# The comparisons against `sigma` are kept below but disabled.
f, ax = plt.subplots(figsize=(8, 5), dpi=120)

#ax.plot(sigma, lw=0.2)
ax.plot(np.std(R[mask], axis=0), linewidth=0.2)

#ax.plot(sigma - R[mask].std(axis=0), lw=0.2)

In [0]:
# Residual scatter over all rows compared with the rows selected by `mask`
f, ax = plt.subplots(figsize=(8, 5), dpi=120)

ax.plot(np.std(R, axis=0), linewidth=0.2)
ax.plot(np.std(R[mask], axis=0), linewidth=0.2)
#ax.plot(R[~mask].std(axis=0), lw=0.2)

In [0]:
# Copy of the residual matrix with large entries (|R| > 2.5e-4) blanked out
# as NaN, so that NaN-aware statistics skip them.
mask2 = np.absolute(R) > 2.5e-4
R2 = R.copy()
np.putmask(R2, mask2, np.nan)

In [0]:
# Scatter of the outlier-blanked residuals (all rows) compared with the
# scatter over the rows selected by `mask`
f, ax = plt.subplots(figsize=(8, 5), dpi=120)

#ax.plot(R.std(axis=0), lw=0.2)
ax.plot(np.nanstd(R2, axis=0), linewidth=0.2)
ax.plot(np.std(R[mask], axis=0), linewidth=0.2)

# Error vector from the residual matrix

In [0]:
# Dimensions of the residual matrix used in this section
R.shape

In [0]:
# Column-wise mean of the residuals over all rows
RM = np.mean(R, axis=0)
RM.shape

In [0]:
# Mean residual per column
plt.plot(RM, linewidth=0.2)

In [0]:
# Column-wise standard deviation of the residuals over all rows
Rstd = np.std(R, axis=0)
Rstd.shape

In [0]:
# Residual standard deviation per column
plt.plot(Rstd, linewidth=0.2)

# SVD on the residual matrix

In [0]:
# Thin SVD of the residual matrix. R is passed as-is: no mean is subtracted
# before this decomposition. RS holds the singular values, the rows of RVh are
# the right singular vectors; the left singular vectors are discarded into `_`.
_, RS, RVh = np.linalg.svd(R, full_matrices=False)

In [0]:
# Dimensions of the singular values and right singular vectors of the residual matrix
RS.shape, RVh.shape

In [0]:
# Same diagnostic as for the data SVD, now for the residual matrix: for each
# 1-based component index k, log10 of the fraction of the total squared
# singular values carried by component k and all later components.
plt.plot(
    np.arange(1, RS.size + 1),
    np.log10(np.cumsum((RS**2)[::-1])[::-1] / np.sum(RS**2)),
)
plt.semilogx()
plt.grid()

In [0]:
# Per-column error vector from the SVD of the residual matrix, built the same
# way as `sigma`: eigenvalues Lambda = RS^2 / (n - 1) weight the squared
# entries of the singular vectors, and the square root turns the variance
# into a standard deviation. The previous expression weighted by RS instead of
# RS^2 and omitted both the 1 / (n - 1) factor and the square root, so it was
# neither a variance nor a sigma.
# Because R is not mean-subtracted before its SVD, this is the RMS of each
# column of R about zero (with an n - 1 denominator).
Rsigma = np.sqrt(np.sum(RS[:, np.newaxis]**2 * RVh**2, axis=0) / (R.shape[0] - 1))
Rsigma.shape

In [0]:
# Error vector derived from the SVD of the residual matrix
plt.plot(Rsigma, linewidth=0.2)