In [None]:
import hvplot.polars  # type: ignore
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
from pathlib import Path

from dotenv import dotenv_values

# Load the key/value pairs that python-dotenv finds in the .env file.
paths = dotenv_values()
# Show only the configured keys: echoing the values would bake local paths
# (and any secret kept in the .env file) into the saved notebook output.
sorted(paths)

In [None]:
# DATA_DIR from the loaded .env values is the base directory for every file
# this notebook reads or writes.
data_path = Path(paths["DATA_DIR"])

# Read the training counts table and display it.
training_counts_file = data_path / "processed-data/training_data-counts_uint.parquet"
df = pl.read_parquet(training_counts_file)
df

In [None]:
# Keep only the numeric columns of `df` and convert them to a 2-D NumPy array
# (frame rows stay array rows, numeric columns become array columns).
numeric_df = df.select(cs.numeric())
X = numeric_df.to_numpy()

In [None]:
# Standardise every row of X to zero mean and unit (population) standard
# deviation, so that each column of X_std.T -- the layout passed to the
# decompositions below -- is centred and scaled.
row_mean = X.mean(axis=1, keepdims=True)
row_std = X.std(axis=1, keepdims=True)
# A constant row has zero spread; dividing by it would fill the row with
# NaN/inf and make the downstream fits fail. Scale such rows by 1 instead
# (they become all zeros), mirroring StandardScaler's zero-variance rule.
row_std[row_std == 0] = 1.0
X_std = (X - row_mean) / row_std

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Fit a 512-component PCA on the transposed matrix, so every numeric column
# of `df` is one observation and every row of `df` is one feature.
# The randomized SVD solver is stochastic; fixing `random_state` makes the
# fitted components (and the embeddings saved later) reproducible across
# kernel restarts.
pca = PCA(n_components=512, svd_solver="randomized", random_state=0)
pca.fit(X_std.T)

In [None]:
# Project the same transposed matrix the PCA was fitted on onto its
# principal components (one output row per row of X_std.T).
pca_input = X_std.T
transformed = pca.transform(pca_input)

In [None]:
# Cumulative share of variance explained by the first k principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Number the components from 1 and derive the count from the fitted model
# instead of repeating the literal 512, so the x-axis always matches `pca`.
component_counts = np.arange(1, cumulative_variance.size + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_variance)
ax.set(
    xlabel="Number of principal components",
    ylabel="Cumulative explained variance ratio",
    title="PCA cumulative explained variance",
)
plt.show()

In [None]:
# Label each row of the PCA output with the numeric column of `df` it came
# from, and name the 512 component columns latent_0 ... latent_511.
gene_names = df.select(cs.numeric()).columns
latent_columns = [f"latent_{i}" for i in range(512)]

embedding_df = (
    pl.DataFrame(data=transformed, schema=latent_columns)
    .with_columns(gene_name=pl.Series(gene_names))
    # Put the label column first, followed by all latent (numeric) columns.
    .select("gene_name", cs.numeric())
)
embedding_df

In [None]:
# Destination of the PCA embeddings, relative to the data directory.
pca_embedding_file = data_path / "gene_embeddings/PCA-train_expression.parquet"
# Create the output folder on first use; writing into a missing directory
# would otherwise fail on a fresh checkout of the data directory.
pca_embedding_file.parent.mkdir(parents=True, exist_ok=True)
embedding_df.write_parquet(pca_embedding_file)

# Covariance

In [None]:
# Covariance between the rows of X_std.T, i.e. between the numeric columns
# of `df`; `rowvar=True` spells out NumPy's default of one variable per row.
covariance = np.cov(X_std.T, rowvar=True)
covariance

In [None]:
from sklearn.decomposition import MiniBatchSparsePCA

# Mini-batch sparse PCA on the same transposed matrix used for the dense PCA.
# The solver shuffles the data randomly while learning, so pin `random_state`
# to make the saved embeddings reproducible between runs.
sparse_pca = MiniBatchSparsePCA(
    n_components=512, n_jobs=8, batch_size=100, random_state=0
)
transformed_sparse_pca = sparse_pca.fit_transform(X_std.T)

In [None]:
# Display the array returned by the sparse PCA `fit_transform` call.
transformed_sparse_pca

In [None]:
# Same layout as the dense PCA table: one row per numeric column of `df`,
# a leading `gene_name` label and 512 columns latent_0 ... latent_511.
sparse_gene_names = df.select(cs.numeric()).columns
sparse_latent_columns = [f"latent_{i}" for i in range(512)]

sparse_pca_embedding_df = (
    pl.DataFrame(data=transformed_sparse_pca, schema=sparse_latent_columns)
    .with_columns(gene_name=pl.Series(sparse_gene_names))
    # Move the label column to the front, keeping every latent column.
    .select("gene_name", cs.numeric())
)
sparse_pca_embedding_df

In [None]:
# Destination of the sparse PCA embeddings, relative to the data directory.
sparse_pca_embedding_file = (
    data_path / "gene_embeddings/SparsePCA-train_expression.parquet"
)
# Create the output folder on first use; writing into a missing directory
# would otherwise fail after the expensive sparse PCA fit has completed.
sparse_pca_embedding_file.parent.mkdir(parents=True, exist_ok=True)
sparse_pca_embedding_df.write_parquet(sparse_pca_embedding_file)