In [37]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.linalg import qr, svd
from sklearn.cross_decomposition import CCA
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [38]:
# Parameters
cell_type = "PBMC"
Shuffle = False

In [39]:
# set paths to data
morphology_data_path = pathlib.Path(
    f"../../data/{cell_type}_preprocessed_sc_norm_aggregated.parquet"
).resolve(strict=True)
nomic_data_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_clean.parquet"
).resolve(strict=True)

# output path
results_file_path = pathlib.Path(f"../results/{cell_type}_redundancy_analysis.csv")
results_file_path.parent.mkdir(parents=True, exist_ok=True)

# read data
morphology_data = pd.read_parquet(morphology_data_path)
nomic_data = pd.read_parquet(nomic_data_path)

In [40]:
# get the columns that contain metadata
morphology_metadata = morphology_data[
    morphology_data.columns[morphology_data.columns.str.contains("Metadata")]
]
morphology_data = morphology_data.drop(morphology_metadata.columns, axis=1)

nomic_data_values = nomic_data[
    nomic_data.columns[nomic_data.columns.str.contains("[NSU]", regex=True)]
]
nomic_metadata = nomic_data.drop(nomic_data_values.columns, axis=1)

In [41]:
# standardize the data for nomic standard scalar
scaler = StandardScaler()
nomic_data_values = scaler.fit_transform(nomic_data_values)
nomic_data_values = pd.DataFrame(
    nomic_data_values,
    columns=nomic_data.columns[nomic_data.columns.str.contains("[NSU]", regex=True)],
)

In [42]:
# shuffle the data both rows and columns
if Shuffle:
    for column in nomic_data_values:
        np.random.shuffle(nomic_data_values[column].values)
    for column in morphology_data:
        np.random.shuffle(morphology_data[column].values)

### Variables
$Y_{M \times P} = MorphologyData$  
$X_{N \times Q} = NomicData$  
Where  
$M = Rows of MorphologyData$  
$P = Columns of MorphologyData$  
$N = Rows of NomicData$  
$Q = Columns of NomicData$  

In [43]:
# define the variables
N = morphology_data.shape[0]
P = morphology_data.shape[1]

N = nomic_data_values.shape[0]
Q = nomic_data_values.shape[1]
print("N:", N, "P:", P, "Q:", Q)
K = min(N, P, Q)
print("K:", K)

# Define dfs
X = morphology_data
Y = nomic_data_values

N: 154 P: 1245 Q: 187
K: 154


In [44]:
# rewrite the dcorr functionsin python


def dcorr(X: pd.DataFrame, Y: pd.DataFrame):
    X

In [45]:
# Perform QR decomposition on X and Y
Qx, Rx = qr(X)
Qy, Ry = qr(Y)

# Compute the SVD of Qx' * Qy
k = min(X.shape[1], Y.shape[1], X.shape[0])
L, D, M = svd(Qx.T @ Qy, full_matrices=False)

# Compute A and B
df = X.shape[0] - 1
A = np.linalg.solve(L[:, :k], Rx) * np.sqrt(df)
df = Y.shape[0] - 1
B = np.linalg.solve(M[:, :k], Ry) * np.sqrt(df)
# Compute cc
cc = np.clip(np.diag(D[:k]), 0, 1)

# # Update A and B with iX and iY
# A[iX, :] = A
# B[iY, :] = B

# Compute U and V
U = X @ A
V = Y @ B

ValueError: Dot product shape mismatch, (154, 1245) vs (154, 1245)