In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import copy

import numpy as np
import sys

import scanpy as sc
import anndata as ad

from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LinearRegression
from scipy.sparse import csc_matrix
import logging
from torch.utils.data.dataset import Dataset
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
import numpy as np


In [None]:
# Run configuration: input .h5ad locations and the output file name.
# Only the two `input_train_*` paths and `output` are read by the cells
# visible in this notebook; the `input_test_*` entries are not read there.
par = dict(
    input_train_mod2='/mnt/data/output/datasets/common/openproblems_bmmc_multiome_phase2/openproblems_bmmc_multiome_phase2.manual_formatting.output_mod2.h5ad',
    input_train_mod1='/mnt/data/output/datasets/common/openproblems_bmmc_multiome_phase2/openproblems_bmmc_multiome_phase2.manual_formatting.output_rna.h5ad',
    input_test_mod2='/mnt/data/output/datasets/predict_modality/openproblems_bmmc_multiome_phase2_mod2/openproblems_bmmc_multiome_phase2_mod2.censor_dataset.output_test_mod1.h5ad',
    input_test_mod1='/mnt/data/output/datasets/predict_modality/openproblems_bmmc_multiome_phase2_mod2/openproblems_bmmc_multiome_phase2_mod2.censor_dataset.output_test_mod2.h5ad',
    output='output.h5ad',
)
# Method metadata; `functionality_name` is stored as `method_id` in the output.
meta = dict(functionality_name='lslab')

In [None]:
# Task identifier. The modelling cell checks whether it contains "atac2gex"
# to pick its branch, and it is written to `uns['dataset_id']` of the output.
dataset_id = "gex2atac"

In [None]:
# Load the modality-1 AnnData from the path configured in `par`.
# At this point it still holds every row; the `is_train` filters in the
# following cells split it into test and train parts.
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])

In [None]:
# Load the modality-2 AnnData from the path configured in `par`.
# At this point it still holds every row; the `is_train` filters in the
# following cells split it into test and train parts.
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])

In [None]:
# Hold-out split: keep the observations whose `is_train` flag equals False,
# for modality 2 and modality 1 respectively.
input_test_mod2, input_test_mod1 = (
    full[full.obs["is_train"] == False]
    for full in (input_train_mod2, input_train_mod1)
)

In [None]:
# Thin the hold-out sets: every second observation among the first 20000.
# The same slice is applied to both modalities so their rows stay paired.
# NOTE: the result is assigned back to the same names, so re-running this
# cell subsamples the already-subsampled data again.
input_test_mod1, input_test_mod2 = (
    held_out[0:20000:2] for held_out in (input_test_mod1, input_test_mod2)
)

In [None]:
# Training split: keep the observations whose `is_train` flag equals True.
# This rebinds `input_train_mod*`, so the hold-out split above has to be
# taken before this cell runs.
input_train_mod2, input_train_mod1 = (
    full[full.obs["is_train"] == True]
    for full in (input_train_mod2, input_train_mod1)
)

In [None]:
# 10-fold stratified splitter with shuffling and a fixed seed; the modelling
# cell calls `skf.split(X, y)` with the integer batch labels `y` as strata.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
# Distinct batch names among the training observations.
batches = set(input_train_mod1.obs["batch"])
# Map each batch name to an integer label. The names are sorted first so the
# numbering is the same in every kernel session; enumerating the bare set
# would follow string-hash order, which is randomised per interpreter run.
batch_dict = {batch: i for i, batch in enumerate(sorted(batches))}
# Per-observation integer labels; built in the following cell.
y = []

In [None]:
# Integer batch label for every training observation, in `obs` order.
# Iterating over the column's values replaces the former `Series[i]` lookup:
# `obs` is indexed by string names, so an integer key there relied on pandas'
# deprecated positional fallback, and `.obs` was re-read on every iteration.
# Rebuilding the list in one statement also keeps this cell idempotent:
# re-running it no longer appends a second copy of the labels.
y = [int(batch_dict[batch]) for batch in input_train_mod1.obs["batch"]]

In [None]:
# Cross-validation fold counter, starting at zero.
fold = 0

In [None]:
# Array form of the integer batch labels (not read by the later visible cells,
# which pass the list `y` to the splitter directly).
batches = np.asarray(y)
# Per-observation table handed to `skf.split` as its first argument.
X = input_train_mod1.obs

In [None]:
# Snapshot the full training sets of both modalities; the cross-validation
# cell slices its folds from these copies.
inp_train_mod1, inp_train_mod2 = input_train_mod1.copy(), input_train_mod2.copy()

In [None]:
# Zero-initialised accumulators; not read by any of the visible cells.
out1 = out2 = 0

In [None]:
def _cv_averaged_prediction(make_regressor, train_mod1, train_mod2, test_mod1,
                            splitter, split_X, split_y, n_components=50, seed=0):
    """Average one regressor type's test-set predictions over CV folds.

    For every ``(train_index, val_index)`` pair from
    ``splitter.split(split_X, split_y)``:

    1. Stack the fold's modality-1 training rows, its validation rows and
       ``test_mod1``, and fit a ``TruncatedSVD`` on the stack.
    2. Fit a second ``TruncatedSVD`` on the fold's modality-2 training rows.
    3. Fit a fresh regressor (``make_regressor()``) from the reduced
       modality-1 training rows to the reduced modality-2 training rows.
    4. Predict the reduced test rows and project the result back to the
       modality-2 feature space with the second SVD's components.

    The validation rows only take part in the modality-1 SVD fit; they are
    never predicted or scored.

    Args:
        make_regressor: zero-argument callable returning an unfitted
            regressor with ``fit``/``predict``.
        train_mod1: AnnData with all modality-1 training observations.
        train_mod2: AnnData with the matching modality-2 observations.
        test_mod1: AnnData with the modality-1 observations to predict.
        splitter: cross-validation splitter exposing ``split(X, y)``.
        split_X: first argument for ``splitter.split``.
        split_y: stratification labels for ``splitter.split``.
        n_components: number of SVD components per modality.
        seed: ``random_state`` for both SVDs, so reruns give the same result.

    Returns:
        Dense array (test observations x modality-2 features): the mean of
        the per-fold predictions.
    """
    total = 0
    n_folds = 0
    for train_index, val_index in splitter.split(split_X, split_y):
        print(n_folds)
        n_folds += 1

        # Fold slices use local names so the notebook-level train/test
        # objects are never overwritten by this loop.
        fold_train_mod1 = train_mod1[train_index, :]
        fold_val_mod1 = train_mod1[val_index, :]
        fold_train_mod2 = train_mod2[train_index, :]

        stacked_mod1 = ad.concat(
            {"train": fold_train_mod1, "val": fold_val_mod1, "test": test_mod1},
            axis=0,
            join="outer",
            label="group",
            fill_value=0,
            index_unique="-",
        )

        logging.info('Performing dimensionality reduction on modality 1 values...')
        embedder_mod1 = TruncatedSVD(n_components=n_components, random_state=seed)
        mod1_reduced = embedder_mod1.fit_transform(stacked_mod1.X)

        logging.info('Performing dimensionality reduction on modality 2 values...')
        embedder_mod2 = TruncatedSVD(n_components=n_components, random_state=seed)
        mod2_reduced = embedder_mod2.fit_transform(fold_train_mod2.X)

        # Split the reduced modality-1 matrix back into its row groups.
        group = stacked_mod1.obs['group'].to_numpy()
        X_train = mod1_reduced[group == 'train']
        X_test = mod1_reduced[group == 'test']

        reg = make_regressor()
        logging.info('Running %s regression...', type(reg).__name__)
        reg.fit(X_train, mod2_reduced)

        # Project the predictions back to the modality 2 feature space.
        total = total + reg.predict(X_test) @ embedder_mod2.components_

    # Divide by the number of folds actually run, not a hard-coded 10.
    return total / n_folds


def _make_knn():
    """Return the KNN regressor used by both task directions."""
    return KNeighborsRegressor(n_neighbors=25, metric='minkowski')


def _make_rf():
    """Return a seeded random forest so reruns are reproducible."""
    return RandomForestRegressor(random_state=0)


# Shared inputs for every cross-validated run below.
_cv_inputs = dict(
    train_mod1=inp_train_mod1,
    train_mod2=inp_train_mod2,
    test_mod1=input_test_mod1,
    splitter=skf,
    split_X=X,
    split_y=y,
)

if "atac2gex" in dataset_id:
    # Blend of random forest and KNN predictions. This branch used to
    # reference `final_input_test_mod1` and `input_val_mod1`, which are never
    # defined in this notebook, and overwrote `input_test_mod1/2` with fold
    # slices; it now predicts the same hold-out set as the other branch.
    y_pred_knn = _cv_averaged_prediction(_make_knn, **_cv_inputs)
    y_pred_rf = _cv_averaged_prediction(_make_rf, **_cv_inputs)
    y_pred = 0.45 * y_pred_rf + 0.55 * y_pred_knn
else:
    y_pred = _cv_averaged_prediction(_make_knn, **_cv_inputs)

y_pred = csc_matrix(y_pred)

# Predictions for the hold-out observations, annotated with the modality-2
# feature table; read by the evaluation cells below.
adata = ad.AnnData(
    X=y_pred,
    obs=input_test_mod1.obs,
    var=inp_train_mod2.var,
    uns={
        'dataset_id': dataset_id,
        'method_id': meta["functionality_name"],
    },
)

logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression="gzip")

# Evaluate Performance


In [None]:
import sklearn

In [None]:
# Precision-recall curve over every (observation, feature) entry: the held-out
# modality-2 values are the labels and the predictions are the scores, both
# flattened to 1-D in the same row-major order.
# NOTE(review): this presumably expects binary held-out values - confirm.
precision, recall, _ = sklearn.metrics.precision_recall_curve(
    input_test_mod2.X.toarray().reshape(-1),
    adata.X.toarray().reshape(-1),
)


In [None]:
# Plot the precision-recall curve from the `precision` and `recall` arrays
# computed in the previous cell.
sklearn.metrics.PrecisionRecallDisplay(precision=precision, recall=recall).plot()

## Area under the Curve

In [None]:
# Average precision over every (observation, feature) entry, with the held-out
# modality-2 values as labels and the predictions as scores.
AUPRC = sklearn.metrics.average_precision_score(
    input_test_mod2.X.toarray().reshape(-1),
    adata.X.toarray().reshape(-1),
)
AUPRC

## Root Mean Squared Error

In [None]:
# Element-wise error between predictions and held-out modality-2 values.
diff = adata.X.toarray() - input_test_mod2.X.toarray()
# n observations, m features.
n, m = adata.shape
# Root of the mean squared error over all n * m entries.
RMSE = np.sqrt(1/(n * m) * (diff ** 2).sum())
RMSE