# Modality Prediction Baseline ADT to GEX by Linear Regression
This notebook shows the basics of using the datasets and making predictions.

In [1]:
import anndata as ad
from lab_scripts.utils import utils
utils.change_directory_to_repo()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# We always predict mod2 from mod1.
# The four files are read in the same order as they are unpacked:
# train mod1, train mod2, test mod1, test mod2.
train_mod1, train_mod2, test_mod1, test_mod2 = (
    ad.read_h5ad(h5ad_path)
    for h5ad_path in (
        "data/official/predict_modality/openproblems_bmmc_cite_phase1_mod2/openproblems_bmmc_cite_phase1_mod2.censor_dataset.output_train_mod1.h5ad",
        "data/official/predict_modality/openproblems_bmmc_cite_phase1_mod2/openproblems_bmmc_cite_phase1_mod2.censor_dataset.output_train_mod2.h5ad",
        "data/official/predict_modality/openproblems_bmmc_cite_phase1_mod2/openproblems_bmmc_cite_phase1_mod2.censor_dataset.output_test_mod1.h5ad",
        "data/official/predict_modality/openproblems_bmmc_cite_phase1_mod2/openproblems_bmmc_cite_phase1_mod2.censor_dataset.output_test_mod2.h5ad",
    )
)

In [3]:
# Let's look inside: .var is the per-feature annotation table of the first modality
train_mod1.var

Unnamed: 0,feature_types
CD86,ADT
CD274,ADT
CD270,ADT
CD155,ADT
CD112,ADT
...,...
HLA-E,ADT
CD82,ADT
CD101,ADT
CD88,ADT


In [4]:
# Report the modality of mod1 by reading the first entry of `feature_types`.
# The column is indexed by feature name (strings), so use .iloc for positional
# access; plain `[0]` relies on a deprecated positional fallback in pandas.
print(f"train_mod1 has {train_mod1.var['feature_types'].iloc[0]} data")

train_mod1 has ADT data


In [5]:
# Rename for convenience: mod1 holds the ADT modality.
# These are aliases of the same objects, not copies.
all_train_adt = train_mod1
test_adt = test_mod1

In [6]:
# Let's look into the second dataset (mod2): its per-feature annotation table
train_mod2.var

Unnamed: 0,gene_ids,feature_types
AL627309.5,ENSG00000241860,GEX
LINC01409,ENSG00000237491,GEX
LINC01128,ENSG00000228794,GEX
LINC00115,ENSG00000225880,GEX
FAM41C,ENSG00000230368,GEX
...,...,...
MT-CYB,ENSG00000198727,GEX
AC011043.1,ENSG00000276256,GEX
AL592183.1,ENSG00000273748,GEX
AC240274.1,ENSG00000271254,GEX


In [7]:
# Report the modality of mod2 by reading the first entry of `feature_types`.
# The column is indexed by feature name (strings), so use .iloc for positional
# access; plain `[0]` relies on a deprecated positional fallback in pandas.
print(f"train_mod2 has {train_mod2.var['feature_types'].iloc[0]} data")

train_mod2 has GEX data


In [8]:
# Rename for convenience: mod2 holds the GEX modality.
# These are aliases of the same objects, not copies.
all_train_gex = train_mod2
test_gex = test_mod2

In [9]:
# Every row is one CITE-seq observation, so both modalities should report
# the same number of rows. One shape is printed per line (ADT, then GEX).
print(all_train_adt.shape, all_train_gex.shape, sep="\n")

(29077, 134)
(29077, 13953)


In [10]:
# Make a toy split for demonstration purposes.
# In real experiments we use predefined splits.
# Both modalities go through one call so the same rows land in the same split.
# random_state pins the shuffle: without it the split, and every score computed
# from it below, changes each time the notebook is re-run.
train_adt, val_adt, train_gex, val_gex = train_test_split(
    all_train_adt, all_train_gex, random_state=0
)

In [11]:
# The expression matrix X is stored as a sparse matrix
train_gex.X

<21807x13953 sparse matrix of type '<class 'numpy.float32'>'
	with 31112924 stored elements in Compressed Sparse Column format>

In [12]:
# Estimate how many bytes a dense copy of the training GEX matrix would need:
# rows * columns * 4, since each float32 value takes 4 bytes.
n_rows, n_cols = train_gex.shape
print(n_rows * n_cols * 4)

1217092284


In [13]:
# The dense size is manageable, so materialise every split as a dense array.
# Targets (GEX) are converted first, then inputs (ADT), each in
# train / validation / test order.
Y_train, Y_val, Y_test = (
    gex_split.X.toarray() for gex_split in (train_gex, val_gex, test_gex)
)

X_train, X_val, X_test = (
    adt_split.X.toarray() for adt_split in (train_adt, val_adt, test_adt)
)

In [14]:
# Let's fit a linear regression that maps the ADT inputs (X) to the GEX targets (Y).
# The final expression returns the fitted estimator, which the notebook displays.
lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)

LinearRegression()

In [15]:
# Predict GEX for the held-out validation rows from their ADT values
Y_val_pred = lin_reg.predict(X_val)

In [16]:
# Inspect the raw predictions; the linear model is unconstrained, so values can be negative
Y_val_pred

array([[ 5.31201884e-02,  7.17333481e-02,  2.96295993e-02, ...,
         1.85629949e-02,  2.07222700e-02,  9.80405416e-03],
       [ 2.17648521e-02,  3.30272727e-02, -1.42812952e-02, ...,
        -3.38099897e-04,  5.31422906e-03,  7.16342311e-03],
       [ 1.19140223e-02,  4.62589860e-02,  3.06620486e-02, ...,
         2.72917412e-02,  9.81870387e-03,  6.35217503e-03],
       ...,
       [ 8.85424763e-03,  1.03556812e-02,  7.08849505e-02, ...,
         1.12022966e-01,  8.97550583e-03,  1.04043004e-03],
       [-6.67647552e-03, -3.82477231e-03,  2.23687232e-01, ...,
         8.59666616e-03,  2.94764154e-03, -9.78126284e-03],
       [ 4.30489257e-02,  3.62229683e-02,  3.42908949e-02, ...,
         1.76957510e-02, -5.23352716e-03, -6.22074585e-05]], dtype=float32)

In [17]:
# RMSE of the linear model on the validation split.
# The `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6,
# so compute the same quantity explicitly: per-output-column MSE, square root,
# then a uniform average over the columns.
(mean_squared_error(Y_val, Y_val_pred, multioutput="raw_values") ** 0.5).mean()

0.21950792

In [18]:
# Predict GEX for the test rows from their ADT values
Y_test_pred = lin_reg.predict(X_test)

In [19]:
# RMSE of the linear model on the test set.
# The `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6,
# so compute the same quantity explicitly: per-output-column MSE, square root,
# then a uniform average over the columns.
(mean_squared_error(Y_test, Y_test_pred, multioutput="raw_values") ** 0.5).mean()

0.2510271

In [20]:
# Does it really work? In the recorded run the RMSE is higher on test than on validation.
# Let's compare it with a dummy regressor, which always predicts the mean of the training targets.

dummy_reg = DummyRegressor()
dummy_reg.fit(X_train, Y_train)

DummyRegressor()

In [21]:
# Score the dummy baseline on the validation split.
# NOTE: this rebinds Y_val_pred, replacing the linear-regression predictions
# computed earlier; re-run that earlier cell before reusing them.
Y_val_pred = dummy_reg.predict(X_val)
# The `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6,
# so compute the same quantity explicitly: per-output-column MSE, square root,
# then a uniform average over the columns.
(mean_squared_error(Y_val, Y_val_pred, multioutput="raw_values") ** 0.5).mean()

0.23375584

In [22]:
# Score the dummy baseline on the test set.
# NOTE: this rebinds Y_test_pred, replacing the linear-regression predictions
# computed earlier; re-run that earlier cell before reusing them.
Y_test_pred = dummy_reg.predict(X_test)
# The `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6,
# so compute the same quantity explicitly: per-output-column MSE, square root,
# then a uniform average over the columns.
(mean_squared_error(Y_test, Y_test_pred, multioutput="raw_values") ** 0.5).mean()

0.2717182