# Modality Prediction Baseline ADT to GEX by Linear Regression
This notebook shows the basics of loading the datasets and making predictions.

In [57]:
import anndata as ad

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [58]:
# We always predict mod2 from mod1.
# All four files live in the same directory and share one file-name stem,
# so the common part of the path is spelled out once instead of four times.
DATASET_ID = "openproblems_bmmc_cite_phase1_mod2"
DATASET_PREFIX = f"data/official/predict_modality/{DATASET_ID}/{DATASET_ID}.censor_dataset.output_"

train_mod1 = ad.read_h5ad(f"{DATASET_PREFIX}train_mod1.h5ad")
train_mod2 = ad.read_h5ad(f"{DATASET_PREFIX}train_mod2.h5ad")
test_mod1 = ad.read_h5ad(f"{DATASET_PREFIX}test_mod1.h5ad")
test_mod2 = ad.read_h5ad(f"{DATASET_PREFIX}test_mod2.h5ad")

In [59]:
# Let's look inside: `.var` is the per-feature annotation table of the first modality
train_mod1.var

Unnamed: 0,feature_types
CD86,ADT
CD274,ADT
CD270,ADT
CD155,ADT
CD112,ADT
...,...
HLA-E,ADT
CD82,ADT
CD101,ADT
CD88,ADT


In [60]:
# Report the modality of the first feature. `.iloc[0]` selects by position
# explicitly: the Series is indexed by feature name, and plain `[0]` on a
# label-indexed Series relies on a deprecated positional fallback in pandas.
print(f"train_mod1 has {train_mod1.var['feature_types'].iloc[0]} data")

train_mod1 has ADT data


In [61]:
# Rename for convenience: modality-specific aliases (same objects, nothing is copied)
all_train_adt, test_adt = train_mod1, test_mod1

In [62]:
# Let's look into the second dataset: its per-feature annotation table
train_mod2.var

Unnamed: 0,gene_ids,feature_types
AL627309.1,ENSG00000238009,GEX
AL627309.5,ENSG00000241860,GEX
LINC01409,ENSG00000237491,GEX
LINC01128,ENSG00000228794,GEX
LINC00115,ENSG00000225880,GEX
...,...,...
MT-CYB,ENSG00000198727,GEX
AC011043.1,ENSG00000276256,GEX
AL592183.1,ENSG00000273748,GEX
AC240274.1,ENSG00000271254,GEX


In [63]:
# Report the modality of the first feature. `.iloc[0]` selects by position
# explicitly: the Series is indexed by feature name, and plain `[0]` on a
# label-indexed Series relies on a deprecated positional fallback in pandas.
print(f"train_mod2 has {train_mod2.var['feature_types'].iloc[0]} data")

train_mod2 has GEX data


In [64]:
# Rename for convenience: modality-specific aliases (same objects, nothing is copied)
all_train_gex, test_gex = train_mod2, test_mod2

In [66]:
# Each row is one CITE-seq observation; print the shape of each
# modality's training table on its own line.
print(all_train_adt.shape, all_train_gex.shape, sep="\n")

(4721, 134)
(4721, 14910)


In [67]:
# Make a toy train/validation split for demonstration purposes.
# In real experiments we use predefined splits.
# `random_state` pins the shuffle so that Restart & Run All reproduces the
# same split; without it every run yields a different split and metrics.
train_adt, val_adt, train_gex, val_gex = train_test_split(
    all_train_adt, all_train_gex, random_state=42
)

In [68]:
# The data matrix `.X` is stored in a sparse format
train_gex.X

<3540x14910 sparse matrix of type '<class 'numpy.float32'>'
	with 6340383 stored elements in Compressed Sparse Column format>

In [69]:
# Let's calculate how many bytes the training matrix would take if kept dense:
# rows * columns * bytes per element. The element size is read from the
# matrix dtype (4 bytes for float32) instead of being hard-coded.
print(train_gex.shape[0] * train_gex.shape[1] * train_gex.X.dtype.itemsize)

211125600


In [70]:
# It is not that big, so convert every sparse matrix to a dense array.
# Targets (GEX):
Y_train, Y_val, Y_test = (
    adata.X.toarray() for adata in (train_gex, val_gex, test_gex)
)
# Features (ADT):
X_train, X_val, X_test = (
    adata.X.toarray() for adata in (train_adt, val_adt, test_adt)
)

In [71]:
# Fit a linear regression from the ADT features to the GEX targets.
# `fit` returns the estimator itself; the bare name on the last line keeps
# the estimator repr as the cell output.
lin_reg = LinearRegression().fit(X_train, Y_train)
lin_reg

LinearRegression()

In [72]:
# Predict the targets for the validation split with the fitted linear model
Y_val_pred = lin_reg.predict(X_val)

In [73]:
# Peek at the predicted matrix (the displayed repr is abbreviated)
Y_val_pred

array([[ 0.01034048,  0.00813414,  0.11536215, ...,  0.12068152,
         0.01802794,  0.02855723],
       [-0.00331403,  0.05928669,  0.02421829, ..., -0.34450698,
         0.03285933,  0.01568299],
       [-0.00565962,  0.05317381,  0.0550539 , ..., -0.05673265,
         0.0089878 , -0.00309905],
       ...,
       [ 0.00169859, -0.00507122,  0.00833389, ...,  0.05450022,
         0.03043518,  0.00894554],
       [-0.00884723,  0.00200124,  0.0332225 , ...,  0.59352046,
        -0.00698158, -0.00333085],
       [-0.00241141, -0.02388895, -0.05890052, ...,  0.3954548 ,
        -0.01373908,  0.01506963]], dtype=float32)

In [74]:
# Validation RMSE of the linear model.
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the
# per-output MSE and averaging it works on every version and is the same
# uniform average that `squared=False` reports in recent releases.
(mean_squared_error(Y_val, Y_val_pred, multioutput="raw_values") ** 0.5).mean()

5.29023

In [75]:
# Predict the targets for the test split with the fitted linear model
Y_test_pred = lin_reg.predict(X_test)

In [76]:
# Test RMSE of the linear model.
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the
# per-output MSE and averaging it works on every version and is the same
# uniform average that `squared=False` reports in recent releases.
(mean_squared_error(Y_test, Y_test_pred, multioutput="raw_values") ** 0.5).mean()

4.696687

In [77]:
# Does it really work? It seems strange that the RMSE is lower on test than on validation.
# Let's compare with a dummy regressor, which predicts only mean values.

# `fit` returns the estimator itself; the bare name on the last line keeps
# the estimator repr as the cell output.
dummy_reg = DummyRegressor().fit(X_train, Y_train)
dummy_reg

DummyRegressor()

In [78]:
# Validation RMSE of the dummy baseline.
# Note: this rebinds `Y_val_pred` to the dummy regressor's predictions.
Y_val_pred = dummy_reg.predict(X_val)
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the
# per-output MSE and averaging it works on every version and is the same
# uniform average that `squared=False` reports in recent releases.
(mean_squared_error(Y_val, Y_val_pred, multioutput="raw_values") ** 0.5).mean()

7.976456

In [79]:
# Test RMSE of the dummy baseline.
# Note: this rebinds `Y_test_pred` to the dummy regressor's predictions.
Y_test_pred = dummy_reg.predict(X_test)
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the
# per-output MSE and averaging it works on every version and is the same
# uniform average that `squared=False` reports in recent releases.
(mean_squared_error(Y_test, Y_test_pred, multioutput="raw_values") ** 0.5).mean()

7.08827