# Modality Prediction Baseline ADT to GEX by Linear Regression
This notebook shows the basics of loading the datasets and making predictions.

In [1]:
import anndata as ad

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# We always predict mod2 from mod1.
# All four files live in the same directory and share a filename prefix, so the
# prefix is built once; switching to another dataset is a one-line change.
DATASET_NAME = "openproblems_bmmc_cite_phase1_mod2"
DATASET_PREFIX = f"data/official/predict_modality/{DATASET_NAME}/{DATASET_NAME}.censor_dataset.output_"

train_mod1 = ad.read_h5ad(f"{DATASET_PREFIX}train_mod1.h5ad")
train_mod2 = ad.read_h5ad(f"{DATASET_PREFIX}train_mod2.h5ad")
test_mod1 = ad.read_h5ad(f"{DATASET_PREFIX}test_mod1.h5ad")
test_mod2 = ad.read_h5ad(f"{DATASET_PREFIX}test_mod2.h5ad")

In [3]:
# Let's look inside: the per-feature annotation table of mod1
train_mod1.var

Unnamed: 0,feature_types
CD86,ADT
CD274,ADT
CD270,ADT
CD155,ADT
CD112,ADT
...,...
HLA-E,ADT
CD82,ADT
CD101,ADT
CD88,ADT


In [4]:
# Report the modality of mod1 using the type of its first feature.
# `.var` is indexed by feature name, so a bare `[0]` relies on pandas'
# deprecated integer-as-position fallback; `.iloc[0]` is explicitly positional.
print(f"train_mod1 has {train_mod1.var['feature_types'].iloc[0]} data")

train_mod1 has ADT data


In [5]:
# Give the mod1 objects modality-specific names for convenience (mod1 is ADT).
all_train_adt, test_adt = train_mod1, test_mod1

In [6]:
# Let's look into the second dataset: the per-feature annotation table of mod2
train_mod2.var

Unnamed: 0,gene_ids,feature_types
AL627309.5,ENSG00000241860,GEX
LINC01409,ENSG00000237491,GEX
LINC01128,ENSG00000228794,GEX
LINC00115,ENSG00000225880,GEX
FAM41C,ENSG00000230368,GEX
...,...,...
MT-CYB,ENSG00000198727,GEX
AC011043.1,ENSG00000276256,GEX
AL592183.1,ENSG00000273748,GEX
AC240274.1,ENSG00000271254,GEX


In [7]:
# Report the modality of mod2 using the type of its first feature.
# `.var` is indexed by feature name, so a bare `[0]` relies on pandas'
# deprecated integer-as-position fallback; `.iloc[0]` is explicitly positional.
print(f"train_mod2 has {train_mod2.var['feature_types'].iloc[0]} data")

train_mod2 has GEX data


In [8]:
# Give the mod2 objects modality-specific names for convenience (mod2 is GEX).
all_train_gex, test_gex = train_mod2, test_mod2

In [9]:
# Each row is one CITE-seq observation; show the ADT shape, then the GEX shape.
print(all_train_adt.shape, all_train_gex.shape, sep="\n")

(29163, 134)
(29163, 13953)


In [10]:
# Make a toy train/validation split for demonstration purposes.
# In real experiments we use predefined splits.
# A fixed random_state keeps the split, and therefore every downstream metric,
# reproducible across kernel restarts.
SPLIT_SEED = 0
train_adt, val_adt, train_gex, val_gex = train_test_split(
    all_train_adt, all_train_gex, random_state=SPLIT_SEED
)

In [11]:
# The training GEX values are stored as a sparse matrix
train_gex.X

<21872x13953 sparse matrix of type '<class 'numpy.float32'>'
	with 31181089 stored elements in Compressed Sparse Column format>

In [12]:
# Estimate how many bytes a dense float32 copy of the training GEX matrix needs.
n_rows, n_cols = train_gex.shape
BYTES_PER_FLOAT32 = 4
print(n_rows * n_cols * BYTES_PER_FLOAT32)

1220720064


In [13]:
# That is not too big, so convert every split to a dense array.
# Targets (GEX) first, then features (ADT), each in train / val / test order.
Y_train, Y_val, Y_test = (
    split.X.toarray() for split in (train_gex, val_gex, test_gex)
)
X_train, X_val, X_test = (
    split.X.toarray() for split in (train_adt, val_adt, test_adt)
)

In [14]:
# Fit a linear regression that maps the ADT features to the GEX targets.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=Y_train)

LinearRegression()

In [15]:
# Predict the GEX values of the validation split with the fitted linear model.
Y_val_pred = lin_reg.predict(X_val)

In [16]:
# Peek at the validation predictions (the array repr is abbreviated).
Y_val_pred

array([[ 1.1057407e-03,  9.8647147e-02,  1.8854186e-02, ...,
         9.4747081e-02,  2.3247585e-02,  3.1042155e-03],
       [-2.6070908e-02,  9.5203608e-02,  6.9537766e-02, ...,
         5.8041409e-02,  1.5211878e-02,  1.6670756e-02],
       [-2.2585690e-02,  9.0119690e-02, -8.7889712e-03, ...,
         1.2738451e-02,  6.2178429e-03,  4.9617011e-03],
       ...,
       [ 3.4700006e-02,  1.1063051e-01,  8.8005178e-03, ...,
         5.0541829e-02,  2.3674514e-02,  1.1761515e-02],
       [-1.2393296e-02,  7.9894252e-03,  1.2064542e+00, ...,
         1.1657126e-02, -4.8067905e-03,  2.7819213e-03],
       [ 9.0695277e-02,  1.0640955e-01,  9.1299877e-02, ...,
         7.8473695e-02,  2.0303648e-02,  1.2652895e-02]], dtype=float32)

In [17]:
# RMSE of the linear regression on the validation split.
# NOTE(review): the `squared` argument is deprecated in recent scikit-learn
# releases — confirm the pinned version before upgrading.
mean_squared_error(Y_val, Y_val_pred, squared=False)

5.54447

In [18]:
# Predict the GEX values of the test split with the fitted linear model.
Y_test_pred = lin_reg.predict(X_test)

In [19]:
# RMSE of the linear regression on the test split.
mean_squared_error(Y_test, Y_test_pred, squared=False)

4.178273

In [20]:
# Does it really work? It seems strange that the RMSE is lower on test than on validation.
# Let's compare it with a dummy regressor, which always predicts the mean values.

dummy_reg = DummyRegressor()
dummy_reg.fit(X_train, Y_train)

DummyRegressor()

In [21]:
# RMSE of the mean-predicting baseline on the validation split.
# The baseline predictions get their own name so that `Y_val_pred` (the linear
# regression output) is not silently overwritten; otherwise re-running the
# linear-regression scoring cell afterwards would score the dummy model.
Y_val_pred_dummy = dummy_reg.predict(X_val)
mean_squared_error(Y_val, Y_val_pred_dummy, squared=False)

6.6551456

In [22]:
# RMSE of the mean-predicting baseline on the test split.
# The baseline predictions get their own name so that `Y_test_pred` (the linear
# regression output) is not silently overwritten; otherwise re-running the
# linear-regression scoring cell afterwards would score the dummy model.
Y_test_pred_dummy = dummy_reg.predict(X_test)
mean_squared_error(Y_test, Y_test_pred_dummy, squared=False)

5.217001