In [57]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 1 Preparation

## 1.1 Embeddings

In [2]:
embeddings = pd.read_pickle('test/embeddings_complete.pickle')
len(embeddings)

10060

In [3]:
# combine 2 embeddings of father and mother for each individual
sample_id = []
gene_id = []
embedding = []
for i in range(0, len(embeddings), 2):
    sid = embeddings[i][0].split(':')[1]
    gid = embeddings[i][0].split(':')[2]
    sample_id.append(sid)
    gene_id.append(gid)
    # combined_embedding = (embeddings[i][1] + embeddings[i+1][1])/2    # mean embedding
    combined_embedding = np.concatenate([embeddings[i][1], embeddings[i+1][1]]) # concatenate 2 embeddings. TODO: does order matter?
    embedding.append(combined_embedding)
df = pd.DataFrame({"sample_id": sample_id, "gene_id": gene_id, "embedding": embedding})

In [4]:
df.head()

Unnamed: 0,sample_id,gene_id,embedding
0,NA20795,ENSG00000198502.5,"[0.017137265, -0.009090343, 0.0061443364, -0.0..."
1,HG00260,ENSG00000214425.1,"[0.05563126, 0.00049458974, -0.0046671517, -0...."
2,HG01632,ENSG00000176681.9,"[-0.051695395, 0.023533892, -0.04169209, 0.067..."
3,HG00173,ENSG00000238083.3,"[0.042141862, 0.0355676, 0.023393063, -0.03903..."
4,HG00178,ENSG00000229450.2,"[0.0736973, -0.022161566, -0.008337349, -0.040..."


## 1.2 Expression Data

In [5]:
expression = pd.read_csv("datasets/GD660.GeneQuantRPKM.txt.gz", sep="\t")
expression.head()

Unnamed: 0,TargetID,Gene_Symbol,Chr,Coord,HG00096.1.M_111124_6,HG00097.7.M_120219_2,HG00099.1.M_120209_6,HG00099.5.M_120131_3,HG00100.2.M_111215_8,HG00101.1.M_111124_4,...,NA20810.2.M_111215_7,NA20811.1.M_111124_5,NA20812.2.M_111216_6,NA20813.5.M_120131_1,NA20814.2.M_111215_6,NA20815.5.M_120131_5,NA20816.3.M_120202_7,NA20819.3.M_120202_2,NA20826.1.M_111124_1,NA20828.2.M_111216_8
0,ENSG00000225538.1,ENSG00000225538.1,11,55850277,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03876,0.0
1,ENSG00000237851.1,ENSG00000237851.1,6,143109260,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000243765.1,ENSG00000243765.1,15,58442766,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000257527.1,ENSG00000257527.1,16,18505708,0.70561,0.66697,0.64004,0.26195,0.34695,1.49208,...,0.87085,0.9495,0.95837,0.51002,0.29422,0.2296,0.58671,0.27674,0.5363,0.17139
4,ENSG00000212855.5,ENSG00000212855.5,Y,9578193,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# TODO: only yields 10 rows. Maybe use also training data?
expression = expression[expression["Gene_Symbol"].isin(df["gene_id"].values)]
expression = pd.melt(expression, id_vars=["TargetID", "Gene_Symbol", "Chr", "Coord"], var_name="sample_id", value_name="expression")
expression["sample_id"] = expression["sample_id"].str.split(".", expand=True)[0]
expression.head()

Unnamed: 0,TargetID,Gene_Symbol,Chr,Coord,sample_id,expression
0,ENSG00000232629.4,ENSG00000232629.4,6,32731311,HG00096,9.96245
1,ENSG00000179344.11,ENSG00000179344.11,6,32636160,HG00096,43.78876
2,ENSG00000176681.9,ENSG00000176681.9,17,44370099,HG00096,6.34823
3,ENSG00000214425.1,ENSG00000214425.1,17,43595264,HG00096,0.40868
4,ENSG00000237541.3,ENSG00000237541.3,6,32709119,HG00096,51.03589


Prepare data for Expression prediction.

y = expression values for (sample, gene) pairs.

X = matrix containing the embeddings for each (sample, gene) pair.

In [21]:
df = pd.merge(df, expression, left_on=["sample_id", "gene_id"], right_on=["sample_id", "Gene_Symbol"])
print(f"Number of samples for prediction expression: {len(df)}")

Number of samples for prediction expression: 4830


In [31]:
y = df["expression"].values
X = np.stack(df["embedding"].values)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# 2 Prediction of Expression

## 2.1 Linear Regression

In [36]:
linear_regression_model = LinearRegression().fit(X_train, y_train)

In [41]:
predicted = linear_regression_model.predict(X_test)
print(f"R² of test set: {r2_score(y_test, predicted)}")
print(f"Explained variance score on test set: {explained_variance_score(y_test, predicted)}")

R² of test set: 0.819131032999649
Explained variance score on test set: 0.8192817247444287


## 2.2 Ridge Regression

In [43]:
ridge_regression_model = Ridge(alpha=1.0).fit(X_train, y_train)

In [44]:
predicted = ridge_regression_model.predict(X_test)
print(f"R² of test set: {r2_score(y_test, predicted)}")
print(f"Explained variance score on test set: {explained_variance_score(y_test, predicted)}")

R² of test set: 0.7981812666388883
Explained variance score on test set: 0.7982050824060438


## 2.3 Lasso Regression

In [45]:
lasso_regression_model = Lasso(alpha=1.0).fit(X_train, y_train)

In [46]:
predicted = lasso_regression_model.predict(X_test)
print(f"R² of test set: {r2_score(y_test, predicted)}")
print(f"Explained variance score on test set: {explained_variance_score(y_test, predicted)}")

R² of test set: 0.5786876000003178
Explained variance score on test set: 0.5791925441003527


## 2.4 Elastic Net Regression

In [51]:
elasticnet_regression_model = ElasticNet(random_state=53).fit(X_train, y_train)

In [52]:
predicted = elasticnet_regression_model.predict(X_test)
print(f"R² of test set: {r2_score(y_test, predicted)}")
print(f"Explained variance score on test set: {explained_variance_score(y_test, predicted)}")

R² of test set: 0.0988456245099385
Explained variance score on test set: 0.10105627576430654


## 2.5 Support Vector Machine

In [58]:
svr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)).fit(X_train, y_train)

In [59]:
predicted = svr.predict(X_test)
print(f"R² of test set: {r2_score(y_test, predicted)}")
print(f"Explained variance score on test set: {explained_variance_score(y_test, predicted)}")

R² of test set: 0.42912545052885365
Explained variance score on test set: 0.5083688313038175
