In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns 
from scipy.stats import pearsonr, spearmanr

# Gene Expression via SNPs: Linear Regression

### Read in data

In [22]:
expression_file = "datasets/GD660.GeneQuantRPKM.txt.gz"
test_set_file = "datasets/5_fold_cv_split.tsv"

Read in genotypes:

In [23]:
test_set = pd.read_csv(test_set_file, sep="\t")

In [24]:
genotypes = pd.read_csv("datasets/variants_subset.tsv", sep="\t").drop(columns=["REF", "ALT", "QUAL", "FILTER", "INFO",	"FORMAT"])
genotypes = pd.melt(genotypes, id_vars=["#CHROM", "POS", "ID"], var_name="sample_id", value_name="genotype")
genotypes["#CHROM"] = genotypes["#CHROM"].astype(str)
genotypes["POS"] = genotypes["POS"].astype(str)
genotypes.loc[genotypes.ID==".", 'ID'] = "snp_" + genotypes["#CHROM"] + "_" + genotypes.POS
genotypes = genotypes.drop(columns=["#CHROM", "POS"])
genotypes.genotype = genotypes['genotype'].str.split(':',expand=True)[0] 
conditions = [
    (genotypes['genotype'] == '0|0'), (genotypes['genotype'] == '1|0'), (genotypes['genotype'] == '0|1'), (genotypes['genotype'] == '1|1')]
choices = [0, 1, 1, 2]
genotypes['GT_encoding'] = np.select(conditions, choices)
#genotypes = pd.merge(test_set.drop(columns=["sample_id", "gene_id"]).drop_duplicates(), genotypes, left_on=["patient"], right_on=["sample_id"])
genotypes

Unnamed: 0,ID,sample_id,genotype,GT_encoding
0,rs9999724,HG00096,0|0,0
1,snp_6_32552087,HG00096,0|1,1
2,snp_6_32629257,HG00096,1|1,2
3,snp_6_32634306,HG00096,1|1,2
4,rs9899833,HG00096,0|1,1
...,...,...,...,...
6547,snp_6_32552087,NA20828,1|0,1
6548,snp_6_32629257,NA20828,1|1,2
6549,snp_6_32634306,NA20828,1|1,2
6550,rs9899833,NA20828,1|1,2


Read in embeddings to get the GENE IDs: 

In [25]:
embeddings = pd.read_pickle("test/embeddings_complete.pickle")
sample_id = []
gene_id = []
embedding = []
for i in range(0, len(embeddings), 2):
    sid = embeddings[i][0].split(':')[1]
    gid = embeddings[i][0].split(':')[2]
    sample_id.append(sid)
    gene_id.append(gid)
    # combined_embedding = np.concatenate([embeddings[i][1], embeddings[i+1][1]]) # concatenate 2 embeddings
    combined_embedding = np.array(embeddings[i][1] + embeddings[i+1][1]) # element-wise sum of 2 embeddings
    embedding.append(combined_embedding)
df = pd.DataFrame({"sample_id": sample_id, "gene_id": gene_id, "embedding": embedding})

Read in expression data: 

In [26]:
expression = pd.read_csv(expression_file, sep="\t")
expression = expression[expression["Gene_Symbol"].isin(df["gene_id"].values)]
expression = pd.melt(expression, id_vars=["TargetID", "Gene_Symbol", "Chr", "Coord"], var_name="sample_id", value_name="expression")
expression["sample_id"] = expression["sample_id"].str.split(".", expand=True)[0] + "." + expression["sample_id"].str.split(".", expand=True)[1]
data = pd.merge(test_set, expression, left_on=["gene_id", "sample_id"], right_on=["Gene_Symbol", "sample_id"])
data = pd.merge(genotypes, data, left_on=["sample_id"], right_on=["patient"])
data = data.drop(columns=["Chr", "TargetID", "Coord", "Gene_Symbol"])
#data.rename(columns={"sample_id_y":"sample_id"}, inplace=True)
data.head()

Unnamed: 0,ID,sample_id_x,genotype,GT_encoding,sample_id_y,patient,gene_id,test_split,expression
0,rs9999724,HG00096,0|0,0,HG00096.1,HG00096,ENSG00000214425.1,3,0.40868
1,rs9999724,HG00096,0|0,0,HG00096.1,HG00096,ENSG00000176681.9,3,6.34823
2,rs9999724,HG00096,0|0,0,HG00096.1,HG00096,ENSG00000179344.11,3,43.78876
3,rs9999724,HG00096,0|0,0,HG00096.1,HG00096,ENSG00000232629.4,3,9.96245
4,rs9999724,HG00096,0|0,0,HG00096.1,HG00096,ENSG00000238083.3,3,7.90419


In [27]:
reference = pd.read_csv("datasets/EUR373.gene.cis.FDR5.all.rs137.txt.gz", sep="\t")
reference = reference[reference.GENE_ID.isin(data.gene_id.unique())]
reference.GENE_ID = reference.GENE_ID.astype('category')
reference.head()

Unnamed: 0,SNP_ID,ID,GENE_ID,PROBE_ID,CHR_SNP,CHR_GENE,SNPpos,TSSpos,distance,rvalue,pvalue,log10pvalue
55450,rs10020034,-,ENSG00000229450.2,ENSG00000229450.2,4,4,120294331.0,120299287,4956.0,0.548974,1.079312e-30,29.966853
55451,rs3955380,-,ENSG00000229450.2,ENSG00000229450.2,4,4,120334410.0,120299287,35123.0,0.547157,1.832933e-30,29.736854
55452,rs28403131,-,ENSG00000229450.2,ENSG00000229450.2,4,4,120285806.0,120299287,13481.0,0.545653,2.8348809999999998e-30,29.547465
55453,rs79262354,-,ENSG00000229450.2,ENSG00000229450.2,4,4,120338335.0,120299287,39048.0,0.545126,3.30082e-30,29.481378
55454,rs78958430,-,ENSG00000229450.2,ENSG00000229450.2,4,4,120338339.0,120299287,39052.0,0.544934,3.488677e-30,29.457339


In [28]:
best_snps = reference[["GENE_ID", "SNP_ID", "rvalue"]].groupby("GENE_ID").max()
best_snps = best_snps.reset_index()
best_snps = best_snps[["GENE_ID", "SNP_ID"]]

Only keep most predictive SNP for each gene

In [29]:
data = pd.merge(data, best_snps, left_on=["ID", "gene_id"], right_on=["SNP_ID", "GENE_ID"])

### Run Linear Regression

In [30]:
splits_train = {}
splits_test = {}
for i in range(1, 6):
    r_values_train = {}
    r_values_test = {}
    for gene in data.gene_id.unique():
        # extract data
        gene_df = data[data.gene_id==gene]
        gene_df_train = gene_df[gene_df.test_split != i]
        gene_df_test = gene_df[gene_df.test_split == i]

        X_train = np.array(gene_df_train["GT_encoding"]).reshape(-1, 1)
        X_test = np.array(gene_df_test["GT_encoding"]).reshape(-1, 1)
        y_train = np.array(gene_df_train.expression)
        y_test = np.array(gene_df_test.expression)

        # train model
        model = LinearRegression().fit(X_train, y_train)

        # evaluate model on test set
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        r_values_train[gene] = pearsonr(y_train, y_pred_train)[0]
        r_values_test[gene] = pearsonr(y_test, y_pred_test)[0]
    splits_train[i] = r_values_train
    splits_test[i] = r_values_test

In [31]:
splits_test

{1: {'ENSG00000229450.2': 0.5838158873780467,
  'ENSG00000196126.6': -0.010763378614997422,
  'ENSG00000198502.5': 0.10200379539571876,
  'ENSG00000179344.11': 0.4347964807105317,
  'ENSG00000232629.4': 0.5652492282985153,
  'ENSG00000237541.3': 0.6617140720179402,
  'ENSG00000196735.6': 0.3548608619011706,
  'ENSG00000214425.1': 0.3825433425358211,
  'ENSG00000176681.9': 0.2777803066581089,
  'ENSG00000238083.3': 0.16879455130923565},
 2: {'ENSG00000229450.2': 0.4691115855795315,
  'ENSG00000196126.6': 0.27985575993872325,
  'ENSG00000198502.5': 0.07079277051916556,
  'ENSG00000179344.11': 0.5890411023450833,
  'ENSG00000232629.4': 0.4018050698611545,
  'ENSG00000237541.3': 0.44436281545505874,
  'ENSG00000196735.6': 0.560246279084256,
  'ENSG00000214425.1': 0.32156282683694326,
  'ENSG00000176681.9': 0.07276634100342154,
  'ENSG00000238083.3': 0.038449204012593045},
 3: {'ENSG00000229450.2': 0.4413664714953019,
  'ENSG00000196126.6': 0.375531038379767,
  'ENSG00000198502.5': -0.06995

### Extract and plot data

In [32]:
s = []
g = []
r = []
for k, res in splits_test.items():
    s += [k]*len(res)
    g += list(res.keys())
    r += list(res.values())
test_df = pd.DataFrame({"split": s, "gene_id": g, "rvalue": r})

s = []
g = []
r = []
for k, res in splits_train.items():
    s += [k]*len(res)
    g += list(res.keys())
    r += list(res.values())
train_df = pd.DataFrame({"split": s, "gene_id": g, "rvalue": r})

In [33]:
x1 = test_df.groupby("gene_id").mean()
x1 = x1.reset_index()
x2 = test_df.groupby("gene_id").std()
x2 = x2.reset_index()
x = pd.merge(x1[["gene_id", "rvalue"]], x2[["gene_id", "rvalue"]], on="gene_id")
x["method"] = "SNP"
x.to_csv("snp.tsv", sep="\t", index=False)

In [34]:
test_df.groupby("gene_id").std()

Unnamed: 0_level_0,split,rvalue
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000176681.9,1.581139,0.165889
ENSG00000179344.11,1.581139,0.086497
ENSG00000196126.6,1.581139,0.172982
ENSG00000196735.6,1.581139,0.143172
ENSG00000198502.5,1.581139,0.101835
ENSG00000214425.1,1.581139,0.116644
ENSG00000229450.2,1.581139,0.053658
ENSG00000232629.4,1.581139,0.169125
ENSG00000237541.3,1.581139,0.07753
ENSG00000238083.3,1.581139,0.17069


In [35]:
print(f"mean: {test_df.groupby('gene_id').mean().rvalue.mean()}")
print(f"SD: {test_df.groupby('gene_id').mean().rvalue.std()}")

mean: 0.35419810599101403
SD: 0.15248618932782093


In [36]:
test_df["method"] = "SNP"
test_df.to_csv("snp.tsv", sep="\t", index=False)