In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns 
from scipy.stats import pearsonr, spearmanr

# Gene Expression via SNPs: Linear Regression

### Read in data

In [2]:
# Input paths, relative to the notebook's working directory.
expression_file = "datasets/GD660.GeneQuantRPKM.txt.gz"  # gene expression table, read as TSV
test_set_file = "datasets/5_fold_cv_split.tsv"  # cross-validation fold assignment, read as TSV
genotype_file = "datasets/subset_selected.tsv.gz"  # variant genotypes, read as TSV
embedding_file = "test/embeddings_complete.pickle"  # NOTE(review): not referenced in the visible cells
gene_location_file = "datasets/gene_location.tsv"  # gene coordinates (chr / start / gene columns are used)

Load the cross-validation split and gene locations, then the genotypes:

In [3]:
# Fold assignment table; it is merged with the expression data on gene_id / sample_id.
test_set = pd.read_csv(test_set_file, sep="\t")
# Gene coordinates; the `chr`, `start` and `gene` columns drive the cis-SNP selection.
gene_locations = pd.read_csv(gene_location_file, sep="\t")

In [4]:
# Load the variant table and keep only SNPs on chromosomes that carry one of our genes.
genotypes = pd.read_csv(genotype_file, sep="\t")
genotypes = genotypes[genotypes.CHROM.isin(gene_locations.chr.unique())]
genotypes = genotypes[genotypes.vartype == "SNP"]
genotypes = genotypes.drop(columns=["ID", "REF", "ALT", "vartype"])

# Filter for only cis-SNPs of our genes: per gene, at most MAX_CIS_SNPS SNPs
# closest to the gene's `start` coordinate, all within MAX_CIS_DISTANCE bp of it.
MAX_CIS_DISTANCE = 2_000_000
MAX_CIS_SNPS = 5000

per_gene_snps = []          # one filtered frame per gene, concatenated once after the loop
assigned_snp_ids = set()    # SNP_IDs already attributed to an earlier gene
for _, gene_row in gene_locations.iterrows():
    on_chrom = genotypes[genotypes.CHROM == gene_row["chr"]]
    # .assign returns a new frame, so nothing is written into a slice of `genotypes`.
    candidates = on_chrom.assign(dist_to_gene=np.abs(on_chrom.POS - gene_row["start"]))
    candidates = candidates[candidates.dist_to_gene <= MAX_CIS_DISTANCE]
    candidates = candidates.sort_values(by="dist_to_gene").iloc[:MAX_CIS_SNPS]
    candidates = candidates.drop(columns="dist_to_gene").assign(gene_id=gene_row["gene"])

    # A SNP is attributed only to the first gene (in gene_locations order) whose
    # window contains it; later genes do not see it again.
    # NOTE(review): for neighbouring genes this shrinks the later genes' SNP sets --
    # confirm that this de-duplication is intended.
    candidates = candidates[~candidates.SNP_ID.isin(assigned_snp_ids)]
    assigned_snp_ids.update(candidates.SNP_ID)
    per_gene_snps.append(candidates)
genotypes = pd.concat(per_gene_snps, ignore_index=True, sort=False)

# Wide -> long: one row per (SNP, patient); the remaining columns are treated as patients.
genotypes = pd.melt(genotypes, id_vars=["SNP_ID", "CHROM", "POS", "gene_id"], var_name="patient", value_name="genotype")

# Encode the genotype string as the number of '1' alleles (0, 1 or 2).
gt = genotypes["genotype"]
conditions = [gt == "0|0", gt == "1|0", gt == "0|1", gt == "1|1"]
choices = [0, 1, 1, 2]
# NOTE(review): any genotype string other than the four above falls through to the
# default and is encoded as 0, exactly like '0|0' -- verify the input has no other values.
genotypes["GT_encoding"] = np.select(conditions, choices, default=0)
genotypes

Unnamed: 0,SNP_ID,CHROM,POS,gene_id,patient,genotype,GT_encoding
0,rs9996677,4,120299148,ENSG00000229450.2,HG00096,0|0,0
1,rs9996569,4,120299004,ENSG00000229450.2,HG00096,0|0,0
2,rs11723757,4,120299669,ENSG00000229450.2,HG00096,0|0,0
3,rs11723839,4,120299673,ENSG00000229450.2,HG00096,0|0,0
4,rs10016448,4,120298845,ENSG00000229450.2,HG00096,0|0,0
...,...,...,...,...,...,...,...
6946571,rs1912153,17,46442739,ENSG00000238083.3,NA20828,1|0,1
6946572,rs12150117,17,46443258,ENSG00000238083.3,NA20828,0|0,0
6946573,rs1912154,17,46443594,ENSG00000238083.3,NA20828,1|0,1
6946574,rs17696640,17,46444420,ENSG00000238083.3,NA20828,0|0,0


Read in expression data: 

In [5]:
# Expression table restricted to our genes, reshaped to one row per (gene, sample column).
expression = pd.read_csv(expression_file, sep="\t")
is_selected_gene = expression["Gene_Symbol"].isin(gene_locations.gene.values)
expression = expression[is_selected_gene].melt(
    id_vars=["TargetID", "Gene_Symbol", "Chr", "Coord"],
    var_name="sample_id",
    value_name="expression",
)

# Keep only the first two dot-separated fields of the sample column name.
id_fields = expression["sample_id"].str.split(".", expand=True)
expression["sample_id"] = id_fields[0] + "." + id_fields[1]

# Attach the fold assignment, then the genotypes (inner joins in both cases).
data = test_set.merge(
    expression,
    left_on=["gene_id", "sample_id"],
    right_on=["Gene_Symbol", "sample_id"],
)
data = genotypes.merge(
    data,
    left_on=["gene_id", "patient"],
    right_on=["gene_id", "patient"],
)
data = data.drop(columns=["CHROM", "POS", "TargetID", "Gene_Symbol", "Chr", "Coord"])
data

Unnamed: 0,SNP_ID,gene_id,patient,genotype,GT_encoding,sample_id,test_split,expression
0,rs9996677,ENSG00000229450.2,HG00096,0|0,0,HG00096.1,3,0.03934
1,rs9996569,ENSG00000229450.2,HG00096,0|0,0,HG00096.1,3,0.03934
2,rs11723757,ENSG00000229450.2,HG00096,0|0,0,HG00096.1,3,0.03934
3,rs11723839,ENSG00000229450.2,HG00096,0|0,0,HG00096.1,3,0.03934
4,rs10016448,ENSG00000229450.2,HG00096,0|0,0,HG00096.1,3,0.03934
...,...,...,...,...,...,...,...,...
8511459,rs1912153,ENSG00000238083.3,NA20828,1|0,1,NA20828.2,1,14.08325
8511460,rs12150117,ENSG00000238083.3,NA20828,0|0,0,NA20828.2,1,14.08325
8511461,rs1912154,ENSG00000238083.3,NA20828,1|0,1,NA20828.2,1,14.08325
8511462,rs17696640,ENSG00000238083.3,NA20828,0|0,0,NA20828.2,1,14.08325


### Run Linear Regression

In [6]:
# Per-SNP baseline: for every (gene, SNP) pair fit expression ~ genotype on the
# rows outside fold k and record Pearson r between observed and predicted
# expression, both on the training rows and on the held-out fold.
# Results are nested as {gene: {snp: [r for each fold in `folds`]}}.
folds = [1, 2, 3, 4, 5]
genes_train, genes_test = {}, {}
for gene in data.gene_id.unique():
    print(f"Started gene {gene}")
    gene_df = data[data.gene_id == gene]
    gene_snp_train, gene_snp_test = {}, {}
    for snp in gene_df.SNP_ID.unique():
        snp_df = gene_df[gene_df.SNP_ID == snp]
        # Extract the arrays once per SNP instead of once per fold.
        dosage = np.array(snp_df.GT_encoding).reshape(-1, 1)
        expr = np.array(snp_df.expression)
        rvals_train, rvals_test = [], []
        for k in folds:
            held_out = (snp_df.test_split == k).to_numpy()
            kept = ~held_out

            model = LinearRegression().fit(dosage[kept], expr[kept])

            # pearsonr returns nan when one input is constant (e.g. a flat prediction);
            # the corresponding warning is silenced by the global warnings filter.
            rvals_test.append(pearsonr(expr[held_out], model.predict(dosage[held_out]))[0])
            rvals_train.append(pearsonr(expr[kept], model.predict(dosage[kept]))[0])
        gene_snp_train[snp] = rvals_train
        gene_snp_test[snp] = rvals_test
    genes_train[gene] = gene_snp_train
    genes_test[gene] = gene_snp_test

Started gene ENSG00000229450.2
Started gene ENSG00000198502.5
Started gene ENSG00000196126.6
Started gene ENSG00000196735.6
Started gene ENSG00000179344.11
Started gene ENSG00000237541.3
Started gene ENSG00000232629.4
Started gene ENSG00000214425.1
Started gene ENSG00000176681.9
Started gene ENSG00000238083.3


In [33]:
# Inspect the held-out Pearson r values, nested as {gene: {snp: [r per fold]}}.
genes_test

{'ENSG00000229450.2': {'rs9996677': [0.5724624167494263,
   0.46148258039807316,
   0.4312315193652566,
   0.6452514594759247,
   0.5285980755178825],
  'rs9996569': [0.5724624167494263,
   0.46148258039807316,
   0.4312315193652566,
   0.6452514594759247,
   0.5285980755178825],
  'rs11723757': [0.5724624167494263,
   0.46148258039807327,
   0.43123151936525655,
   0.6130339886926801,
   0.4329344313922814],
  'rs11723839': [0.5734649935288293,
   0.4614825803980733,
   0.4312315193652565,
   0.6452514594759247,
   0.5285980755178821],
  'rs10016448': [0.5724624167494263,
   0.46148258039807316,
   0.4312315193652566,
   0.6452514594759247,
   0.5285980755178825],
  'rs11729050': [0.5724624167494263,
   0.46148258039807316,
   0.4312315193652566,
   0.6452514594759247,
   0.5285980755178825],
  'rs28429722': [0.5724624167494263,
   0.46148258039807316,
   0.4312315193652566,
   0.6452514594759247,
   0.5285980755178825],
  'rs10005644': [0.5724624167494263,
   0.46148258039807316,
   

In [7]:
# Flatten {gene: {snp: [r per fold]}} into a long table with one row per
# (gene, snp, split); the five r values are labelled with splits 1..5 in order.
split_col, snp_col, gene_col, rvalue_col = [], [], [], []
for gene, per_snp in genes_test.items():
    for snp, rvals in per_snp.items():
        split_col.extend([1, 2, 3, 4, 5])
        gene_col.extend([gene] * 5)
        snp_col.extend([snp] * 5)
        rvalue_col.extend(rvals)
test_result_df = pd.DataFrame(
    {"gene": gene_col, "snp": snp_col, "split": split_col, "rvalue": rvalue_col}
)
test_result_df["method"] = "SNP"
test_result_df.to_csv("all_cis_snps_pearson.tsv", sep="\t", index=False)

In [8]:
# Pick, per gene, the SNP with the highest mean r across folds.
# NOTE(review): test_result_df holds held-out r values, so the "best" SNP is chosen
# on the same folds it is then reported on -- the reported r is optimistically biased.
mean_r_per_snp = test_result_df[["gene", "snp", "rvalue"]].groupby(["gene", "snp"]).mean()
# idxmax on the (gene, snp) MultiIndex yields one (gene, snp) label per gene.
best_labels = mean_r_per_snp.groupby("gene").idxmax().rvalue.values.tolist()
best_snps_test = pd.DataFrame(best_labels, columns=["gene", "snp"])
best_snps_test.to_csv("best_snp_pearson_name_per_gene.tsv", sep="\t", index=False)

# Per-fold r values of the selected SNPs only.
best_snps_test = pd.merge(test_result_df, best_snps_test[["gene", "snp"]])
best_snps_test[["split", "gene", "rvalue", "method"]].to_csv("best_snps_pearson_all.tsv", sep="\t", index=False)

# Mean and SD of r across folds per gene; the merge names them rvalue_x and rvalue_y.
r_by_gene = best_snps_test[["gene", "rvalue"]].groupby("gene")
mean_by_gene = r_by_gene.mean().reset_index()
sd_by_gene = r_by_gene.std().reset_index()
pd.merge(mean_by_gene, sd_by_gene, on="gene").to_csv("best_snps_pearson_mean.tsv", sep="\t", index=False)

In [13]:
# Inspect the training-side results.
# NOTE(review): the saved output below is a flat {gene: [r per fold]} mapping, i.e. the
# shape produced by the best-SNP search cell further down, not the nested
# {gene: {snp: [...]}} built by the per-SNP regression cell above -- presumably it is
# left over from an out-of-order run and should be regenerated.
genes_train

{'ENSG00000229450.2': [0.5189662164284992,
  0.5610610180269247,
  0.5698212768266382,
  0.5163631071943902,
  0.5436754224723179],
 'ENSG00000198502.5': [0.4292390910665432,
  0.45437166142302454,
  0.44211537653337546,
  0.4498479243648704,
  0.43652739068088897],
 'ENSG00000196126.6': [0.007096021434576227,
  0.029589503477435653,
  0.09098957965641419,
  0.015107536248624855,
  0.003000385784389204],
 'ENSG00000196735.6': [0.1104506093372739,
  0.14309547789366997,
  0.18223368213604185,
  0.13216276734464166,
  0.11765036877180779],
 'ENSG00000179344.11': [0.07649093693395018,
  0.04783421093359422,
  0.11120597051124771,
  0.0709918043335202,
  0.05875537590379161],
 'ENSG00000237541.3': [0.1976531383037759,
  0.07947881630932267,
  0.09427209006525603,
  0.04243777552237732,
  0.07520286080697153],
 'ENSG00000232629.4': [0.12366224773311546,
  0.10892975385802983,
  0.09468604686515325,
  0.045607387128391665,
  0.09004730956723367],
 'ENSG00000214425.1': [0.38311454376240606,
 

In [9]:
# Flatten {gene: {snp: [r per fold]}} (training-side r values) into a long table with
# one row per (gene, snp, split); the five r values are labelled with splits 1..5 in order.
split_col, snp_col, gene_col, rvalue_col = [], [], [], []
for gene, per_snp in genes_train.items():
    for snp, rvals in per_snp.items():
        split_col.extend([1, 2, 3, 4, 5])
        gene_col.extend([gene] * 5)
        snp_col.extend([snp] * 5)
        rvalue_col.extend(rvals)
train_result_df = pd.DataFrame(
    {"gene": gene_col, "snp": snp_col, "split": split_col, "rvalue": rvalue_col}
)
train_result_df["method"] = "SNP"
train_result_df.to_csv("all_cis_snps_pearson_train.tsv", sep="\t", index=False)

In [10]:
# Pick, per gene, the SNP with the highest mean training r across folds.
mean_r_per_snp = train_result_df[["gene", "snp", "rvalue"]].groupby(["gene", "snp"]).mean()
# idxmax on the (gene, snp) MultiIndex yields one (gene, snp) label per gene.
best_labels = mean_r_per_snp.groupby("gene").idxmax().rvalue.values.tolist()
best_snps_train = pd.DataFrame(best_labels, columns=["gene", "snp"])
best_snps_train.to_csv("best_snp_pearson_name_per_gene_train.tsv", sep="\t", index=False)

# Per-fold r values of the selected SNPs only.
best_snps_train = pd.merge(train_result_df, best_snps_train[["gene", "snp"]])
best_snps_train[["split", "gene", "rvalue", "method"]].to_csv("best_snps_pearson_all_train.tsv", sep="\t", index=False)

# Mean and SD of r across folds per gene; the merge names them rvalue_x and rvalue_y.
r_by_gene = best_snps_train[["gene", "rvalue"]].groupby("gene")
mean_by_gene = r_by_gene.mean().reset_index()
sd_by_gene = r_by_gene.std().reset_index()
pd.merge(mean_by_gene, sd_by_gene, on="gene").to_csv("best_snps_pearson_mean_train.tsv", sep="\t", index=False)

----------------------------------------

In [None]:
# Best-SNP search: same per-SNP, per-fold regression as the per-SNP cell, but only
# the SNP with the highest mean held-out r is kept for each gene.
# NOTE(review): this rebinds genes_train / genes_test to flat {gene: [r per fold]}
# mappings; cells that iterate them as {gene: {snp: [...]}} will fail if re-run afterwards.
# NOTE(review): the winner is chosen on held-out r, so its reported test r is
# optimistically biased.
folds = [1, 2, 3, 4, 5]
best_snps, genes_train, genes_test = {}, {}, {}
for gene in data.gene_id.unique():
    print(f"Started gene {gene}")
    gene_df = data[data.gene_id == gene]
    # Sentinel below any attainable mean correlation; a nan mean never beats it.
    best_r = -2
    best_snp = best_rvals_test = best_rvals_train = None
    for snp in gene_df.SNP_ID.unique():
        snp_df = gene_df[gene_df.SNP_ID == snp]
        dosage = np.array(snp_df.GT_encoding).reshape(-1, 1)
        expr = np.array(snp_df.expression)
        rvals_train, rvals_test = [], []
        for k in folds:
            held_out = (snp_df.test_split == k).to_numpy()
            kept = ~held_out

            model = LinearRegression().fit(dosage[kept], expr[kept])

            rvals_test.append(pearsonr(expr[held_out], model.predict(dosage[held_out]))[0])
            rvals_train.append(pearsonr(expr[kept], model.predict(dosage[kept]))[0])
        mean_test_r = np.mean(rvals_test)
        if mean_test_r > best_r:
            best_snp, best_r = snp, mean_test_r
            best_rvals_test, best_rvals_train = rvals_test, rvals_train
    best_snps[gene] = best_snp
    genes_train[gene] = best_rvals_train
    genes_test[gene] = best_rvals_test

In [30]:
# Per-gene regression for each fold: expression ~ GT_encoding using every row of
# `data` for the gene. Results are stored as {fold: {gene: r}}.
# NOTE(review): `data` has one row per (SNP, sample), so with several SNPs per gene
# all of them are pooled into one single-feature model -- confirm this is intended
# (the saved output may stem from a `data` that held one SNP per gene).
splits_train, splits_test = {}, {}
gene_ids = data.gene_id.unique()
for i in range(1, 6):
    r_values_train, r_values_test = {}, {}
    for gene in gene_ids:
        # extract data
        gene_df = data[data.gene_id == gene]
        in_test = (gene_df.test_split == i).to_numpy()
        in_train = ~in_test
        X = np.array(gene_df["GT_encoding"]).reshape(-1, 1)
        y = np.array(gene_df.expression)

        # train model on the rows outside fold i
        model = LinearRegression().fit(X[in_train], y[in_train])

        # Pearson r between observed and predicted expression, train and held-out
        r_values_train[gene] = pearsonr(y[in_train], model.predict(X[in_train]))[0]
        r_values_test[gene] = pearsonr(y[in_test], model.predict(X[in_test]))[0]
    splits_train[i] = r_values_train
    splits_test[i] = r_values_test

In [31]:
# Inspect the held-out Pearson r values, stored as {fold: {gene: r}}.
splits_test

{1: {'ENSG00000229450.2': 0.5838158873780467,
  'ENSG00000196126.6': -0.010763378614997422,
  'ENSG00000198502.5': 0.10200379539571876,
  'ENSG00000179344.11': 0.4347964807105317,
  'ENSG00000232629.4': 0.5652492282985153,
  'ENSG00000237541.3': 0.6617140720179402,
  'ENSG00000196735.6': 0.3548608619011706,
  'ENSG00000214425.1': 0.3825433425358211,
  'ENSG00000176681.9': 0.2777803066581089,
  'ENSG00000238083.3': 0.16879455130923565},
 2: {'ENSG00000229450.2': 0.4691115855795315,
  'ENSG00000196126.6': 0.27985575993872325,
  'ENSG00000198502.5': 0.07079277051916556,
  'ENSG00000179344.11': 0.5890411023450833,
  'ENSG00000232629.4': 0.4018050698611545,
  'ENSG00000237541.3': 0.44436281545505874,
  'ENSG00000196735.6': 0.560246279084256,
  'ENSG00000214425.1': 0.32156282683694326,
  'ENSG00000176681.9': 0.07276634100342154,
  'ENSG00000238083.3': 0.038449204012593045},
 3: {'ENSG00000229450.2': 0.4413664714953019,
  'ENSG00000196126.6': 0.375531038379767,
  'ENSG00000198502.5': -0.06995

### Extract and plot data

In [32]:
# Flatten the {fold: {gene: r}} mappings into long tables with one row per (split, gene).
result_columns = ["split", "gene_id", "rvalue"]

test_df = pd.DataFrame(
    [
        (fold, gene_id, rvalue)
        for fold, per_gene in splits_test.items()
        for gene_id, rvalue in per_gene.items()
    ],
    columns=result_columns,
)

train_df = pd.DataFrame(
    [
        (fold, gene_id, rvalue)
        for fold, per_gene in splits_train.items()
        for gene_id, rvalue in per_gene.items()
    ],
    columns=result_columns,
)

In [33]:
# Per-gene mean and standard deviation of the held-out Pearson r across folds.
# Only `rvalue` is aggregated: averaging the `split` label is meaningless, and once
# the string `method` column has been added to test_df an all-column mean raises
# under pandas >= 2.0, which made this cell fail on re-run.
rvalue_by_gene = test_df.groupby("gene_id")["rvalue"]
x1 = rvalue_by_gene.mean().reset_index()
x2 = rvalue_by_gene.std().reset_index()
# The merge suffixes the two columns: rvalue_x is the mean, rvalue_y the SD.
x = pd.merge(x1, x2, on="gene_id")
x["method"] = "SNP"
# Written to its own file: a later cell stores the per-fold table in "snp.tsv",
# which previously overwrote this summary.
x.to_csv("snp_mean.tsv", sep="\t", index=False)

In [34]:
# Per-gene standard deviation across folds. Every column is aggregated, so the
# `split` column shows the SD of the fold labels themselves, not a result.
test_df.groupby("gene_id").std()

Unnamed: 0_level_0,split,rvalue
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000176681.9,1.581139,0.165889
ENSG00000179344.11,1.581139,0.086497
ENSG00000196126.6,1.581139,0.172982
ENSG00000196735.6,1.581139,0.143172
ENSG00000198502.5,1.581139,0.101835
ENSG00000214425.1,1.581139,0.116644
ENSG00000229450.2,1.581139,0.053658
ENSG00000232629.4,1.581139,0.169125
ENSG00000237541.3,1.581139,0.07753
ENSG00000238083.3,1.581139,0.17069


In [35]:
# Average the held-out r per gene first, then report mean and SD across genes.
mean_r_per_gene = test_df.groupby("gene_id").mean().rvalue
print(f"mean: {mean_r_per_gene.mean()}")
print(f"SD: {mean_r_per_gene.std()}")

mean: 0.35419810599101403
SD: 0.15248618932782093


In [36]:
# Tag every row with the method name; this adds a string column to test_df in place.
test_df["method"] = "SNP"
# Store the per-fold table (split, gene_id, rvalue, method); replaces any existing snp.tsv.
test_df.to_csv("snp.tsv", sep="\t", index=False)