In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from statsmodels.api import OLS, add_constant
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from statsmodels.api import add_constant

In [2]:
!pip install autograd
from autograd import grad
from autograd import numpy as anp

Collecting autograd
  Downloading autograd-1.6.2-py3-none-any.whl.metadata (706 bytes)
Downloading autograd-1.6.2-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: autograd
Successfully installed autograd-1.6.2


In [3]:
# Parameters
num_individuals = 300
num_snps = 400
heritability = 0.8  # Proportion of variance explained by genetics

# Set random seed for reproducibility
np.random.seed(42)

# Generate Synthetic Genotype Data
maf = np.random.uniform(0.05, 0.5, num_snps)
genotypes = np.random.binomial(2, maf, (num_individuals, num_snps))

# Simulate Phenotype Data
num_causal_snps = 10
causal_indices = np.random.choice(num_snps, num_causal_snps, replace=False)
effect_sizes = np.random.normal(0, 0.1, num_causal_snps)
genetic_contribution = genotypes[:, causal_indices].dot(effect_sizes)
environmental_contribution = np.random.normal(0, np.sqrt(1 - heritability), num_individuals)
phenotypes = genetic_contribution + environmental_contribution

In [4]:
# Parameters
num_individuals = 300
num_snps = 400
heritability = 0.8  # Proportion of variance explained by genetics

# Set random seed for reproducibility
np.random.seed(42)

# Generate Synthetic Genotype Data
maf = np.random.uniform(0.05, 0.5, num_snps)
genotypes = np.random.binomial(2, maf, (num_individuals, num_snps))

# Simulate Phenotype Data
num_causal_snps = 10
causal_indices = np.random.choice(num_snps, num_causal_snps, replace=False)
effect_sizes = np.random.normal(0, 0.1, num_causal_snps)
genetic_contribution = genotypes[:, causal_indices].dot(effect_sizes)
environmental_contribution = np.random.normal(0, np.sqrt(1 - heritability), num_individuals)
phenotypes = genetic_contribution + environmental_contribution

# Splitting data into training and validation sets
indices = np.arange(num_individuals)
np.random.shuffle(indices)
split_point = num_individuals // 2

training_indices = indices[:split_point]
validation_indices = indices[split_point:]

training_genotypes = genotypes[training_indices]
training_phenotypes = phenotypes[training_indices]

validation_genotypes = genotypes[validation_indices]
validation_phenotypes = phenotypes[validation_indices]

# Scale the genotype and phenotype data for both training and validation sets
scaler_genotypes = StandardScaler()
scaler_phenotypes = StandardScaler()

training_genotypes_scaled = scaler_genotypes.fit_transform(training_genotypes)
training_phenotypes_scaled = scaler_phenotypes.fit_transform(training_phenotypes.reshape(-1, 1)).flatten()

validation_genotypes_scaled = scaler_genotypes.transform(validation_genotypes)
validation_phenotypes_scaled = scaler_phenotypes.transform(validation_phenotypes.reshape(-1, 1)).flatten()

In [5]:
def perform_ols(training_genotypes_scaled, training_phenotypes_scaled, num_iterations=10000, learning_rate=0.0001):
    num_individuals, num_snps = training_genotypes_scaled.shape
    genotypes_with_bias = anp.hstack([anp.ones((num_individuals, 1)), training_genotypes_scaled])

    def objective(beta_coefficients):
        predictions = anp.dot(genotypes_with_bias, beta_coefficients)
        return anp.mean((training_phenotypes_scaled - predictions) ** 2)

    beta_coefficients = np.zeros(num_snps + 1)
    gradient = grad(objective)

    for iteration in range(num_iterations):
        grads = gradient(beta_coefficients)
        beta_coefficients -= learning_rate * grads

    return beta_coefficients

def perform_lasso(training_genotypes_scaled, training_phenotypes_scaled, lambda_, num_iterations=10000, learning_rate=0.0001):
    num_individuals, num_snps = training_genotypes_scaled.shape
    genotypes_with_bias = anp.hstack([anp.ones((num_individuals, 1)), training_genotypes_scaled])

    def objective(beta_coefficients):
        predictions = anp.dot(genotypes_with_bias, beta_coefficients)
        return anp.mean((training_phenotypes_scaled - predictions) ** 2) + lambda_ * anp.sum(anp.abs(beta_coefficients))

    beta_coefficients = np.zeros(num_snps + 1)
    gradient = grad(objective)

    for iteration in range(num_iterations):
        grads = gradient(beta_coefficients)
        beta_coefficients -= learning_rate * grads

    return beta_coefficients

In [6]:
print("Training:")

# Regularization strength for Lasso
lambda_strength = 0.1

# Perform OLS and Lasso regression
print("Fitting OLS...")
beta_ols = perform_ols(training_genotypes_scaled, training_phenotypes_scaled)
print("Fitting Lasso...")
beta_lasso = perform_lasso(training_genotypes_scaled, training_phenotypes_scaled, lambda_strength)

# Sort beta coefficients by absolute value and get the indices of the top 10 largest
top10_indices_ols = np.argsort(-np.abs(beta_ols))[:10]
top10_indices_lasso = np.argsort(-np.abs(beta_lasso))[:10]

# Print the top 10 largest beta coefficients for OLS
print("Top 10 largest beta coefficients from OLS:")
for index in top10_indices_ols:
    print(f"Beta[{index}] = {beta_ols[index]}")

# Print the top 10 largest beta coefficients for Lasso
print("Top 10 largest beta coefficients from Lasso:")
for index in top10_indices_lasso:
    print(f"Beta[{index}] = {beta_lasso[index]}")

Training:
Fitting OLS...
Fitting Lasso...
Top 10 largest beta coefficients from OLS:
Beta[165] = -0.09788800862287814
Beta[31] = 0.09610983861634803
Beta[148] = -0.09504057790717642
Beta[263] = 0.09182873850466326
Beta[109] = -0.08944154228043338
Beta[130] = 0.08676238961544808
Beta[154] = -0.08654664351050392
Beta[103] = 0.0860061214569668
Beta[376] = 0.08541487132076087
Beta[80] = 0.07819727509346429
Top 10 largest beta coefficients from Lasso:
Beta[263] = 0.11482955880045172
Beta[148] = -0.10709074696925505
Beta[109] = -0.1004523201332034
Beta[80] = 0.09058539470788907
Beta[103] = 0.0872781799490329
Beta[130] = 0.08372157515489113
Beta[304] = -0.08267415121659777
Beta[35] = -0.08253669499236652
Beta[284] = 0.08093000793618478
Beta[376] = 0.07872705787627657


In [7]:
print("Training:")
# Construct PRS using the scaled training genotype data
prs_ols = training_genotypes_scaled.dot(beta_ols[1:]) + beta_ols[0]
prs_lasso = training_genotypes_scaled.dot(beta_lasso[1:]) + beta_lasso[0]

# Normalize PRS
normalized_prs_ols = (prs_ols - np.mean(prs_ols)) / np.std(prs_ols)
normalized_prs_lasso = (prs_lasso - np.mean(prs_lasso)) / np.std(prs_lasso)

print("Normalized PRS (OLS):")
print("Top 3:", sorted(normalized_prs_ols)[-3:])  # Prints the top 3 values
print("Bottom 3:", sorted(normalized_prs_ols)[:3])  # Prints the bottom 3 values

print("Normalized PRS (Lasso):")
print("Top 3:", sorted(normalized_prs_lasso)[-3:])  # Prints the top 3 values
print("Bottom 3:", sorted(normalized_prs_lasso)[:3])  # Prints the bottom 3 values


print("\nValidation:")
# Construct PRS using the scaled validation genotype data
prs_ols = validation_genotypes_scaled.dot(beta_ols[1:]) + beta_ols[0]
prs_lasso = validation_genotypes_scaled.dot(beta_lasso[1:]) + beta_lasso[0]

# Normalize PRS
normalized_prs_ols = (prs_ols - np.mean(prs_ols)) / np.std(prs_ols)
normalized_prs_lasso = (prs_lasso - np.mean(prs_lasso)) / np.std(prs_lasso)

print("Normalized PRS (OLS):")
print("Top 3:", sorted(normalized_prs_ols)[-3:])  # Prints the top 3 values
print("Bottom 3:", sorted(normalized_prs_ols)[:3])  # Prints the bottom 3 values

print("Normalized PRS (Lasso):")
print("Top 3:", sorted(normalized_prs_lasso)[-3:])  # Prints the top 3 values
print("Bottom 3:", sorted(normalized_prs_lasso)[:3])  # Prints the bottom 3 values

Training:
Normalized PRS (OLS):
Top 3: [2.23538814943223, 2.834371284564362, 2.9855124668572115]
Bottom 3: [-3.164400320935247, -2.696696783895861, -1.9305980073653064]
Normalized PRS (Lasso):
Top 3: [2.428573104742349, 2.5182001793488387, 3.315364200062122]
Bottom 3: [-2.8480629827430572, -2.4831009437179126, -2.4636421753956737]

Validation:
Normalized PRS (OLS):
Top 3: [2.297281137515346, 2.3158656963489186, 2.3253957680097597]
Bottom 3: [-2.3478376595036763, -2.099449237592803, -1.910316938235531]
Normalized PRS (Lasso):
Top 3: [2.0643929817123214, 2.178546991865247, 3.2210471132757545]
Bottom 3: [-2.7355363687683587, -2.237731143291401, -2.1651596006722147]


In [8]:
from scipy.stats import pearsonr

print("Training:")
# Calculate statistics using scaled training phenotypes
corr_ols, _ = pearsonr(training_phenotypes_scaled, normalized_prs_ols)
variance_explained_ols = corr_ols ** 2
corr_lasso, _ = pearsonr(training_phenotypes_scaled, normalized_prs_lasso)
variance_explained_lasso = corr_lasso ** 2

# Output results
print("OLS PRS - Variance explained:", variance_explained_ols)
print("OLS PRS - Correlation:", corr_ols)
print("Lasso PRS - Variance explained:", variance_explained_lasso)
print("Lasso PRS - Correlation:", corr_lasso)

print("\nValidation:")
# Calculate statistics using scaled validation phenotypes
corr_ols, _ = pearsonr(validation_phenotypes_scaled, normalized_prs_ols)
variance_explained_ols = corr_ols ** 2
corr_lasso, _ = pearsonr(validation_phenotypes_scaled, normalized_prs_lasso)
variance_explained_lasso = corr_lasso ** 2

# Output results
print("OLS PRS - Variance explained:", variance_explained_ols)
print("OLS PRS - Correlation:", corr_ols)
print("Lasso PRS - Variance explained:", variance_explained_lasso)
print("Lasso PRS - Correlation:", corr_lasso)

Training:
OLS PRS - Variance explained: 0.0075344518303416385
OLS PRS - Correlation: 0.08680122021228526
Lasso PRS - Variance explained: 0.006305345864570517
Lasso PRS - Correlation: 0.07940620797249115

Validation:
OLS PRS - Variance explained: 0.0156595199030192
OLS PRS - Correlation: 0.1251380034322875
Lasso PRS - Variance explained: 0.021573451622773472
Lasso PRS - Correlation: 0.1468790373837379
