# Data import

In [2]:
import pandas as pd

In [19]:
# All three tables are files of the same Zenodo record; only the file name differs.
_zenodo_files = "https://zenodo.org/record/4056406/files"

cpg_url   = f"{_zenodo_files}/cpg_methylation_beta_values.tsv?download=1"
annot_url = f"{_zenodo_files}/cpg_methylation_cpg_to_annotation.tsv?download=1"
age_url   = f"{_zenodo_files}/cpg_methylation_sample_age.tsv?download=1"

## CpG methylation values

In [32]:
# Load the tab-separated CpG beta-value table from the Zenodo URL.
cpg = pd.read_csv(cpg_url, sep="\t")
# Only the last expression of a cell is rendered, so a bare `cpg.shape` here
# would be evaluated and thrown away; print it to actually show the dimensions.
print(cpg.shape)
cpg.head()

Unnamed: 0,CpG_id,GSM712302,GSM712303,GSM712306,GSM712307,GSM712308,GSM712309,GSM712310,GSM712311,GSM712312,...,GSM712363,GSM712364,GSM712365,GSM712366,GSM712367,GSM712368,GSM712369,GSM712370,GSM712371,GSM712372
0,cg00000292,0.790114,0.683727,0.76627,0.820927,0.633326,0.747806,0.791249,0.810918,0.784657,...,0.726795,0.832091,0.73915,0.781877,0.807861,0.806516,0.810198,0.735238,0.664598,0.618429
1,cg00002426,0.682118,0.445706,0.55044,0.742424,0.181034,0.524531,0.548844,0.45853,0.698093,...,0.342844,0.625413,0.24413,0.505282,0.525227,0.603758,0.579088,0.414969,0.480901,0.31005
2,cg00003994,0.071778,0.074375,0.074744,0.077173,0.070584,0.069245,0.072755,0.078325,0.066935,...,0.130879,0.124522,0.056615,0.097008,0.1156,0.073913,0.057869,0.068152,0.143305,0.133333
3,cg00005847,0.193069,0.15573,0.188825,0.161436,0.163866,0.201299,0.172892,0.172315,0.150826,...,0.19284,0.169374,0.185104,0.238064,0.195254,0.179029,0.2251,0.231645,0.22807,0.246311
4,cg00006414,0.062389,0.061903,0.073632,0.088877,0.054102,0.072834,0.093356,0.083293,0.068694,...,0.084818,0.113481,0.068301,0.078775,0.052264,0.091395,0.094637,0.075248,0.074052,0.096774


In [31]:
# Reshape wide -> long: every non-id column becomes a `sample` value, and the
# cell it held becomes that row's `beta`.
tidy_cpg = cpg.melt(
    id_vars=["CpG_id"],
    var_name="sample",
    value_name="beta",
)
tidy_cpg.head()

Unnamed: 0,CpG_id,sample,beta
0,cg00000292,GSM712302,0.790114
1,cg00002426,GSM712302,0.682118
2,cg00003994,GSM712302,0.071778
3,cg00005847,GSM712302,0.193069
4,cg00006414,GSM712302,0.062389


In [34]:
# (rows, columns) of the long-format table: one row per (CpG_id, sample) pair.
tidy_cpg.shape

(1820148, 3)

## Age of the individuals

In [23]:
# Load the tab-separated per-sample metadata table (includes the `age` column
# used as the regression label further down).
age = pd.read_csv(age_url, delimiter="\t")
age.head()

Unnamed: 0,sample,title,source,pair,race,age
0,GSM712302,111,saliva sample,1,White,40
1,GSM712303,112,saliva sample,1,White,40
2,GSM712306,811,saliva sample,8,White,39
3,GSM712307,812,saliva sample,8,White,39
4,GSM712308,911,saliva sample,9,White,39


In [35]:
# (rows, columns) of the sample metadata table.
age.shape

(66, 6)

## Merge CpG beta values (features) and age information (labels)

In [38]:
# Attach the per-sample metadata (including age) to every (CpG_id, sample) row.
# The join key is spelled out so the merge cannot silently widen to any other
# column name the two frames might come to share, and `validate` raises a
# MergeError if `age` ever holds more than one row per sample (which would
# duplicate beta values on the left side).
tidy_cpg_with_age = tidy_cpg.merge(
    age,
    how="left",
    on="sample",
    validate="many_to_one",
)
tidy_cpg_with_age

Unnamed: 0,CpG_id,sample,beta,title,source,pair,race,age
0,cg00000292,GSM712302,0.790114,111,saliva sample,1,White,40
1,cg00002426,GSM712302,0.682118,111,saliva sample,1,White,40
2,cg00003994,GSM712302,0.071778,111,saliva sample,1,White,40
3,cg00005847,GSM712302,0.193069,111,saliva sample,1,White,40
4,cg00006414,GSM712302,0.062389,111,saliva sample,1,White,40
...,...,...,...,...,...,...,...,...
1820143,cg27657283,GSM712372,0.119795,7412,saliva sample,74,White,45
1820144,cg27661264,GSM712372,0.499846,7412,saliva sample,74,White,45
1820145,cg27662379,GSM712372,0.045600,7412,saliva sample,74,White,45
1820146,cg27662877,GSM712372,0.066954,7412,saliva sample,74,White,45


In [69]:
# Back to wide format for modelling: one row per sample, one column per CpG_id,
# cells filled with the beta value.
cpg_with_age_pivoted = tidy_cpg_with_age.pivot(
    index="sample",
    columns="CpG_id",
    values="beta",
)
cpg_with_age_pivoted.shape

(66, 27578)

In [70]:
# Look ages up by sample ID rather than by position. `pivot` returns rows sorted
# by the `sample` index, so a positional `age["age"].tolist()` assignment is only
# correct when the metadata file happens to be in that same order.
# `verify_integrity=True` fails fast if a sample appears twice in the metadata.
age_by_sample = age.set_index("sample", verify_integrity=True)["age"]
# `.loc` with the pivot's index raises KeyError if any sample has no age entry,
# instead of silently inserting NaN labels.
cpg_with_age_pivoted["age"] = age_by_sample.loc[cpg_with_age_pivoted.index]
cpg_with_age_pivoted.head()

CpG_id,cg00000292,cg00002426,cg00003994,cg00005847,cg00006414,cg00007981,cg00008493,cg00008713,cg00009407,cg00010193,...,cg27654142,cg27655855,cg27655905,cg27657249,cg27657283,cg27661264,cg27662379,cg27662877,cg27665659,age
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM712302,0.790114,0.682118,0.071778,0.193069,0.062389,0.048428,0.967453,0.02305,0.023487,0.787841,...,0.054558,0.856433,0.055375,0.092243,0.046561,0.386896,0.043485,0.045624,0.046183,40
GSM712303,0.683727,0.445706,0.074375,0.15573,0.061903,0.035971,0.958629,0.019968,0.02566,0.768339,...,0.036168,0.869446,0.082618,0.108548,0.049381,0.418838,0.034128,0.048038,0.040252,40
GSM712306,0.76627,0.55044,0.074744,0.188825,0.073632,0.03717,0.960194,0.020195,0.033395,0.797935,...,0.046588,0.860874,0.06762,0.093421,0.067761,0.405881,0.046372,0.048445,0.04154,39
GSM712307,0.820927,0.742424,0.077173,0.161436,0.088877,0.062858,0.958553,0.025576,0.03342,0.729069,...,0.043486,0.857088,0.080053,0.088147,0.070636,0.39067,0.04026,0.055248,0.049669,39
GSM712308,0.633326,0.181034,0.070584,0.163866,0.054102,0.066462,0.959126,0.022957,0.032702,0.795442,...,0.042364,0.881988,0.071271,0.11645,0.044316,0.425491,0.044659,0.060418,0.038915,39


## Drop NA values

In [86]:
# Discard every column that has a missing value in at least one sample.
df_ready_for_ml = cpg_with_age_pivoted.dropna(axis="columns", how="any")
df_ready_for_ml.head()

CpG_id,cg00000292,cg00002426,cg00003994,cg00005847,cg00006414,cg00007981,cg00008493,cg00008713,cg00009407,cg00010193,...,cg27654142,cg27655855,cg27655905,cg27657249,cg27657283,cg27661264,cg27662379,cg27662877,cg27665659,age
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM712302,0.790114,0.682118,0.071778,0.193069,0.062389,0.048428,0.967453,0.02305,0.023487,0.787841,...,0.054558,0.856433,0.055375,0.092243,0.046561,0.386896,0.043485,0.045624,0.046183,40
GSM712303,0.683727,0.445706,0.074375,0.15573,0.061903,0.035971,0.958629,0.019968,0.02566,0.768339,...,0.036168,0.869446,0.082618,0.108548,0.049381,0.418838,0.034128,0.048038,0.040252,40
GSM712306,0.76627,0.55044,0.074744,0.188825,0.073632,0.03717,0.960194,0.020195,0.033395,0.797935,...,0.046588,0.860874,0.06762,0.093421,0.067761,0.405881,0.046372,0.048445,0.04154,39
GSM712307,0.820927,0.742424,0.077173,0.161436,0.088877,0.062858,0.958553,0.025576,0.03342,0.729069,...,0.043486,0.857088,0.080053,0.088147,0.070636,0.39067,0.04026,0.055248,0.049669,39
GSM712308,0.633326,0.181034,0.070584,0.163866,0.054102,0.066462,0.959126,0.022957,0.032702,0.795442,...,0.042364,0.881988,0.071271,0.11645,0.044316,0.425491,0.044659,0.060418,0.038915,39


# Create train and test sets

In [43]:
from sklearn.model_selection import train_test_split

## Extract matrix of features

In [87]:
# Features: every column except the label. Dropping by name (rather than slicing
# off the last column) keeps this correct even if `age` is not the final column.
X = df_ready_for_ml.drop(columns="age")
# Labels as a plain Python list, in the same row order as X.
y = df_ready_for_ml["age"].tolist()

In [88]:
# (samples, features) of the feature matrix.
X.shape

(66, 26653)

In [89]:
# Peek at the first five labels.
y[:5]

[40, 40, 39, 39, 39]

In [90]:
# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
# NOTE(review): the metadata has a `pair` column — if paired samples are related,
# a plain random split can put both members on opposite sides. Confirm whether a
# group-aware split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Regularised regression

In [91]:
from sklearn.linear_model import Lasso

In [92]:
# Regularisation strength passed to the Lasso estimator.
alpha = 0.1

# Build the estimator, then fit it on the training split only.
lasso = Lasso(alpha=alpha)
lasso.fit(X=X_train, y=y_train)

Lasso(alpha=0.1)