### This notebook strives to reproduce the work done in the research paper "Human Age Prediction Based on DNA Methylation Using a Gradient Boosting Regressor"
https://www.mdpi.com/2073-4425/9/9/424

In [1]:
import pandas as pd
import matplotlib as mpl
import numpy as np
import sklearn

%matplotlib inline
# Locations of the input CSV files, given relative to the notebook's working directory.
HEALTHY_PATH = "./data/Healthy_Methylation_Dataset.csv"
DISEASED_PATH = "./data/Disease_Methylation_Dataset.csv"
# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(0)

#TODO: on-demand data fetching, (pre)processing and merging?

In [2]:
# Read the healthy-cohort CSV from disk, discard every row that contains an NA,
# and renumber the remaining rows 0..n-1 (the old index is thrown away).
healthy = (
    pd.read_csv(HEALTHY_PATH)
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Bucket the continuous "age" column into equal-width bins so that the
# train/test split can be stratified on it. Bins are labelled 0..AGE_GROUP_AMOUNT-1.
AGE_GROUP_AMOUNT = 8
age_bins = pd.cut(
    healthy["age"],
    bins=AGE_GROUP_AMOUNT,
    labels=range(AGE_GROUP_AMOUNT),
)
healthy["age_group"] = age_bins

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split: each age group is represented in the train and
# test sets in the same proportion as in the full frame.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(split.split(healthy, healthy["age_group"]))
strat_train_set = healthy.iloc[train_index]
strat_test_set = healthy.iloc[test_index]

# From here on `healthy` refers to an independent copy of the training portion only.
healthy = strat_train_set.copy()

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class Cleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes helper columns from a DataFrame.

    The columns ``"...1"`` and ``"age_group"`` are dropped when present;
    every other column is left untouched.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Drop the helper columns from ``X`` and return ``X``.

        The drop is performed in place, so the frame passed in by the caller
        is modified and the very same object is returned.
        """
        helper_columns = ["...1", "age_group"]
        # errors="ignore": a column that is already absent is silently skipped.
        X.drop(columns=helper_columns, inplace=True, errors="ignore")
        return X

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline applied to the training frame:
#   1. 'clean'      - Cleaner drops the "...1" and "age_group" helper columns.
#   2. 'std_scaler' - StandardScaler standardises every remaining column.
# Fixed: the first step previously referenced the misspelled name `Cleane7r`,
# which raised a NameError as soon as this cell was executed.
pipeline = Pipeline([
        ('clean', Cleaner()),
        ('std_scaler', StandardScaler()),
    ])

In [7]:
# Run the cleaning + scaling pipeline on the training frame. The result is
# re-wrapped in a DataFrame so the row index and column labels are kept.
scaled_values = pipeline.fit_transform(healthy)

# Determine the surviving column labels explicitly (the cleaner removes the
# "...1" and "age_group" columns) instead of relying on the cleaner having
# mutated `healthy` in place before `healthy.columns` is read.
# errors="ignore" makes this a no-op when the columns are already gone.
kept_columns = healthy.columns.drop(["...1", "age_group"], errors="ignore")
healthy = pd.DataFrame(scaled_values, index=healthy.index, columns=kept_columns)

# NOTE(review): "age" passes through the scaler together with the features, so
# the target below is in standardised units, not years - confirm this is intended.
X = healthy.drop(columns="age")
y = healthy["age"]

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# Random-forest hyper-parameter grid: 3 x 3 x 3 = 27 candidate combinations.
param_grid = [
    {
        "n_estimators": [350, 400, 450],
        "max_leaf_nodes": [340, 350, 360],
        "max_features": [3, 4, 5],
    }
]

rfr = RandomForestRegressor(random_state=0, n_jobs=-1, oob_score=True)

# Exhaustive search over the grid with 5-fold cross-validation, ranking
# candidates by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

grid_search.fit(X, y)


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(n_jobs=-1, oob_score=True,
                                             random_state=0),
             n_jobs=-1,
             param_grid=[{'max_features': [3, 4, 5],
                          'max_leaf_nodes': [340, 350, 360],
                          'n_estimators': [350, 400, 450]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [22]:
# Negated MSE of the random-forest search on X, y. These are the same X, y the
# search was fitted on, so this is a training-set score, not a held-out estimate.
grid_search.score(X, y)

-0.00908127951169586

In [30]:
from sklearn.linear_model import Ridge

# Ridge regularisation strengths: 10^0, 10^-1, ..., 10^-6 (seven candidates).
param_grid = [
    {"alpha": [10 ** -exponent for exponent in range(7)]}
]

ridge = Ridge()

# Exhaustive search over the alphas with 10-fold cross-validation, ranking
# candidates by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)

grid_search.fit(X, y)

GridSearchCV(cv=10, estimator=Ridge(), n_jobs=-1,
             param_grid=[{'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05,
                                    1e-06]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [33]:
# Negated MSE of the Ridge search on X, y. These are the same X, y the search
# was fitted on, so this is a training-set score, not a held-out estimate.
grid_search.score(X, y)

-0.2543015402242401

In [37]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting hyper-parameter grid: 5 x 5 x 3 = 75 candidate combinations.
param_grid = [
    {
        "n_estimators": [10, 20, 50, 100, 200],
        "max_depth": [2, 3, 5, 10, 20],
        "learning_rate": [0.5, 0.75, 1],
    }
]

gbr = GradientBoostingRegressor(random_state=0)

# Exhaustive search over the grid with 5-fold cross-validation, ranking
# candidates by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

grid_search.fit(X, y)
# Scored on the same X, y used for fitting, i.e. a training-set score.
grid_search.score(X, y)

-0.011668753213221729