In [1]:
import numpy as np
import gplearn

from gplearn.genetic import SymbolicTransformer
from sklearn.utils import check_random_state
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
# NOTE(review): `__all__` only controls what `from <module> import *` exports,
# so it presumably has no effect when this code runs as notebook cells. Of the
# names listed, only SymbolicTransformer is imported above — confirm whether
# this line is still needed.
__all__ = ['SymbolicRegressor', 'SymbolicClassifier', 'SymbolicTransformer']


In [2]:
# Seeded random state so the shuffle below gives the same order on every run.
rng = check_random_state(0)
diabetes = load_diabetes()

# Feature columns, per the scikit-learn diabetes dataset description:
# - age     age in years
# - sex
# - bmi     body mass index
# - bp      average blood pressure
# - s1      tc, total serum cholesterol
# - s2      ldl, low-density lipoproteins
# - s3      hdl, high-density lipoproteins
# - s4      tch, total cholesterol / HDL
# - s5      ltg, possibly log of serum triglycerides level
# - s6      glu, blood sugar level

# Apply one random permutation to both the features and the targets so that
# each row of `data` stays paired with its entry in `target`.
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]

In [3]:
# Baseline: Ridge regression fitted on the first 300 rows of the original
# features, then scored on the remaining held-out rows.
est = Ridge().fit(diabetes.data[:300], diabetes.target[:300])
print(est.score(diabetes.data[300:], diabetes.target[300:]))


0.43405742105789413


In [4]:
# Primitive operations the evolved programs are allowed to combine.
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Configure the genetic-programming feature generator. The fixed random_state
# keeps the run repeatable; verbose=1 prints the per-generation progress table.
gp = SymbolicTransformer(
    generations=20,
    population_size=2000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=3,
)

# Fit on the first 300 rows only, so the held-out rows stay unseen.
gp.fit(diabetes.data[:300], diabetes.target[:300])

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    11.37         0.126618        5         0.612827             0.68      2.28m
   1     6.63         0.344375        3         0.659086         0.451797     51.66s
   2     5.36         0.473731        3         0.669019         0.321485      1.39m
   3     4.74         0.587613        3         0.673354          0.31249      1.03m
   4     4.39         0.597151       13         0.675282          0.49482      1.20m
   5     4.31         0.611995       15         0.686134         0.148203      1.14m
   6     4.89         0.611891        9         0.685649         0.199002      1.33m
   7     6.49         0.617031        9         0.688309         0.287286      1.18m
   8     9.19         0.628618       17         0.720606          0.26362  

In [6]:
# Run the fitted transformer over every row, then append the generated
# features as extra columns alongside the original ones.
gp_features = gp.transform(diabetes.data)
new_diabetes = np.concatenate((diabetes.data, gp_features), axis=1)

In [7]:
# Same Ridge setup as the baseline, but trained and scored on the augmented
# feature matrix (original columns plus the generated ones).
est = Ridge().fit(new_diabetes[:300], diabetes.target[:300])
print(est.score(new_diabetes[300:], diabetes.target[300:]))


0.5336788517320445


Explain how the symbolic transformer method helps to improve the regression’s performance.

The linear model was able to take advantage of some non-linear features to fit the data better. This can be seen in the improved $R^2$ result: 0.5336788517320445 with the symbolic-transformer features added, versus 0.43405742105789413 for Ridge regression on the original features alone.

Symbolic regression sits between parametric statistics and neural-network function approximation. With symbolic regression, the function we learn can have quite high expressivity and is selected from a large space of functions.

Part of the advantage is that symbolic regression is fast: it leverages previous experience and performs inference in a single forward pass. It is also less prone to overfitting (it does not optimize a loss on the inputs).
