In [1]:
import gplearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# import dataset
df = pd.read_csv(r'For_modeling.csv', index_col=0)

In [3]:
x = df.drop('Duration', axis=1)
print(x.columns)

Index(['Distance', 'PLong', 'PLatd', 'DLong', 'DLatd', 'Haversine', 'Pmonth',
       'Pday', 'Phour', 'Pmin', 'PDweek', 'Dmonth', 'Dday', 'Dhour', 'Dmin',
       'DDweek', 'Temp', 'Precip', 'Wind', 'Humid', 'Solar', 'Snow',
       'GroundTemp', 'Dust'],
      dtype='object')


In [4]:
y = df.loc[:,'Duration'].to_frame()
type(y)
print(y)

         Duration
0               3
1              24
2               8
3               8
4               4
...           ...
9830306        67
9830307        58
9830308       118
9830309        90
9830310       116

[9601139 rows x 1 columns]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

In [6]:
from gplearn.genetic import SymbolicRegressor
import pickle

In [18]:
gp = SymbolicRegressor(population_size=100, stopping_criteria=0.01, verbose=1, random_state=10,
                       tournament_size=16, p_crossover=0.5,p_hoist_mutation=0.05,
                       p_point_mutation=0.25,p_subtree_mutation=0.20, n_jobs=5)

In [20]:
gp.fit(X_train, np.ravel(y_train))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    23.46      7.04854e+14        5          18.0556              N/A      7.55m
   1     5.88          49.6308        7           18.024              N/A      2.69m
   2     5.58          538.352        5          17.6522              N/A      2.48m
   3     5.40          243.168        3          16.0351              N/A      2.36m
   4     4.92          104.726        3          16.0351              N/A      2.18m
   5     4.12          5261.19       11          16.0157              N/A      1.85m
   6     4.56          250.351       11          16.0157              N/A      1.70m
   7     7.36           6047.8       13          15.7673              N/A      2.04m
   8    11.50          339.857       15          15.6805              N/A  

SymbolicRegressor(n_jobs=5, p_crossover=0.5, p_hoist_mutation=0.05,
                  p_point_mutation=0.25, p_subtree_mutation=0.2,
                  population_size=100, random_state=10, stopping_criteria=0.01,
                  tournament_size=16, verbose=1)

In [31]:
print('R2: ' + str(gp.score(X_test, y_test)))

R2: 0.17977415898570337


In [32]:
with open('new_gp_model.pkl', 'wb') as f:
    pickle.dump(gp, f)

In [28]:
with open('new_gp_model.pkl', 'rb') as f:
    gp = pickle.load(f)

In [33]:
gp.set_params(generations=50, n_jobs=5, warm_start=1)

SymbolicRegressor(generations=50, n_jobs=5, p_crossover=0.5,
                  p_hoist_mutation=0.05, p_point_mutation=0.25,
                  p_subtree_mutation=0.2, population_size=100, random_state=10,
                  stopping_criteria=0.01, tournament_size=16, verbose=1,
                  warm_start=1)

In [34]:
gp.fit(X_train, np.ravel(y_train))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
  40    51.44          351.181       55          13.3091              N/A      6.02m
  41    51.24          88.5087       55          13.3082              N/A      5.31m
  42    49.46          28.8193       55          13.3082              N/A      4.43m
  43    51.52          27.7043       55          13.3082              N/A      3.85m
  44    53.66          59.6917       61          13.3051              N/A      3.31m
  45    51.42          27.1855       61          13.3051              N/A      2.59m
  46    56.12           36.513       67          13.3087              N/A      2.07m
  47    51.94          33.8528       49          13.3086              N/A      1.29m
  48    50.60          33.1417       49          13.3081              N/A  

SymbolicRegressor(generations=50, n_jobs=5, p_crossover=0.5,
                  p_hoist_mutation=0.05, p_point_mutation=0.25,
                  p_subtree_mutation=0.2, population_size=100, random_state=10,
                  stopping_criteria=0.01, tournament_size=16, verbose=1,
                  warm_start=1)

In [45]:
df.columns

Index(['Duration', 'Distance', 'PLong', 'PLatd', 'DLong', 'DLatd', 'Haversine',
       'Pmonth', 'Pday', 'Phour', 'Pmin', 'PDweek', 'Dmonth', 'Dday', 'Dhour',
       'Dmin', 'DDweek', 'Temp', 'Precip', 'Wind', 'Humid', 'Solar', 'Snow',
       'GroundTemp', 'Dust'],
      dtype='object')

In [7]:
simplified_df = df.drop(['DDweek', 'GroundTemp', 'Haversine', 'Dmonth'],axis=1)

In [8]:
simplified_df.columns

Index(['Duration', 'Distance', 'PLong', 'PLatd', 'DLong', 'DLatd', 'Pmonth',
       'Pday', 'Phour', 'Pmin', 'PDweek', 'Dday', 'Dhour', 'Dmin', 'Temp',
       'Precip', 'Wind', 'Humid', 'Solar', 'Snow', 'Dust'],
      dtype='object')

In [9]:
x_s = simplified_df.drop('Duration',axis=1)
y_s = simplified_df.loc[:,'Duration'].to_frame()

In [10]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(x_s, y_s, test_size=0.3, random_state=10)

In [13]:
simplified_gp = SymbolicRegressor(population_size=100,
                           generations=20, stopping_criteria=0.01,
                           p_crossover=0.5, p_subtree_mutation=0.4,
                           p_hoist_mutation=0, p_point_mutation=0.1,
                           verbose=1,
                           parsimony_coefficient=0.01, random_state=10, n_jobs=5)

In [18]:
simplified_gp.set_params(generations=51, warm_start=True)

SymbolicRegressor(generations=51, n_jobs=5, p_crossover=0.5, p_hoist_mutation=0,
                  p_point_mutation=0.1, p_subtree_mutation=0.4,
                  parsimony_coefficient=0.01, population_size=100,
                  random_state=10, stopping_criteria=0.01, verbose=1,
                  warm_start=True)

In [19]:
simplified_gp.fit(x_s, np.ravel(y_s))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
  20    11.90          161.755       17          8.72373              N/A      9.96m
  21    12.32          839.198       17          8.72373              N/A      9.67m
  22    12.42          5401.85       17          8.72373              N/A      9.32m
  23    12.94          4537.71       17          8.72373              N/A      9.17m
  24    11.42          78.2822       27           8.7223              N/A      8.21m
  25    11.50          1695.51       17          8.71892              N/A      7.99m
  26    12.42          90.4515       17          8.72373              N/A      7.88m
  27    11.26          3800.17       17          8.72373              N/A      7.28m
  28    11.74          912.977       17          8.72373              N/A  

SymbolicRegressor(generations=51, n_jobs=5, p_crossover=0.5, p_hoist_mutation=0,
                  p_point_mutation=0.1, p_subtree_mutation=0.4,
                  parsimony_coefficient=0.01, population_size=100,
                  random_state=10, stopping_criteria=0.01, verbose=1,
                  warm_start=True)

In [20]:
with open('simplified.pkl', 'wb') as f:
    pickle.dump(simplified_gp, f)