**Imports**

In [1]:
# Imports
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split

**Dataset imports**

In [2]:
# import dataset (cleaned)
df = pd.read_csv(r'cleaned_data_simplified.csv', index_col=0)
df.columns

Index(['Duration', 'Distance', 'Pickup_longitude', 'Pickup_latitude',
       'Haversine', 'Pmonth', 'Pickup_day', 'Pickup_hour', 'Pickup_minute',
       'Pickup_weekday', 'Dropoff_hour', 'Dropoff_minute', 'Temp', 'Precip',
       'Wind', 'Humid', 'Solar', 'Snow', 'Dust'],
      dtype='object')

In [3]:
train_df, test_df = train_test_split(df, test_size=0.3)

In [4]:
# Printing the column headings to better understand the features
print(train_df.columns.values)

['Duration' 'Distance' 'Pickup_longitude' 'Pickup_latitude' 'Haversine'
 'Pmonth' 'Pickup_day' 'Pickup_hour' 'Pickup_minute' 'Pickup_weekday'
 'Dropoff_hour' 'Dropoff_minute' 'Temp' 'Precip' 'Wind' 'Humid' 'Solar'
 'Snow' 'Dust']


In [5]:
len(train_df.columns.values)

19

In [6]:
from deap import gp, creator, base, tools, algorithms
import operator

In [7]:
def protected_div(x,y):
    if y == 0:
        return 1
    else:
        return x/y

In [8]:
pset = gp.PrimitiveSet("MAIN", 18)
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protected_div,2)
# pset.addPrimitive(operator.pow, 2)

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin, pset=pset)

In [9]:
# Toolbox setup
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [10]:
def eval_fitness(individual, dataset : pd.DataFrame):
    func = toolbox.compile(expr=individual)
    dataset.reset_index()

    raw_fitness = 0

    sample = dataset.sample(n=10000, random_state=10)

    for index, row in sample.iterrows():
        durations = row.get("Duration")
        distances = row.get("Distance")
        pickup_longs = row.get("Pickup_longitude")
        pickup_lats = row.get("Pickup_latitude")
        haversines = row.get("Haversine")
        pmonths = row.get("Pmonth")
        pdays = row.get("Pickup_day")
        phours = row.get("Pickup_hour")
        pmins = row.get("Pickup_minute")
        pweekdays = row.get("Pickup_weekday")
        dhours = row.get("Dropoff_hour")
        dmins = row.get("Dropoff_minute")
        temps = row.get("Temp")
        precips = row.get("Precip")
        winds = row.get("Wind")
        humids = row.get("Humid")
        solars = row.get("Solar")
        snows = row.get("Snow")
        dusts = row.get("Dust")

        estimate = func(distances, pickup_longs, pickup_lats,
                        haversines,
                        pmonths, pdays, phours, pmins,
                        pweekdays, dhours, dmins, 
                        temps, precips, winds,
                        humids, solars, snows, dusts)
        
        actual = durations

        raw_fitness = raw_fitness + abs(actual - estimate)
    
    average_fitness = raw_fitness / len(sample)
    # using average fitness to reward overall good even if it has one or two weird cases

    

    return average_fitness,

In [11]:
toolbox.register("evaluate", eval_fitness, dataset=train_df)
toolbox.register("select", tools.selTournament, tournsize=10)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=25))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

In [12]:
stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
stats_size = tools.Statistics(len)
mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
mstats.register("avg", np.mean)
mstats.register("std", np.std)
mstats.register("min", np.min)
mstats.register("max", np.max)

In [14]:
pop = toolbox.population(n=150)
hof = tools.HallOfFame(1)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.4, mutpb=0.6, stats=stats_fit,
                                   halloffame=hof, verbose=True, ngen=20)

gen	nevals	avg   	std        	min    	max        
0  	150   	148837	1.76015e+06	14.9576	2.16326e+07
1  	118   	1548.06	11972.6    	14.3586	138788     
2  	115   	9618.36	114545     	14.3586	1.40761e+06
3  	114   	1.25534e+07	1.53223e+08	14.3586	1.88288e+09
4  	111   	914.28     	9679.47    	14.1188	118890     
5  	114   	1.14234e+07	1.39437e+08	14.1188	1.71347e+09
6  	109   	3.44888e+06	4.20913e+07	12.8848	5.17239e+08
7  	111   	1471.62    	8945.36    	12.8848	88539.8    
8  	108   	119339     	1.44155e+06	12.8848	1.77155e+07
9  	113   	5597.92    	26963      	12.8848	192467     
10 	118   	28951.2    	346335     	12.8848	4.25638e+06
11 	104   	10808.8    	127012     	12.8848	1.56085e+06
12 	125   	5067.3     	25578.9    	12.8848	230247     
13 	117   	3891.98    	25886.7    	12.7804	280966     
14 	116   	347119     	4.18318e+06	12.7804	5.14082e+07
15 	111   	126691     	1.44087e+06	12.7804	1.76689e+07
16 	116   	7503.84    	53758.9    	12.7804	518642     
17 	106   	1350.05    	10080

In [15]:
pop2 = toolbox.population(n=150)
hof2 = tools.HallOfFame(1)
pop2, log2 = algorithms.eaSimple(pop2, toolbox, cxpb=0.4, mutpb=0.6, stats=stats_fit,
                                   halloffame=hof, verbose=True, ngen=20)

gen	nevals	avg   	std       	min    	max        
0  	150   	120550	1.4339e+06	13.5649	1.76217e+07
1  	113   	35519.7	408013    	13.5649	5.01371e+06
2  	120   	1810.23	14503.6   	13.5649	170326     
3  	114   	500504 	5.90252e+06	13.5649	7.25255e+07
4  	109   	4015.54	23593.5    	13.5649	170326     
5  	109   	1973.45	14664.6    	13.5649	170401     
6  	109   	6456.82	37283.4    	13.5649	364630     
7  	119   	490.778	2175.06    	13.5649	23608.8    
8  	111   	10177.1	68502.9    	13.5649	701620     
9  	114   	10562.7	80052.8    	13.5649	887458     
10 	116   	4705.52	36415.4    	13.5649	419958     
11 	113   	139892 	1.45214e+06	13.5649	1.76217e+07
12 	108   	2832.34	29584.6    	13.5649	363558     
13 	99    	6341.4 	42243.2    	13.5649	368133     
14 	117   	8484.99	65687.8    	13.5649	702021     
15 	112   	7926.41	74718.9    	13.5649	908610     
16 	109   	52734.6	583299     	13.5621	7.16037e+06
17 	115   	3084.63	24142.6    	13.3552	291802     
18 	112   	3.45013e+06	4.18327e+07	12

In [17]:
print(str(hof[0]))

protected_div(mul(ARG4, ARG13), protected_div(ARG13, ARG3))


In [20]:
def f(x4,x13,x3):
    return protected_div((x4*x13), protected_div(x13, x3))


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

In [21]:
import pickle
checkpoint = dict(population=pop, generation=10, halloffame=hof,
                      logbook=log, rndstate=10)

with open("deap_report_model.pkl", "wb") as cp_file:
    pickle.dump(checkpoint, cp_file)

**ABOVE USED FOR ACTUAL REPORT, BELOW IS FOR OWN USE**

In [58]:
pop = toolbox.population(n=100)
hof = tools.HallOfFame(1)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.6, mutpb=0.4, stats=stats_fit,
                                   halloffame=hof, verbose=True, ngen=20)

gen	nevals	avg   	min    
0  	100   	2454.2	8.82621
1  	76    	2215.3	8.82621
2  	81    	1464.46	8.82621
3  	73    	7381.15	8.77462
4  	77    	5014.73	8.50087
5  	72    	394.931	8.46374
6  	84    	4540.87	7.93321
7  	82    	4747.42	7.93321
8  	79    	218.961	7.75871
9  	80    	600.488	7.75391
10 	68    	739.138	7.71315
11 	76    	2099.35	7.59096
12 	80    	1653.9 	7.58504
13 	66    	469325 	7.57819
14 	69    	93.7175	7.57819
15 	76    	184.956	7.54628
16 	77    	4033.83	7.52595
17 	75    	4912.86	7.52595
18 	66    	559.089	7.45965
19 	68    	177.944	7.40844
20 	77    	107.059	7.40844


In [59]:
str(hof[0])

'protected_div(sub(sub(sub(ARG0, ARG4), mul(sub(ARG1, ARG16), mul(ARG4, ARG3))), sub(mul(sub(mul(ARG4, ARG3), ARG15), ARG3), ARG11)), ARG2)'

In [60]:
# clearly not complete, let's keep going

pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.7, mutpb=0.3, stats=stats_fit,
                                   halloffame=hof, verbose=True, ngen=20)

gen	nevals	avg    	min    
0  	0     	107.059	7.40844
1  	72    	1271.09	7.40844
2  	73    	1479.74	7.34972
3  	84    	131.247	7.32959
4  	82    	390.82 	7.32959
5  	84    	85.1676	7.32959
6  	64    	177.55 	7.32135
7  	83    	111383 	7.32135
8  	79    	28.6044	7.31718
9  	72    	39.4054	7.31718
10 	81    	236.228	7.25117
11 	74    	7543.3 	7.25117
12 	77    	1.88899e+06	7.2399 
13 	77    	78.6997    	7.2399 
14 	77    	119.711    	7.21941
15 	91    	476.803    	7.19738
16 	84    	12491.9    	7.19738
17 	84    	240.285    	7.19577
18 	83    	11281.5    	7.19517
19 	79    	837.624    	7.18608
20 	81    	496802     	7.17706


In [37]:
str(hof[0])

'sub(add(mul(ARG3, ARG3), ARG6), protected_div(add(sub(add(ARG11, ARG2), mul(ARG8, ARG3)), ARG6), mul(ARG3, ARG1)))'

In [22]:
pop = toolbox.population(n=100)
hof = tools.HallOfFame(1)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.6, mutpb=0.4, stats=stats_fit,
                                   halloffame=hof, verbose=True, ngen=50)

gen	nevals	avg    	min    
0  	100   	421.491	9.06984
1  	80    	82.7819	13.3953
2  	68    	101.196	13.3953
3  	84    	5620.99	13.3953
4  	74    	2598.9 	13.3953
5  	69    	12279.7	13.3953
6  	81    	160.194	13.3953
7  	76    	3071.2 	13.3953
8  	82    	4478.93	13.3953
9  	67    	141.106	13.3953
10 	86    	605.612	13.3953
11 	72    	7248.42	13.3953
12 	73    	1640.14	13.3953


KeyboardInterrupt: 

In [17]:
import pickle

In [63]:
with open("deap_with_div_pop_2.pkl", "wb") as deap_savefile:
    pickle.dump(pop, deap_savefile)

# saved with best of 7.whateverthehell