**Imports**

In [21]:
# Imports
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split

**Dataset imports**

In [22]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv("cleaned_data_simplified.csv", index_col=0)
# Display the column names as a quick schema check.
df.columns

Index(['Duration', 'Distance', 'Pickup_longitude', 'Pickup_latitude',
       'Haversine', 'Pmonth', 'Pickup_day', 'Pickup_hour', 'Pickup_minute',
       'Pickup_weekday', 'Dropoff_hour', 'Dropoff_minute', 'Temp', 'Precip',
       'Wind', 'Humid', 'Solar', 'Snow', 'Dust'],
      dtype='object')

In [23]:
# Hold out 30% of the rows for testing. A fixed random_state keeps the split
# (and therefore every seeded sample later drawn from train_df) identical
# across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [24]:
# Print the training-frame column names as an array to review the available
# features ('Duration' is the value the evolved expressions try to estimate).
print(train_df.columns.values)

['Duration' 'Distance' 'Pickup_longitude' 'Pickup_latitude' 'Haversine'
 'Pmonth' 'Pickup_day' 'Pickup_hour' 'Pickup_minute' 'Pickup_weekday'
 'Dropoff_hour' 'Dropoff_minute' 'Temp' 'Precip' 'Wind' 'Humid' 'Solar'
 'Snow' 'Dust']


In [25]:
# Number of columns in the training frame (target + features).
len(train_df.columns)

19

In [26]:
from deap import gp, creator, base, tools, algorithms
import operator

In [27]:
def protected_div(x, y):
    """Divide ``x`` by ``y`` without failing on a zero denominator.

    Used as a GP primitive, where evolved expressions may divide by zero.

    Args:
        x: Numerator.
        y: Denominator.

    Returns:
        ``1`` when ``y == 0``, otherwise ``x / y``.
    """
    return 1 if y == 0 else x / y

In [28]:
# Primitive set for the expression trees: 18 positional inputs (shown as
# ARG0..ARG17 in printed individuals), one per feature passed by eval_fitness.
pset = gp.PrimitiveSet("MAIN", 18)
# All primitives are binary arithmetic operators; division is the protected variant.
for binary_op in (operator.add, operator.sub, operator.mul, protected_div):
    pset.addPrimitive(binary_op, 2)
# pset.addPrimitive(operator.pow, 2)

# Single-objective fitness with a negative weight, i.e. lower values are better.
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
# An individual is a primitive tree carrying that fitness and a reference to pset.
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin, pset=pset)



In [29]:
# Toolbox setup
# Each registration builds on the previous one, so the order below matters.
toolbox = base.Toolbox()
# "expr": random expression generator (half-and-half initialisation), height 1 to 2.
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
# "individual": wraps one generated expression in creator.Individual.
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
# "population": a plain list of individuals; the size is given at call time (n=...).
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# "compile": turns an expression tree into a Python callable over the pset inputs.
toolbox.register("compile", gp.compile, pset=pset)

In [30]:
def eval_fitness(individual, dataset):
    """Score a GP individual by its mean absolute error on ``dataset``.

    The individual is compiled into a callable taking the 18 feature values
    (in the column order listed below) and its output is compared against the
    ``Duration`` column row by row.

    Args:
        individual: Expression tree compatible with ``toolbox.compile``.
        dataset: DataFrame containing ``Duration`` plus the 18 feature columns.
            All of these columns are assumed to be numeric.

    Returns:
        A one-element tuple ``(mean_absolute_error,)``, the shape DEAP expects
        for a single-objective fitness. The mean is used so that a few badly
        predicted rows do not dominate an otherwise good individual.

    Raises:
        ValueError: If ``dataset`` has no rows.
        KeyError: If ``dataset`` lacks one of the required columns.
    """
    # Positional order must match the ARG0..ARG17 inputs of the primitive set.
    # NOTE(review): Dropoff_hour/Dropoff_minute are only known once a trip has
    # ended and, together with the pickup time, may reveal Duration - confirm
    # that using them as inputs is intended.
    feature_columns = [
        "Distance", "Pickup_longitude", "Pickup_latitude", "Haversine",
        "Pmonth", "Pickup_day", "Pickup_hour", "Pickup_minute",
        "Pickup_weekday", "Dropoff_hour", "Dropoff_minute",
        "Temp", "Precip", "Wind", "Humid", "Solar", "Snow", "Dust",
    ]

    if len(dataset) == 0:
        raise ValueError("eval_fitness requires a non-empty dataset")

    func = toolbox.compile(expr=individual)

    # Pull the columns out once as float arrays instead of building a Series
    # per row with iterrows(); selecting by name also fails loudly on a
    # missing column rather than silently passing None to the expression.
    features = dataset[feature_columns].to_numpy(dtype=float)
    targets = dataset["Duration"].to_numpy(dtype=float)

    total_abs_error = 0
    # The expression is evaluated row by row because protected_div compares
    # its denominator with a scalar ``==`` and therefore cannot take arrays.
    for feature_row, actual in zip(features, targets):
        estimate = func(*feature_row)
        total_abs_error = total_abs_error + abs(actual - estimate)

    average_fitness = total_abs_error / len(dataset)
    return average_fitness,

In [31]:
def fitness_function(individual):
    """Score a GP individual by total absolute error on a fixed 100-row sample.

    Earlier evaluator kept alongside ``eval_fitness``; in the visible notebook
    it is not the function registered as ``toolbox.evaluate``. It now reads
    the same 18 feature columns that the loaded dataset provides, so it can be
    called against the current ``train_df`` and the 18-input primitive set.

    Args:
        individual: Expression tree compatible with ``toolbox.compile``.

    Returns:
        A one-element tuple ``(sum_of_absolute_errors,)`` over the sample.
        Unlike ``eval_fitness`` this is a sum, not a mean.
    """
    # Positional order must match the ARG0..ARG17 inputs of the primitive set.
    feature_columns = [
        "Distance", "Pickup_longitude", "Pickup_latitude", "Haversine",
        "Pmonth", "Pickup_day", "Pickup_hour", "Pickup_minute",
        "Pickup_weekday", "Dropoff_hour", "Dropoff_minute",
        "Temp", "Precip", "Wind", "Humid", "Solar", "Snow", "Dust",
    ]

    # Seeded sample: every individual is scored on the same 100 rows.
    trip_sample = train_df.sample(100, ignore_index=True, random_state=10)
    # Columns are assumed to be numeric; they are read once as float arrays.
    features = trip_sample[feature_columns].to_numpy(dtype=float)
    durations = trip_sample["Duration"].to_numpy(dtype=float)

    print("Compiling function...")
    func = toolbox.compile(expr=individual)
    print("function compiled")

    fitness = 0
    for feature_row, correct_duration in zip(features, durations):
        calculated_duration = func(*feature_row)
        # Per-row error is the absolute difference between actual and estimate.
        fitness = fitness + abs(correct_duration - calculated_duration)
    return fitness,

In [32]:
# Seeded 10,000-row evaluation sample, so every individual is scored on the same rows.
train_sample = train_df.sample(n=10000, ignore_index=True, random_state=10)

# Evaluation, selection and variation operators used by the evolutionary loop.
toolbox.register("evaluate", eval_fitness, dataset=train_sample)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
# Subtrees inserted by mutation are full trees of height 0 to 2.
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

# Cap the height of offspring at 17 for both crossover and mutation.
for variation_op in ("mate", "mutate"):
    toolbox.decorate(
        variation_op,
        gp.staticLimit(key=operator.attrgetter("height"), max_value=17),
    )

In [33]:
# Two statistics streams: fitness values and tree size (node count via len).
stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
stats_size = tools.Statistics(len)
mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)

# Registering on mstats also registers on stats_fit and stats_size, which is
# why the runs that pass only stats_fit still report avg/min columns.
# ("std", np.std) and ("max", np.max) are currently disabled.
for stat_name, reducer in (("avg", np.mean), ("min", np.min)):
    mstats.register(stat_name, reducer)

In [34]:
# Seed Python's RNG, which DEAP uses for initialisation, selection and
# variation, so this run can be reproduced after a kernel restart.
random.seed(42)

# Fresh population of 100 individuals; the hall of fame keeps the single best one.
pop = toolbox.population(n=100)
hof = tools.HallOfFame(1)
# 20 generations with 70% crossover and 30% mutation probability.
pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.7, mutpb=0.3, ngen=20,
    stats=stats_fit, halloffame=hof, verbose=True,
)

gen	nevals	avg    	min   
0  	100   	11695.1	15.715
1  	86    	1494.73	15.715
2  	89    	891.822	14.9543
3  	79    	498.999	14.333 
4  	87    	1477.51	14.333 
5  	74    	849.862	14.333 
6  	78    	27160.7	14.333 
7  	78    	17282.5	14.333 
8  	82    	192.513	13.5745
9  	71    	2744.44	14.072 
10 	82    	962.434	13.8407
11 	78    	105.603	13.8407
12 	84    	407.275	13.8407
13 	73    	54.5174	12.895 
14 	79    	4579.36	12.895 
15 	73    	314.362	12.7058
16 	78    	55.9045	12.895 
17 	82    	574.102	12.7058
18 	77    	9216.48	12.7058
19 	75    	328.279	12.7058
20 	76    	208.545	12.6882


In [35]:
# Best individual found so far, rendered as a prefix-notation expression
# (ARGn is the n-th positional feature passed by eval_fitness).
str(hof[0])

'sub(add(mul(ARG3, ARG3), ARG6), protected_div(ARG6, mul(ARG8, ARG3)))'

In [36]:
# clearly not complete, let's keep going
# Resumes from the current population for another 20 generations; hof carries over.
pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.7, mutpb=0.3, ngen=20,
    stats=stats_fit, halloffame=hof, verbose=True,
)

gen	nevals	avg    	min    
0  	0     	208.545	12.6882
1  	74    	13110.3	12.6704
2  	66    	141.614	12.7021
3  	83    	181.297	12.5718
4  	80    	1290.97	12.5718
5  	76    	52674.5	12.5718
6  	78    	171.249	12.6971
7  	77    	189.346	12.6886
8  	81    	242.772	12.6886
9  	83    	217.8  	12.6361
10 	87    	2741.64	12.6183
11 	85    	226.431	12.6183
12 	75    	96.8682	12.6177
13 	74    	654.548	12.5101
14 	83    	7079.02	12.5157
15 	76    	365.811	12.4261
16 	76    	225.799	12.4313
17 	77    	33.6636	12.5157
18 	83    	1022.17	12.5157
19 	80    	67.4671	12.3904
20 	82    	163.397	12.4602


In [37]:
# Best individual after the second batch of generations.
str(hof[0])

'sub(add(mul(ARG3, ARG3), ARG6), protected_div(add(sub(add(ARG11, ARG2), mul(ARG8, ARG3)), ARG6), mul(ARG3, ARG1)))'

In [38]:
# clearly not complete, let's keep going -added protectedDiv
# Starts again from a new random population of 100; hof is NOT recreated here,
# so it still holds the best individual of the earlier runs until beaten.
pop = toolbox.population(n=100)
pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.7, mutpb=0.3, ngen=20,
    stats=stats_fit, halloffame=hof, verbose=True,
)

gen	nevals	avg    	min    
0  	100   	8486.45	15.7596
1  	77    	526.354	15.923 
2  	74    	6894.73	14.9789


KeyboardInterrupt: 

In [None]:
# Best individual tracked by hof. The hall of fame was not reset when the
# population was regenerated, so this may come from an earlier run.
str(hof[0])

'add(add(mul(protected_div(ARG3, ARG9), mul(add(ARG13, ARG9), ARG3)), protected_div(mul(protected_div(protected_div(ARG3, add(protected_div(mul(protected_div(ARG3, add(add(ARG6, protected_div(ARG3, mul(ARG5, ARG1))), ARG6)), mul(ARG9, ARG3)), ARG14), ARG0)), mul(protected_div(ARG3, ARG9), mul(add(ARG13, ARG9), ARG3))), mul(ARG6, add(ARG13, ARG9))), ARG14)), ARG6)'

In [None]:
# getting closer...?
# Another 20 generations from the current population.
pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.7, mutpb=0.3, ngen=20,
    stats=stats_fit, halloffame=hof, verbose=True,
)

gen	nevals	avg    	min    
0  	0     	592.367	12.1815
1  	83    	142.676	12.1815
2  	79    	806.678	12.1815
3  	84    	11049.1	12.1815
4  	76    	431.34 	12.1814
5  	86    	498.822	11.5923
6  	87    	156847 	10.4407
7  	81    	19298.4	10.433 
8  	73    	126.129	10.433 
9  	82    	86.3524	10.433 
10 	82    	197.87 	9.27182
11 	79    	72.8986	9.27182
12 	74    	196.705	9.27183
13 	79    	132057 	9.27183
14 	73    	217777 	9.14664
15 	77    	104.427	9.27184
16 	70    	34.894 	9.17636
17 	83    	1.33265e+13	9.17636
18 	71    	23196.6    	9.17636
19 	75    	1.33265e+13	9.17191
20 	78    	345.789    	9.17537


In [None]:
# Best individual tracked by hof.
# NOTE(review): the saved output of this cell mentions ARG20 and ARG22, which
# do not exist in the 18-input primitive set (ARG0..ARG17) defined in this
# notebook - the output looks stale; re-run the notebook in order to refresh it.
str(hof[0])

'add(mul(ARG20, protected_div(protected_div(ARG16, ARG7), add(protected_div(mul(ARG0, ARG22), protected_div(ARG5, ARG6)), mul(ARG14, add(ARG13, ARG2))))), protected_div(sub(mul(sub(ARG8, ARG6), sub(ARG6, ARG11)), ARG0), sub(sub(ARG10, ARG18), ARG2)))'

In [None]:
import pickle

In [None]:
# Checkpoint the current population to disk.
# NOTE(review): loading this pickle presumably requires the creator classes
# (FitnessMin / Individual) to be created first - confirm before relying on it.
with open("deap_with_div_pop.pkl", "wb") as pkl_out:
    pickle.dump(pop, pkl_out)

# saved with best of 9.147

In [None]:
# div seemed to help a lot...
# Another 20 generations from the current population.
pop, log = algorithms.eaSimple(
    pop, toolbox,
    cxpb=0.7, mutpb=0.3, ngen=20,
    stats=stats_fit, halloffame=hof, verbose=True,
)

gen	nevals	avg    	min    
0  	0     	345.789	9.17537
1  	87    	1.15319e+06	9.17474
2  	78    	232.265    	9.17338
3  	80    	31.8668    	9.17177
4  	80    	114.789    	9.16745
5  	82    	621.456    	9.16753
6  	80    	34338.9    	9.13052
7  	73    	17.5476    	9.13052
8  	79    	8577.4     	9.09184
9  	82    	1769.11    	9.13047
10 	72    	16.2175    	9.13047
11 	82    	3181.14    	9.12905
12 	78    	1743.75    	9.12612
13 	82    	36986.7    	9.12612
14 	81    	2148.56    	9.10348
15 	74    	85.4853    	9.08093
16 	86    	347.586    	9.08093
17 	80    	2.09266e+07	9.08019
18 	80    	833.977    	9.04315
19 	82    	269.43     	9.043  
20 	82    	2.31055e+06	9.04315


In [20]:
# Best individual tracked by hof.
# NOTE(review): this cell's execution count (20) is lower than that of the
# first cell of the notebook (21), so its saved output comes from an earlier,
# out-of-order execution - re-run after the cell above to get the current best.
str(hof[0])

'add(add(mul(protected_div(ARG3, ARG9), mul(add(ARG13, ARG9), ARG3)), protected_div(mul(protected_div(protected_div(ARG3, add(protected_div(mul(protected_div(ARG3, add(add(ARG6, protected_div(ARG3, mul(ARG5, ARG1))), ARG6)), mul(ARG9, ARG3)), ARG14), ARG0)), mul(protected_div(ARG3, ARG9), mul(add(ARG13, ARG9), ARG3))), mul(ARG6, add(ARG13, ARG9))), ARG14)), ARG6)'

In [78]:
# Second checkpoint of the population, written to a separate file.
with open("deap_with_div_pop_2.pkl", "wb") as pkl_out:
    pickle.dump(pop, pkl_out)

# saved with best of 9.043