**Imports**

In [1]:
# Imports
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
import Levenshtein

**Dataset imports**

In [2]:
# import dataset (cleaned)
df = pd.read_csv(r'cleaned_data_simplified.csv', index_col=0)
df.columns

Index(['Duration', 'Distance', 'Pickup_longitude', 'Pickup_latitude',
       'Haversine', 'Pmonth', 'Pickup_day', 'Pickup_hour', 'Pickup_minute',
       'Pickup_weekday', 'Dropoff_hour', 'Dropoff_minute', 'Temp', 'Precip',
       'Wind', 'Humid', 'Solar', 'Snow', 'Dust'],
      dtype='object')

In [3]:
train_df, test_df = train_test_split(df, test_size=0.3)

In [4]:
# Printing the column headings to better understand the features
print(train_df.columns.values)

['Duration' 'Distance' 'Pickup_longitude' 'Pickup_latitude' 'Haversine'
 'Pmonth' 'Pickup_day' 'Pickup_hour' 'Pickup_minute' 'Pickup_weekday'
 'Dropoff_hour' 'Dropoff_minute' 'Temp' 'Precip' 'Wind' 'Humid' 'Solar'
 'Snow' 'Dust']


In [5]:
len(train_df.columns.values)

19

In [6]:
from deap import gp, creator, base, tools, algorithms
import operator

In [7]:
def protected_div(x,y):
    if y == 0:
        return 1
    else:
        return x/y

In [8]:
pset = gp.PrimitiveSet("MAIN", 18)
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
# pset.addPrimitive(protected_div,2)
# pset.addPrimitive(operator.pow, 2)

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin, pset=pset)

In [9]:
# Toolbox setup
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [10]:
# helper similarity function for strucutred implementation
def calculate_similarity(tree1, tree2):
    return Levenshtein.distance(tree1, tree2)

In [11]:
past_local_minima : list = []

In [12]:
def eval_fitness(individual, dataset : pd.DataFrame):
    # print("started evaluation of fitness")
    func = toolbox.compile(expr=individual)
    dataset.reset_index()

    raw_fitness = 0

    sample = dataset.sample(n=10000, random_state=10)

    for index, row in sample.iterrows():
        durations = row.get("Duration")
        distances = row.get("Distance")
        pickup_longs = row.get("Pickup_longitude")
        pickup_lats = row.get("Pickup_latitude")
        haversines = row.get("Haversine")
        pmonths = row.get("Pmonth")
        pdays = row.get("Pickup_day")
        phours = row.get("Pickup_hour")
        pmins = row.get("Pickup_minute")
        pweekdays = row.get("Pickup_weekday")
        dhours = row.get("Dropoff_hour")
        dmins = row.get("Dropoff_minute")
        temps = row.get("Temp")
        precips = row.get("Precip")
        winds = row.get("Wind")
        humids = row.get("Humid")
        solars = row.get("Solar")
        snows = row.get("Snow")
        dusts = row.get("Dust")

        estimate = func(distances, pickup_longs, pickup_lats,
                        haversines,
                        pmonths, pdays, phours, pmins,
                        pweekdays, dhours, dmins, 
                        temps, precips, winds,
                        humids, solars, snows, dusts)
        
        actual = durations

        raw_fitness = raw_fitness + abs(actual - estimate)
    
    average_fitness = raw_fitness / len(sample)
    # print("averge fitness: " + str(average_fitness))

    total_distance = 0
    average_distance = 0.9
    if past_local_minima != None:
        for past_solution in past_local_minima:
            total_distance = total_distance + calculate_similarity(str(individual), str(past_solution))

        average_distance = total_distance / len(sample)

    # adjusted fitness is penalised by adding a number which increases with smaller distances
    # essentially, we make the individual less fit if it's exploring already-explored minima
    adjusted_fitness = average_fitness + ( 1 / 0.1 + average_distance)
    # print("adjusted fitness: " + str(adjusted_fitness))

    # using average fitness to reward overall good even if it has one or two weird cases
    return average_fitness,

In [13]:
def compare_past_minima(minima : list):
    best = None
    best_fitness = None
    for tree in minima:
        # do some fancy thing that calculates its fitness
        fitness = 0
        if best_fitness == None or fitness > best_fitness:
            best = tree
            best_fitness = fitness
    return best

In [14]:
toolbox.register("evaluate", eval_fitness, dataset=train_df)
toolbox.register("select", tools.selTournament, tournsize=10)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=25))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

In [15]:
stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
stats_size = tools.Statistics(len)
mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
mstats.register("avg", np.mean)
mstats.register("std", np.std)
mstats.register("min", np.min)
mstats.register("max", np.max)

In [16]:
hof = tools.HallOfFame(1)

In [17]:
# now have to repeat process 'num_loops_per_minima' times to explore as much as possible
num_loops_per_minima = 10
for i in range (0,num_loops_per_minima):
    pop = toolbox.population(n=150)
    print("MINIMA LOOP COMPLETED: " + str(i))
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.4, mutpb=0.6, stats=stats_fit,
                                       halloffame=hof, verbose=True, ngen=1)
    past_local_minima.append(hof[0])

# TODO - at present, first iteration faces advantage, because there is no penalty applied to it - you'll need to do one last check against the whole "past_local_minima" list to see which item there is actually the fittest individual

MINIMA LOOP COMPLETED: 0
gen	nevals	avg    	std    	min    	max   
0  	150   	13637.2	62513.8	15.1105	395027
1  	105   	2.40474e+06	2.88013e+07	13.2613	3.53914e+08
MINIMA LOOP COMPLETED: 1
gen	nevals	avg    	std    	min    	max   
0  	150   	10633.9	42695.3	15.1105	360161
1  	108   	12967  	79567.2	15.1105	738835
MINIMA LOOP COMPLETED: 2
gen	nevals	avg    	std   	min    	max        
0  	150   	19031.8	143606	14.5146	1.69978e+06
1  	103   	12635.4	89133.5	14.5146	985445     
MINIMA LOOP COMPLETED: 3
gen	nevals	avg   	std    	min    	max   
0  	150   	4083.7	16882.6	14.9838	116777
1  	113   	5762.44	38463.3	14.5146	391172
MINIMA LOOP COMPLETED: 4
gen	nevals	avg    	std    	min    	max   
0  	150   	4215.77	33155.2	14.4868	395027
1  	108   	59789.5	636338 	14.3256	7.77077e+06
MINIMA LOOP COMPLETED: 5
gen	nevals	avg   	std        	min    	max        
0  	150   	121280	1.43181e+06	15.1565	1.75976e+07
1  	113   	8155.28	80850.6    	14.9838	984823     
MINIMA LOOP COMPLETED: 6
gen	nevals	avg 

In [18]:
best_individual_after_comparison = compare_past_minima(past_local_minima)
print(str(best_individual_after_comparison))

mul(ARG4, ARG3)


In [21]:
str(past_local_minima[0])

'add(add(ARG3, ARG3), add(add(add(add(add(ARG3, ARG3), add(ARG3, ARG15)), ARG13), ARG3), ARG3))'

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

In [21]:
import pickle
checkpoint = dict(population=pop, generation=10, halloffame=hof,
                      logbook=log, rndstate=10)

with open("deap_report_model.pkl", "wb") as cp_file:
    pickle.dump(checkpoint, cp_file)