**Imports**

In [3]:
# Imports
import pandas as pd
import random
import numpy as np

**Dataset imports**

In [4]:
# Load the training and testing splits from CSV files in the working directory.
train_df = pd.read_csv("training.csv")
test_df = pd.read_csv("testing.csv")

# Show the column headings so the available features can be read off directly.
print(train_df.columns.values)

['Duration' 'Distance' 'Pickup_longitude' 'Pickup_latitude'
 'Dropoff_latitude' 'Dropoff_latitude.1' 'Haversine' 'Pmonth' 'Pickup_day'
 'Pickup_hour' 'Pickup_minute' 'Pickup_weekday' 'Dropoff_month'
 'Dropoff_day' 'Dropoff_hour' 'Dropoff_minute' 'Dropoff_weekday' 'Temp'
 'Precip' 'Wind' 'Humid' 'Solar' 'Snow' 'GroundTemp' 'Dust']


In [5]:
# Number of columns in the training frame (second entry of its shape).
train_df.shape[1]

25

In [6]:
from deap import gp, creator, base, tools, algorithms
import operator

In [7]:
# Primitive set for the GP trees: 23 input arguments combined through three
# binary arithmetic operators.
pset = gp.PrimitiveSet("MAIN", 23)
for arithmetic_op in (operator.add, operator.sub, operator.mul):
    pset.addPrimitive(arithmetic_op, 2)

# Single-objective fitness; the negative weight marks it as a quantity to minimise.
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))

# Individuals are primitive trees that carry that fitness and a reference to pset.
creator.create(
    "Individual",
    gp.PrimitiveTree,
    fitness=creator.FitnessMin,
    pset=pset,
)

In [18]:
# Toolbox setup
# The toolbox collects the factories used by the evolutionary run.
toolbox = base.Toolbox()

# Compiler that turns an expression tree into a Python callable.
toolbox.register("compile", gp.compile, pset=pset)

# Random expression generator (tree depth between 1 and 2).
toolbox.register(
    "expr",
    gp.genHalfAndHalf,
    pset=pset,
    min_=1,
    max_=2,
)

# An individual wraps one generated expression; a population is a list of them.
toolbox.register(
    "individual",
    tools.initIterate,
    creator.Individual,
    toolbox.expr,
)
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

In [38]:
# Input columns, in the order they are bound to the 23 arguments of a compiled
# individual. "Duration" is the regression target and is therefore not listed;
# "Dropoff_month" is not fed to the model either.
# NOTE(review): "Dropoff_latitude" / "Dropoff_latitude.1" look like a duplicated
# CSV header -- confirm which of the two really holds the drop-off longitude.
_FEATURE_COLUMNS = (
    "Distance", "Pickup_longitude", "Pickup_latitude",
    "Dropoff_latitude.1", "Dropoff_latitude", "Haversine",
    "Pmonth", "Pickup_day", "Pickup_hour", "Pickup_minute",
    "Pickup_weekday", "Dropoff_day", "Dropoff_hour", "Dropoff_minute",
    "Dropoff_weekday", "Temp", "Precip", "Wind",
    "Humid", "Solar", "Snow", "GroundTemp", "Dust",
)


def eval_fitness(individual, dataset):
    """Return the mean absolute error of ``individual`` over ``dataset``.

    The individual is compiled to a callable with ``toolbox.compile`` and is
    evaluated once on whole columns instead of once per row; the primitives
    (add, sub, mul) work element-wise on NumPy arrays, so the result is the
    same set of per-row estimates.

    Parameters
    ----------
    individual : gp.PrimitiveTree
        Expression tree taking the 23 feature values as arguments.
    dataset : pandas.DataFrame
        Rows to score. Must contain "Duration" and every column named in
        ``_FEATURE_COLUMNS``.

    Returns
    -------
    tuple
        One-element tuple ``(mean_absolute_error,)``, the shape DEAP expects
        for a single-objective fitness. The mean (rather than the sum) is used
        so that an otherwise good individual is not dominated by a few
        unusual rows.

    Raises
    ------
    ValueError
        If ``dataset`` has no rows.
    KeyError
        If a required column is missing.
    """
    if len(dataset) == 0:
        raise ValueError("cannot evaluate fitness on an empty dataset")

    func = toolbox.compile(expr=individual)

    # Cast to float so integer-valued columns take part in floating-point
    # arithmetic (as they do when rows are pulled out of a mixed-dtype frame)
    # and long chains of multiplications cannot wrap around in int64.
    features = dataset.loc[:, list(_FEATURE_COLUMNS)].to_numpy(dtype=float)
    actual = dataset["Duration"].to_numpy(dtype=float)

    # One 1-D array per feature, in argument order.
    estimate = func(*features.T)

    average_fitness = float(np.mean(np.abs(actual - estimate)))
    return average_fitness,

In [39]:
def fitness_function(individual):
    """Return the summed absolute error of ``individual`` on a fixed sample.

    A reproducible sample of 100 rows is drawn from the module-level
    ``train_df`` (``random_state=10``), the individual is compiled with
    ``toolbox.compile`` and evaluated on the sampled feature columns, and the
    absolute differences to the true "Duration" values are added up.

    Parameters
    ----------
    individual : gp.PrimitiveTree
        Expression tree taking the 23 feature values as arguments.

    Returns
    -------
    tuple
        One-element tuple ``(total_absolute_error,)``, the shape DEAP expects
        for a single-objective fitness.
    """
    # Feature columns in the order the compiled function receives them.
    # "Duration" is the target; "Dropoff_month" is not passed to the model.
    feature_columns = [
        "Distance", "Pickup_longitude", "Pickup_latitude",
        "Dropoff_latitude.1", "Dropoff_latitude", "Haversine",
        "Pmonth", "Pickup_day", "Pickup_hour", "Pickup_minute",
        "Pickup_weekday", "Dropoff_day", "Dropoff_hour", "Dropoff_minute",
        "Dropoff_weekday", "Temp", "Precip", "Wind",
        "Humid", "Solar", "Snow", "GroundTemp", "Dust",
    ]

    trip_sample = train_df.sample(100, ignore_index=True, random_state=10)
    func = toolbox.compile(expr=individual)

    # Float arrays keep integer-valued columns from overflowing int64 in long
    # chains of multiplications.
    features = trip_sample.loc[:, feature_columns].to_numpy(dtype=float)
    correct_duration = trip_sample["Duration"].to_numpy(dtype=float)

    # The primitives are element-wise, so a single call scores every row.
    calculated_duration = func(*features.T)

    fitness = float(np.sum(np.abs(correct_duration - calculated_duration)))
    return fitness,

In [44]:
# Fixed-seed subsample of the training data used to score individuals. The size
# is capped at the number of available rows so a small frame does not raise.
train_sample = train_df.sample(
    min(10000, len(train_df)), ignore_index=True, random_state=10
)

# Score individuals on the subsample: every individual is evaluated in every
# generation, so the evaluation set is kept to a manageable size.
toolbox.register("evaluate", eval_fitness, dataset=train_sample)

# Variation and selection operators.
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

# Bloat control: offspring whose tree height exceeds 17 are rejected.
toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

In [45]:
# Per-generation statistics: one collector keyed on fitness values, one on
# tree size (len of the individual), combined into a MultiStatistics.
stats_fit = tools.Statistics(key=operator.attrgetter("fitness.values"))
stats_size = tools.Statistics(key=len)
mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)

# Only the average and the minimum are recorded; np.std / np.max can be
# registered the same way under "std" / "max" if they are wanted.
mstats.register("avg", np.mean)
mstats.register("min", np.min)

In [46]:
# Seed Python's random module, which DEAP's generators and operators draw
# from, so that re-running this cell reproduces the same evolution.
random.seed(10)

# Initial population of 100 trees; the hall of fame keeps the single best
# individual seen over the whole run.
pop = toolbox.population(n=100)
hof = tools.HallOfFame(1)

# Simple generational EA: 20 generations, crossover and mutation each applied
# with probability 0.5; fitness statistics are logged every generation.
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.5, ngen=20,
                               stats=stats_fit, halloffame=hof, verbose=True)