In [290]:
import random
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [291]:
raw_data = pd.read_csv('./src/data/insurance.csv')
raw_features = pd.DataFrame(raw_data, columns=raw_data.columns.drop('charges'))
raw_labels = pd.DataFrame(raw_data.charges)

In [292]:
# we use this to map the categorical features into indices, we can map back later for presentation
def extract_category_mappings(dataframe, category_columns):
    to_map = lambda uniques: {v:k[0] for k, v in np.ndenumerate(uniques)}
    return [(to_map(dataframe[col].unique()), col) for col in category_columns]

In [293]:
mappings = extract_category_mappings(raw_features, ['sex', 'smoker', 'region'])

In [294]:
# transform the categorical features to binary representations of their indices;
# TODO: we might want to bucketize the continuous features?
def transform_features(dataframe, mappings):
    transformed_df = pd.DataFrame(dataframe, columns=['age', 'bmi', 'children'])
    
    for mapping, name in mappings:
        for i in mapping.values():
            transformed_df[name + str(i)] = dataframe[name].apply(lambda x: int(mapping[x] == i))

    return transformed_df

In [295]:
features = transform_features(raw_features, mappings)

In [296]:
scaler = preprocessing.MinMaxScaler()

def normalize(series, columns, scaler):
    return scaler.fit_transform(series[columns])

def denormalize(series, columns, scaler):
    return scaler.inverse_transform(series[columns])

labels = raw_labels
labels['charges'] = normalize(raw_labels, ['charges'], scaler)
print(labels.head())


    charges
0  0.251611
1  0.009636
2  0.053115
3  0.333010
4  0.043816


In [297]:
X_train, X_test, y_train, y_test = train_test_split(features, labels)

In [298]:
print(X_train.head())
print(X_test.head())
print(y_train.head())
print(y_test.head())

      age    bmi  children  sex0  sex1  smoker0  smoker1  region0  region1  \
533    37  36.19         0     0     1        0        1        0        1   
153    42  23.37         0     1     0        1        0        0        0   
1187   62  32.68         0     1     0        0        1        0        0   
21     30  32.40         1     1     0        0        1        1        0   
172    18  15.96         0     0     1        0        1        0        0   

      region2  region3  
533         0        0  
153         0        1  
1187        1        0  
21          0        0  
172         0        1  
     age     bmi  children  sex0  sex1  smoker0  smoker1  region0  region1  \
367   42  24.985         2     1     0        0        1        0        0   
430   19  33.100         0     0     1        0        1        1        0   
255   55  25.365         3     1     0        0        1        0        0   
352   30  27.700         0     1     0        0        1        1    

In [299]:
#TODO warning, actually setting values on y_train if we run this
#test_denormalized_labels = y_train
#test_denormalized_labels['charges'] = denormalize(y_train, ['charges'], scaler)

#print(test_denormalized_labels.head())



# we probably only actually care about denormalising on predictions

In [300]:
X_train.size

11033

In [310]:
# TODO: ending condition
# count generations
# cache fitnesses?
# want to chart progress (mean and best i guess)
# TODO: use less raw python, more pandas/numpy
class GeneticAlgorithm:
    def __init__(self, parameters, features, labels):
        self.population_size = parameters["population_size"]
        self.breeding_ratio = 1 / parameters["breeding_rate"]
        self.crossover_rate = parameters["crossover_rate"]
        self.mutation_rate = parameters["mutation_rate"]
        self.mutation_range = parameters["mutation_range"]
        
        self.features = features
        self.labels = labels
        self.population = pd.Series([self.random_chromosome() for i in range(self.population_size)])
        
    def random_chromosome(self):
        _, shape_cols = self.features.shape
        return [random.random() for c in range(shape_cols)]

    def next_generation(self):
        weakest, middle, parents = self.divide_population()
        offspring = self.breed(parents)
        self.population = pd.concat([middle, parents, offspring], ignore_index=True)
        self.display_fitness()
    
    def divide_population(self):
        parent_count = int(len(self.population) / self.breeding_ratio)

        # TODO: select parents stochastically, based on fitness, rather than just taking the best ones
        fitnesses = self.population.apply(lambda chromosome: self.chromosome_fitness(chromosome))
        
        parents = self.population[fitnesses.sort_values('charges')[:parent_count].index]
        middle = self.population[fitnesses.sort_values('charges')[parent_count:-parent_count].index]
        weakest = self.population[fitnesses.sort_values('charges')[-parent_count:].index]
        
        return weakest, middle, parents
    
    def chromosome_fitness(self, chromosome):
        predictions = np.dot(self.features, chromosome)
        differences = pd.DataFrame(self.labels.charges - predictions)
        squared_differences = differences.apply(lambda diff: diff ** 2)
        return squared_differences.sum()
    
    def breed(self, parents):
        pair_count = int(len(parents) / 2)
        couples = list(zip(parents[:pair_count], parents[pair_count:]))

        # if we have an uneven number of parents, the last one is a hermaphrodite
        if len(parents) % 2 != 0:
            couples.append((parents[-1], parents[-1]))
        
        children = self.crossover(couples)
        self.mutate(children)
        return pd.Series(children)

    def crossover(self, couples):
        children = []
        for couple in couples:
            p1, p2 = couple
            if random.random() < self.crossover_rate:
                x_point = random.randrange(0, len(p1) + 1)
                children.append(p1[:x_point] + p2[x_point:])
                children.append(p2[:x_point] + p1[x_point:])
            else:
                children.append(p1)
                children.append(p2)
        return children
    
    def mutate(self, children):
        for child in children:
            for feature_idx, feature in enumerate(child):
                if random.random() < self.mutation_rate:
                    m = random.uniform(-self.mutation_range, self.mutation_range)
                    child[feature_idx] += m
        return children
        
    def display_fitness(self):
        f_values = self.population.apply(lambda chromosome: self.chromosome_fitness(chromosome)).values
        print('FITNESS - best: {:.5f}, mean: {:.5f}'.format(f_values.min(), f_values.mean()))
    

In [311]:
params = {
    "population_size": 100,
    "breeding_rate": 0.2,
    "crossover_rate": 0.7,
    'mutation_rate': 0.08,
    'mutation_range': 0.1,
}

g = GeneticAlgorithm(params, X_train, y_train)

In [312]:
for i in range(100):
    g.next_generation()

FITNESS - best: 14441.20846, mean: 1030050.27208
FITNESS - best: 14441.20846, mean: 659571.71681
FITNESS - best: 14441.20846, mean: 418642.04237
FITNESS - best: 13222.06651, mean: 252000.07688
FITNESS - best: 4656.88225, mean: 153868.49970
FITNESS - best: 951.63512, mean: 95964.03222
FITNESS - best: 952.48621, mean: 57921.03439
FITNESS - best: 952.48621, mean: 31325.39867
FITNESS - best: 867.31537, mean: 13421.95084
FITNESS - best: 541.33031, mean: 8899.25170
FITNESS - best: 532.67126, mean: 6123.01474
FITNESS - best: 300.52011, mean: 3798.86617
FITNESS - best: 300.52011, mean: 2044.53377
FITNESS - best: 262.33075, mean: 1434.33804
FITNESS - best: 278.93178, mean: 1025.69413
FITNESS - best: 277.73770, mean: 660.21177
FITNESS - best: 271.96317, mean: 1035.24168
FITNESS - best: 267.02576, mean: 972.08767
FITNESS - best: 261.63584, mean: 406.01679
FITNESS - best: 212.83648, mean: 381.27675
FITNESS - best: 212.83648, mean: 354.00838
FITNESS - best: 212.83648, mean: 295.85316
FITNESS - best

In [313]:
g.population


0     [0.0030720787087652884, -0.049483931877841975,...
1     [0.0030720787087652884, -0.049483931877841975,...
2     [0.0030720787087652884, -0.049483931877841975,...
3     [0.0030720787087652884, -0.049483931877841975,...
4     [0.0030720787087652884, -0.049483931877841975,...
5     [0.0030720787087652884, -0.049483931877841975,...
6     [0.0030720787087652884, -0.049483931877841975,...
7     [0.0030720787087652884, -0.049483931877841975,...
8     [0.0030720787087652884, -0.049483931877841975,...
9     [0.0030720787087652884, -0.049483931877841975,...
10    [0.0030720787087652884, -0.049483931877841975,...
11    [0.0030720787087652884, -0.049483931877841975,...
12    [0.0030720787087652884, -0.049483931877841975,...
13    [0.0030720787087652884, -0.049483931877841975,...
14    [0.0030720787087652884, -0.049483931877841975,...
15    [0.0030720787087652884, -0.049483931877841975,...
16    [0.0030720787087652884, -0.049483931877841975,...
17    [0.0030720787087652884, -0.049483931877841

In [304]:
def predict(features, weights):
    return np.dot(features, weights)