In [25]:
# TODO - create structured project
# comments/docs
import random
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
raw_data = pd.read_csv('./src/data/insurance.csv')
raw_features = pd.DataFrame(raw_data, columns=raw_data.columns.drop('charges'))
raw_labels = pd.DataFrame(raw_data.charges)

In [3]:
# we use this to map the categorical features into indices, we can map back later for presentation
def extract_category_mappings(dataframe, category_columns):
    to_map = lambda uniques: {v:k[0] for k, v in np.ndenumerate(uniques)}
    return [(to_map(dataframe[col].unique()), col) for col in category_columns]

In [4]:
mappings = extract_category_mappings(raw_features, ['sex', 'smoker', 'region'])

In [5]:

def normalize(series, columns, scaler):
    return scaler.fit_transform(series[columns])

def denormalize(series, columns, scaler):
    return scaler.inverse_transform(series[columns])


In [6]:
age_scaler = preprocessing.MinMaxScaler()
bmi_scaler = preprocessing.MinMaxScaler()
scalers = [(age_scaler,'age'), (bmi_scaler,'bmi')]

In [26]:
# transform the categorical features to binary representations of their indices;
def transform_features(dataframe, mappings, scalers):
    transformed_df = pd.DataFrame(dataframe, columns=['children'])
    
    for mapping, name in mappings:
        for i in mapping.values():
            transformed_df[name + str(i)] = dataframe[name].apply(lambda x: int(mapping[x] == i))
            
    for scaler, name in scalers:
        transformed_df[name] = normalize(dataframe, [name], scaler)

    return transformed_df

In [14]:
features = transform_features(raw_features, mappings, scalers)

In [15]:
charges_scaler = preprocessing.MinMaxScaler()

labels = raw_labels
labels['charges'] = normalize(raw_labels, ['charges'], charges_scaler)
print(labels.head())


    charges
0  0.251611
1  0.009636
2  0.053115
3  0.333010
4  0.043816


In [16]:
X_train, X_test, y_train, y_test = train_test_split(features, labels)

In [17]:
print(X_train.head())
print(X_test.head())
print(y_train.head())
print(y_test.head())

      children  sex0  sex1  smoker0  smoker1  region0  region1  region2  \
746          2     0     1        0        1        1        0        0   
1229         0     0     1        0        1        0        0        0   
940          0     0     1        0        1        0        1        0   
902          3     0     1        0        1        0        0        0   
1180         1     1     0        0        1        0        0        0   

      region3       age       bmi  
746         0  0.347826  0.297014  
1229        1  0.869565  0.385930  
940         0  0.000000  0.195050  
902         1  0.173913  0.304143  
1180        1  0.521739  0.682405  
      children  sex0  sex1  smoker0  smoker1  region0  region1  region2  \
638          0     0     1        1        0        0        0        0   
706          0     1     0        1        0        0        1        0   
1173         2     0     1        0        1        0        0        1   
242          1     1     0       

In [18]:
#TODO warning, actually setting values on y_train if we run this
#test_denormalized_labels = y_train
#test_denormalized_labels['charges'] = denormalize(y_train, ['charges'], scaler)

#print(test_denormalized_labels.head())



# we probably only actually care about denormalising on predictions

In [19]:
X_train.size

11033

In [31]:
# TODO: ending condition
# count generations
# cache fitnesses?
# want to chart progress (mean and best i guess)
# TODO: use less raw python, more pandas/numpy
class GeneticAlgorithm:
    def __init__(self, parameters, features, labels):
        self.population_size = parameters["population_size"]
        self.breeding_ratio = 1 / parameters["breeding_rate"]
        self.crossover_rate = parameters["crossover_rate"]
        self.mutation_rate = parameters["mutation_rate"]
        self.mutation_range = parameters["mutation_range"]
        self.generations = parameters["generations"]
        
        self.features = features
        self.labels = labels
        self.population = pd.Series([self._random_chromosome() for i in range(self.population_size)])
        
    def fit(self):
        for i in range(self.generations):
            self._next_generation()
            self._display_fitness()
        
    def _random_chromosome(self):
        _, shape_cols = self.features.shape
        return [random.random() for c in range(shape_cols)]

    def _next_generation(self):
        weakest, middle, parents = self._divide_population()
        offspring = self._breed(parents)
        self.population = pd.concat([middle, parents, offspring], ignore_index=True)
    
    def _divide_population(self):
        parent_count = int(len(self.population) / self.breeding_ratio)

        # TODO: select parents stochastically, based on fitness, rather than just taking the best ones
        fitnesses = self.population.apply(lambda chromosome: self._chromosome_fitness(chromosome))
        
        parents = self.population[fitnesses.sort_values('charges')[:parent_count].index]
        middle = self.population[fitnesses.sort_values('charges')[parent_count:-parent_count].index]
        weakest = self.population[fitnesses.sort_values('charges')[-parent_count:].index]
        
        return weakest, middle, parents
    
    def _chromosome_fitness(self, chromosome):
        predictions = np.dot(self.features, chromosome)
        differences = pd.DataFrame(self.labels.charges - predictions)
        squared_differences = differences.apply(lambda diff: diff ** 2)
        return squared_differences.sum()
    
    def _breed(self, parents):
        pair_count = int(len(parents) / 2)
        couples = list(zip(parents[:pair_count], parents[pair_count:]))

        # if we have an uneven number of parents, the last one is a hermaphrodite
        if len(parents) % 2 != 0:
            couples.append((parents[-1], parents[-1]))
        
        children = self._crossover(couples)
        self._mutate(children)
        return pd.Series(children)

    def _crossover(self, couples):
        children = []
        for couple in couples:
            p1, p2 = couple
            if random.random() < self.crossover_rate:
                x_point = random.randrange(0, len(p1) + 1)
                children.append(p1[:x_point] + p2[x_point:])
                children.append(p2[:x_point] + p1[x_point:])
            else:
                children.append(p1)
                children.append(p2)
        return children
    
    def _mutate(self, children):
        for child in children:
            for feature_idx, feature in enumerate(child):
                if random.random() < self.mutation_rate:
                    m = random.uniform(-self.mutation_range, self.mutation_range)
                    child[feature_idx] += m
        return children
        
    def _display_fitness(self):
        f_values = self.population.apply(lambda chromosome: self._chromosome_fitness(chromosome)).values
        print('FITNESS - best: {:.5f}, mean: {:.5f}'.format(f_values.min(), f_values.mean()))
    

In [32]:
params = {
    "generations": 100,
    "population_size": 100,
    "breeding_rate": 0.2,
    "crossover_rate": 0.7,
    'mutation_rate': 0.08,
    'mutation_range': 0.1,
}

g = GeneticAlgorithm(params, X_train, y_train)

In [33]:
g.fit()

FITNESS - best: 1362.60467, mean: 4754.47611
FITNESS - best: 1268.66113, mean: 3732.47375
FITNESS - best: 1014.97307, mean: 2895.54269
FITNESS - best: 1014.97307, mean: 2288.79706
FITNESS - best: 552.24268, mean: 1834.64700
FITNESS - best: 329.94506, mean: 1466.04912
FITNESS - best: 329.94506, mean: 1206.72824
FITNESS - best: 317.63892, mean: 988.16884
FITNESS - best: 284.42909, mean: 822.37528
FITNESS - best: 236.67631, mean: 668.86000
FITNESS - best: 227.02429, mean: 527.66821
FITNESS - best: 224.86799, mean: 418.44223
FITNESS - best: 164.90318, mean: 343.36097
FITNESS - best: 151.88383, mean: 291.32797
FITNESS - best: 102.35849, mean: 259.00179
FITNESS - best: 73.73302, mean: 224.20524
FITNESS - best: 73.73302, mean: 192.93644
FITNESS - best: 68.08653, mean: 163.12093
FITNESS - best: 68.08653, mean: 135.85504
FITNESS - best: 36.20658, mean: 114.25363
FITNESS - best: 52.39373, mean: 97.85281
FITNESS - best: 47.18455, mean: 82.76815
FITNESS - best: 32.63566, mean: 73.53547
FITNESS - b

In [23]:
g.population


0     [0.010148593157507733, -0.033089464140898445, ...
1     [0.010148593157507733, -0.033089464140898445, ...
2     [0.010148593157507733, -0.033089464140898445, ...
3     [0.010148593157507733, -0.033089464140898445, ...
4     [0.010148593157507733, -0.033089464140898445, ...
5     [-0.00702926748408033, -0.033089464140898445, ...
6     [-0.00702926748408033, -0.033089464140898445, ...
7     [-0.00702926748408033, -0.033089464140898445, ...
8     [-0.00702926748408033, -0.033089464140898445, ...
9     [-0.00702926748408033, -0.024762415891477818, ...
10    [-0.00702926748408033, -0.024762415891477818, ...
11    [0.010148593157507733, -0.033089464140898445, ...
12    [0.010148593157507733, -0.033089464140898445, ...
13    [0.010148593157507733, -0.033089464140898445, ...
14    [0.010148593157507733, -0.033089464140898445, ...
15    [0.010148593157507733, -0.033089464140898445, ...
16    [0.010148593157507733, -0.033089464140898445, ...
17    [0.010148593157507733, -0.0330894641408984

In [24]:
g.population[0]

[0.010148593157507733,
 -0.033089464140898445,
 -0.024001181039204837,
 0.18197679422346424,
 -0.15446819992853655,
 0.2023156256168494,
 0.2155223946397454,
 0.2140229233698855,
 0.22091328315045083,
 0.14682588950353004,
 0.05934405821703646]

In [304]:
def predict(features, weights):
    return np.dot(features, weights)