In [3]:
import random
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [84]:
# Load the raw insurance table, then separate the predictor columns from the
# `charges` target (kept as a one-column DataFrame).
raw_data = pd.read_csv('./src/data/insurance.csv')
raw_features = raw_data.drop('charges', axis=1)
raw_labels = raw_data['charges'].to_frame()

In [85]:
# we use this to map the categorical features into indices, we can map back later for presentation
def extract_category_mappings(dataframe, category_columns):
    """Build a category -> index lookup for each categorical column.

    For every name in ``category_columns`` the distinct values of that column
    are numbered 0, 1, 2, ... in the order ``Series.unique()`` returns them.

    Returns a list of ``(mapping, column_name)`` tuples, one per column and in
    the same order as ``category_columns``.
    """
    column_mappings = []
    for column in category_columns:
        index_by_category = {}
        # unique() is one-dimensional, so each ndenumerate key is a 1-tuple.
        for (position,), category in np.ndenumerate(dataframe[column].unique()):
            index_by_category[category] = position
        column_mappings.append((index_by_category, column))
    return column_mappings

In [86]:
# Columns whose values are categories and therefore need an index mapping.
category_columns = ['sex', 'smoker', 'region']
mappings = extract_category_mappings(raw_features, category_columns)

In [87]:
# transform the categorical features into one-hot (0/1) indicator columns, one column per category index;
# TODO: we might want to bucketize the continuous features?
def transform_features(dataframe, mappings, continuous_columns=('age', 'bmi', 'children')):
    """One-hot encode the categorical columns and keep the continuous ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame. It is not modified.
    mappings : iterable of (dict, str)
        ``(category -> index, column name)`` pairs, in the shape produced by
        ``extract_category_mappings``.
    continuous_columns : sequence of str, optional
        Columns copied through unchanged. Defaults to the three continuous
        columns this notebook uses.

    Returns
    -------
    pandas.DataFrame
        The continuous columns followed by one 0/1 integer column per mapped
        index, named ``<column name><index>`` (e.g. ``sex0``, ``sex1``).

    Raises
    ------
    KeyError
        If a categorical column holds a value that has no entry in its mapping.
    """
    transformed_df = pd.DataFrame(dataframe, columns=list(continuous_columns))

    for mapping, name in mappings:
        # Translate each row to its category index once, rather than doing a
        # dict lookup per row for every category.
        codes = dataframe[name].map(mapping)
        unmapped = codes.isna()
        if unmapped.any():
            unknown = dataframe.loc[unmapped, name].unique().tolist()
            raise KeyError('no mapping for {!r} value(s): {}'.format(name, unknown))

        for index in mapping.values():
            transformed_df[name + str(index)] = (codes == index).astype('int64')

    return transformed_df

In [88]:
# Build the model-ready feature matrix from the raw predictor columns.
features = transform_features(dataframe=raw_features, mappings=mappings)

In [89]:
# Min-max scaler used to normalise the labels (and to map predictions back);
# the output range is spelled out here and equals the library default.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

def normalize(series, columns, scaler):
    """Fit ``scaler`` on ``series[columns]`` and return the scaled values.

    Because this calls ``fit_transform``, every call refits ``scaler`` on the
    data it is given.
    """
    selected = series[columns]
    scaled = scaler.fit_transform(selected)
    return scaled

def denormalize(series, columns, scaler):
    """Map ``series[columns]`` back through ``scaler.inverse_transform``.

    ``scaler`` is used as-is; it is not refitted here.
    """
    selected = series[columns]
    restored = scaler.inverse_transform(selected)
    return restored

# Work on a copy. `labels = raw_labels` would only alias the frame, so the
# assignment below would overwrite the raw charges; re-running this cell would
# then refit the scaler on already-normalised values and turn
# scaler.inverse_transform into an identity.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set information leaks into the scaling - consider fitting on the
# training labels only.
labels = raw_labels.copy()
labels['charges'] = normalize(raw_labels, ['charges'], scaler)
print(labels.head())


    charges
0  0.251611
1  0.009636
2  0.053115
3  0.333010
4  0.043816


In [10]:
# Pin the shuffle seed so the split, and everything trained on it, is the
# same on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=SPLIT_SEED)

In [11]:
# Preview the first rows of each split: features first, then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.head())

      age    bmi  children  sex0  sex1  smoker0  smoker1  region0  region1  \
885    32  28.93         1     0     1        1        0        0        1   
483    51  39.50         1     1     0        0        1        1        0   
1236   63  21.66         0     1     0        0        1        0        0   
780    30  24.40         3     0     1        1        0        1        0   
841    59  24.70         0     0     1        0        1        0        0   

      region2  region3  
885         0        0  
483         0        0  
1236        0        1  
780         0        0  
841         0        1  
      age     bmi  children  sex0  sex1  smoker0  smoker1  region0  region1  \
1036   22  37.070         2     0     1        1        0        0        1   
1123   27  32.395         1     1     0        0        1        0        0   
16     52  30.780         1     1     0        0        1        0        0   
361    35  30.500         1     0     1        0        1        

In [12]:
# TODO: warning - running this overwrites the values in y_train itself, because the assignment below aliases y_train instead of copying it
#test_denormalized_labels = y_train
#test_denormalized_labels['charges'] = denormalize(y_train, ['charges'], scaler)

#print(test_denormalized_labels.head())



# we probably only actually care about denormalising on predictions

In [13]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
X_train.size

11033

In [255]:
# TODO: ending condition
# count generations
# cache fitnesses?
# want to chart progress (mean and best i guess)
class GeneticAlgorithm:
    """Genetic search for the weights of a linear model.

    A chromosome is a list with one weight per feature column. Its fitness is
    the sum of squared differences between ``labels.charges`` and
    ``np.dot(features, chromosome)``, so a lower fitness is better.

    Parameters
    ----------
    parameters : dict
        ``"population_size"`` - number of chromosomes in the population.
        ``"breeding_rate"`` - fraction of the population selected as parents
        each generation; the same number of weakest chromosomes is dropped.
        ``"seed"`` (optional) - seeds a private random generator so the
        initial population is reproducible; when absent the global ``random``
        module state is used.
    features : pandas.DataFrame
        One row per sample, one column per weight.
    labels : pandas.DataFrame
        Must have a ``charges`` column; rows are matched to ``features`` by
        position.
    """

    # Added before truncating the parent count so that float error in
    # ``len / breeding_ratio`` (e.g. 28.999999999999996 for rate 0.29 and 100
    # chromosomes) cannot silently drop one parent.
    _COUNT_TOLERANCE = 1e-9

    def __init__(self, parameters, features, labels):
        self.population_size = parameters["population_size"]
        self.breeding_ratio = 1 / parameters["breeding_rate"]

        seed = parameters.get("seed")
        self._rng = random.Random(seed) if seed is not None else None

        self.features = features
        self.labels = labels
        self.population = pd.Series([self.random_chromosome() for _ in range(self.population_size)])

    def random_chromosome(self):
        """Return a list of uniform [0, 1) weights, one per feature column."""
        _, shape_cols = self.features.shape
        rng = getattr(self, "_rng", None)
        draw = rng.random if rng is not None else random.random
        return [draw() for _ in range(shape_cols)]

    def next_generation(self):
        """Replace the weakest chromosomes with offspring and print the fitness."""
        weakest, middle, parents = self.divide_population()
        offspring = self.breed(parents)
        self.population = pd.concat([middle, parents, offspring], ignore_index=True)
        self.display_fitness()

    def divide_population(self):
        """Split the population by fitness into ``(weakest, middle, parents)``.

        ``parents`` are the fittest (lowest error) chromosomes, ``weakest`` the
        same number of least fit ones, and ``middle`` everything in between.
        The three parts never overlap and together cover the population.
        """
        size = len(self.population)
        parent_count = int(size / self.breeding_ratio + self._COUNT_TOLERANCE)
        # Parents and weakest must not overlap, so neither can exceed half the
        # population; this also keeps the population size constant.
        parent_count = max(0, min(parent_count, size // 2))

        # TODO: select parents stochastically, based on fitness, rather than just taking the best ones
        fitnesses = self._population_fitness()

        # Sort once (best first) and slice by explicit positions. Negative
        # slices would misbehave when parent_count is 0 ([-0:] is everything).
        ranked = self.population.loc[fitnesses.sort_values(kind="mergesort").index]
        weakest_start = size - parent_count

        parents = ranked.iloc[:parent_count]
        middle = ranked.iloc[parent_count:weakest_start]
        weakest = ranked.iloc[weakest_start:]

        return weakest, middle, parents

    def chromosome_fitness(self, chromosome):
        """Return the sum of squared errors for ``chromosome``.

        The result is a one-element Series labelled ``charges``.
        """
        predictions = np.dot(self.features, chromosome)
        residuals = self.labels.charges - predictions
        return (residuals ** 2).to_frame().sum()

    def breed(self, parents):
        """Produce offspring from ``parents``.

        Placeholder: currently returns ``parents`` unchanged, so each
        generation duplicates the fittest chromosomes.
        """
        # TODO: produce offspring
        # TODO: crossover
        # TODO: mutation
        return parents

    def display_fitness(self):
        """Print the best (lowest) and mean fitness of the current population."""
        f_values = self._population_fitness().values
        print('FITNESS - best: {:.5f}, mean: {:.5f}'.format(f_values.min(), f_values.mean()))

    def _population_fitness(self):
        """Return a float Series of fitness values indexed like the population."""
        scores = [float(np.ravel(self.chromosome_fitness(chromosome))[0])
                  for chromosome in self.population]
        return pd.Series(scores, index=self.population.index, dtype=float)
    

In [256]:
# Search settings: keep 100 chromosomes and breed from the fittest 20% of
# them each generation.
params = dict(population_size=100, breeding_rate=0.2)

g = GeneticAlgorithm(parameters=params, features=X_train, labels=y_train)

In [257]:
# Advance the population four generations; each step prints its fitness line.
for generation in range(4):
    g.next_generation()

FITNESS - best: 43822.23408, mean: 1163804.38591
FITNESS - best: 43822.23408, mean: 762549.08577
FITNESS - best: 43822.23408, mean: 490456.97012
FITNESS - best: 43822.23408, mean: 317508.88641


In [192]:
def predict(features, weights):
    """Return the dot product of ``features`` and ``weights``.

    With a 2-D ``features`` and a 1-D ``weights`` this is one linear-model
    prediction per row.
    """
    predictions = np.dot(features, weights)
    return predictions