In [1]:
import numpy as np
import pandas as pd

import math
import random
import operator
import itertools

from deap import algorithms,base,creator,tools,gp
from features1 import fill_age_1,add_title

# Load the two CSV files from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Split the training frame into the feature columns and the 'Survived' label column.
train_x = train.drop("Survived",axis=1)
train_y = train['Survived']

def MungeData(data):
    """Return an all-float copy of ``data`` with categorical columns encoded.

    The input frame is left untouched; every transformation is applied to a
    copy, so the function can safely be re-run on the same frame.

    Encoding rules:
      * 'Ticket', 'Name' and 'PassengerId' are dropped.
      * 'Sex': 1 for 'male', 0 for anything else (including missing).
      * 'Cabin': first letter mapped A..G -> 1..7 and T -> 8; missing or
        unrecognised values become 0.
      * 'Embarked': C -> 1, Q -> 2, S -> 3; missing or unrecognised -> 0.
      * every remaining missing value is replaced by -1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame whose columns are all of dtype float.
    """
    # Disabled feature engineering (helpers are imported from features1):
    #    data=add_title(data)
    #    data=fill_age_1(data)
    #    data.drop('Title', axis=1, inplace=True)
    deck_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
    port_codes = {'C': 1, 'Q': 2, 'S': 3}

    # drop() without inplace returns a new frame, so the caller's data is
    # never modified by the column assignments below.
    munged = data.drop(['Ticket', 'Name', 'PassengerId'], axis=1)

    # Sex
    munged['Sex'] = (munged['Sex'] == 'male').astype(float)

    # Cabin: only the leading deck letter matters; '0' stands in for "missing".
    deck_letter = munged['Cabin'].fillna('0').astype(str).str[0]
    munged['Cabin'] = deck_letter.map(deck_codes).fillna(0)

    # Embarked
    munged['Embarked'] = munged['Embarked'].map(port_codes).fillna(0)

    return munged.fillna(-1).astype(float)
# Encode both feature frames and convert everything to plain Python lists.
# Note that `train_x`, `test` and `train_y` are rebound here from pandas
# objects to lists, so this cell is not meant to be run twice.
train_x = MungeData(train_x).values.tolist()
test = MungeData(test).values.tolist()
train_y = train_y.tolist()

In [2]:
def proba(data):
    """Map a raw score to ``1 - sigmoid(score)``.

    Larger scores therefore yield probabilities closer to 0 and smaller
    (more negative) scores yield probabilities closer to 1.
    """
    sigmoid = 1. / (1. + np.exp(-data))
    return 1. - sigmoid

def predict(prob):
    """Turn a probability (scalar or array) into a hard 0/1 value by rounding."""
    return np.round(prob, decimals=0)

def protectedDiv(a,b):
    """Divide ``a`` by ``b``, returning 1.0 when the divisor is zero.

    The divisor is tested explicitly rather than relying on
    ``ZeroDivisionError``: numpy scalars (which the other primitives such as
    ``np.cos`` or ``np.minimum`` produce) do not raise on division by zero,
    they emit a warning and yield inf/nan instead.

    Parameters
    ----------
    a, b : float
        Scalar numerator and denominator.

    Returns
    -------
    float
        ``a / b``, or 1.0 if ``b`` equals zero.
    """
    if b == 0:
        return 1.0
    return a / b
def protectedLog(a):
    """Natural logarithm that substitutes 728 whenever the result is NaN.

    Only NaN is replaced; other non-finite results of ``np.log`` (for
    instance the value produced for an input of zero) are returned unchanged.
    """
    value = np.log(a)
    return 728 if np.isnan(value) else value

# Strongly typed primitive set: eight float inputs, one float output.
pset = gp.PrimitiveSetTyped('MAIN',itertools.repeat(float, 8), float)

# Primitives, listed as (callable, number of float arguments) in the order
# in which they are added to the set.
for primitive, arity in [
        (protectedLog, 1),
        (operator.add, 2),
        (operator.sub, 2),
        (operator.mul, 2),
        (protectedDiv, 2),
        (operator.neg, 1),
        (np.cos, 1),
        (np.sin, 1),
        (np.tanh, 1),
        (np.minimum, 2),
        (np.maximum, 2)]:
    pset.addPrimitive(primitive, [float] * arity, float)

# Terminals
pset.addEphemeralConstant("rand1", lambda: random.random()*2-1, float)
#pset.addEphemeralConstant("rand1000", lambda: round(random.random()*1000),float)
pset.addTerminal(np.pi,float)
pset.addTerminal(2.0,float)

# Give the positional arguments ARG0..ARG7 readable names.
pset.renameArguments(
    ARG0='PClass',
    ARG1='Sex',
    ARG2='Age',
    ARG3='SibSp',
    ARG4='Parch',
    ARG5='Fare',
    ARG6='Cabin',
    ARG7='Embarked',
)

In [4]:
# Single-objective fitness with weight -1.0, i.e. the value is minimised.
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
# An individual is a GP expression tree carrying that fitness.
creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
# 'expr' generates a random expression over `pset` with the half-and-half
# method, using min_=1 and max_=10 as the depth bounds.
toolbox.register('expr', gp.genHalfAndHalf, pset = pset, min_=1, max_=10)
# 'individual' wraps a freshly generated expression in creator.Individual.
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.expr)
# 'population' builds a list of individuals; the count is supplied by the caller.
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
# 'compile' turns an expression tree into a Python callable.
toolbox.register('compile', gp.compile, pset=pset)
def log_loss(y_true, y_prob):
    n = len(y_true)
    result=0.0
    for i in range(n):
        y_prob[i]=np.minimum(np.maximum(1e-15, y_prob[i]),1-1e-15)
        if y_true[i]:
            result+=np.log(y_prob[i])
        else:
            result+=np.log(1-y_prob[i])
    return 999.0 if np.isnan(result) else -1.0*result/n

def evalFitness(individual):
    """Score one GP individual on the training set.

    The expression tree is compiled to a callable, applied to every training
    row, squashed through ``proba`` and compared with ``train_y`` via
    ``log_loss``. The result is returned as a one-element tuple.
    """
    model = toolbox.compile(expr=individual)
    predictions = []
    for row in train_x:
        predictions.append(proba(model(*row)))
    return (log_loss(train_y, predictions),)

toolbox.register('evaluate', evalFitness)
def staticLimitCrossover(ind1, ind2, heightLimit):
    """One-point crossover that rejects trees taller than ``heightLimit``.

    Both trees are crossed in place with ``gp.cxOnePoint``. Any tree whose
    height exceeds ``heightLimit`` afterwards is restored from a copy taken
    before the crossover. Fitness values are not touched here; the caller is
    responsible for invalidating them.

    Returns
    -------
    tuple
        ``(ind1, ind2)`` - the same objects that were passed in - so the
        function can be used wherever DEAP expects a mating operator.
    """
    keepInd1, keepInd2 = toolbox.clone(ind1), toolbox.clone(ind2)
    gp.cxOnePoint(ind1, ind2)
    if ind1.height > heightLimit:
        ind1[:] = keepInd1
    if ind2.height > heightLimit:
        ind2[:] = keepInd2
    return ind1, ind2
        
def staticLimitCrossover1(ind1, ind2, heightLimit):
    """Leaf-biased one-point crossover that rejects trees taller than ``heightLimit``.

    Both trees are crossed in place with ``gp.cxOnePointLeafBiased`` (called
    with 0.2 as its third argument). Any tree whose height exceeds
    ``heightLimit`` afterwards is restored from a copy taken before the
    crossover. Fitness values are not touched here.

    Returns
    -------
    tuple
        ``(ind1, ind2)`` - the same objects that were passed in - so the
        function can be used wherever DEAP expects a mating operator.
    """
    keepInd1, keepInd2 = toolbox.clone(ind1), toolbox.clone(ind2)
    gp.cxOnePointLeafBiased(ind1, ind2, 0.2)
    if ind1.height > heightLimit:
        ind1[:] = keepInd1
    if ind2.height > heightLimit:
        ind2[:] = keepInd2
    return ind1, ind2

def staticLimitMutation(individual, expr, pset, heightLimit):
    """Uniform subtree mutation that rejects trees taller than ``heightLimit``.

    ``gp.mutUniform`` is applied in place using ``expr`` and ``pset``. If the
    mutated tree is taller than ``heightLimit`` it is restored from a copy
    taken beforehand. The fitness is not touched here.

    Returns
    -------
    tuple
        ``(individual,)`` - a one-element tuple holding the object that was
        passed in, matching the DEAP convention for mutation operators.
    """
    keepInd = toolbox.clone(individual)
    gp.mutUniform(individual, expr,pset)
    if individual.height > heightLimit:
        individual[:] = keepInd
    return individual,

def staticLimitMutation1(individual, heightLimit):
    """Ephemeral-constant mutation guarded by a height limit.

    ``gp.mutEphemeral`` is applied in place with ``mode='one'``. If the
    resulting tree is taller than ``heightLimit`` it is restored from a copy
    taken beforehand. The fitness is not touched here.

    Returns
    -------
    tuple
        ``(individual,)`` - a one-element tuple holding the object that was
        passed in, matching the DEAP convention for mutation operators.
    """
    keepInd = toolbox.clone(individual)
    gp.mutEphemeral(individual, mode='one')
    if individual.height > heightLimit:
        individual[:] = keepInd
    return individual,

def staticLimitMutation2(individual, pset, heightLimit):
    """Node-replacement mutation guarded by a height limit.

    ``gp.mutNodeReplacement`` is applied in place using ``pset``. If the
    resulting tree is taller than ``heightLimit`` it is restored from a copy
    taken beforehand. The fitness is not touched here.

    Returns
    -------
    tuple
        ``(individual,)`` - a one-element tuple holding the object that was
        passed in, matching the DEAP convention for mutation operators.
    """
    keepInd = toolbox.clone(individual)
    gp.mutNodeReplacement(individual, pset)
    if individual.height > heightLimit:
        individual[:] = keepInd
    return individual,

def staticLimitMutation3(individual, pset, heightLimit):
    """Insertion mutation that rejects trees taller than ``heightLimit``.

    ``gp.mutInsert`` is applied in place using ``pset``. If the resulting
    tree is taller than ``heightLimit`` it is restored from a copy taken
    beforehand. The fitness is not touched here.

    Returns
    -------
    tuple
        ``(individual,)`` - a one-element tuple holding the object that was
        passed in, matching the DEAP convention for mutation operators.
    """
    keepInd = toolbox.clone(individual)
    gp.mutInsert(individual, pset)
    if individual.height > heightLimit:
        individual[:] = keepInd
    return individual,

def staticLimitMutation4(individual, heightLimit):
    """Shrink mutation guarded by a height limit.

    ``gp.mutShrink`` is applied in place. If the resulting tree is taller
    than ``heightLimit`` it is restored from a copy taken beforehand. The
    fitness is not touched here.

    Returns
    -------
    tuple
        ``(individual,)`` - a one-element tuple holding the object that was
        passed in, matching the DEAP convention for mutation operators.
    """
    keepInd = toolbox.clone(individual)
    gp.mutShrink(individual)
    if individual.height > heightLimit:
        individual[:] = keepInd
    return individual,
        
def selDoubleTournament(individuals, k, fitTournSize, sizeTournSize):
    """Select ``k`` individuals with a size tournament nested in a fitness tournament.

    Each of the ``fitTournSize`` participants of a fitness tournament is
    itself the winner of a two-way size tournament: two individuals are
    drawn at random and the one with the smaller ``height`` wins with
    probability ``sizeTournSize / 2``. Among the participants the one with
    the greatest ``fitness`` (as defined by the fitness object's own
    comparison) is kept.

    Parameters
    ----------
    individuals : sequence
        Candidates exposing ``height`` and ``fitness`` attributes.
    k : int
        Number of individuals to select.
    fitTournSize : int
        Number of participants in each fitness tournament.
    sizeTournSize : float
        Parsimony pressure; divided by two to obtain the probability that
        the smaller tree wins a size tournament.

    Returns
    -------
    list
        ``k`` references to members of ``individuals`` (not copies; the same
        individual may appear more than once).
    """
    def _sizeTournament(candidates, parsimonySize):
        # Two random aspirants; the smaller tree wins with prob. parsimonySize/2.
        aspirant1 = random.choice(candidates)
        aspirant2 = random.choice(candidates)
        s1, s2 = aspirant1.height, aspirant2.height
        if s1 < s2:
            return aspirant1 if random.random() < parsimonySize / 2.0 else aspirant2
        elif s1 > s2:
            return aspirant2 if random.random() < parsimonySize / 2.0 else aspirant1
        else:
            return random.choice([aspirant1, aspirant2])

    chosen = []
    # range() keeps the function usable on both Python 2 and Python 3.
    for _ in range(k):
        winner = _sizeTournament(individuals, sizeTournSize)
        for _ in range(fitTournSize - 1):
            aspirant = _sizeTournament(individuals, sizeTournSize)
            if aspirant.fitness > winner.fitness:
                winner = aspirant
        chosen.append(winner)
    return chosen

# Height bound shared by every size-limited variation operator below.
MAX_TREE_HEIGHT = 17

# Selection operators, each under its own alias.
toolbox.register('selectW', tools.selWorst)
toolbox.register('selectR', tools.selRoulette)
toolbox.register('selectT', tools.selTournament, tournsize=3)
toolbox.register('selectB', tools.selBest)
toolbox.register('selectSPEA', tools.selSPEA2)
toolbox.register('select', selDoubleTournament)

# Crossover operators.
toolbox.register('mate', staticLimitCrossover, heightLimit=MAX_TREE_HEIGHT)
toolbox.register('mateL', staticLimitCrossover1, heightLimit=MAX_TREE_HEIGHT)

# Mutation operators.
toolbox.register('mutateS', staticLimitMutation4, heightLimit=MAX_TREE_HEIGHT)
toolbox.register('mutateI', staticLimitMutation3, pset=pset, heightLimit=MAX_TREE_HEIGHT)
toolbox.register('mutateR', staticLimitMutation2, pset=pset, heightLimit=MAX_TREE_HEIGHT)
toolbox.register('mutateE', staticLimitMutation1, heightLimit=MAX_TREE_HEIGHT)
toolbox.register('mutateU', staticLimitMutation, expr=toolbox.expr, pset=pset,
                 heightLimit=MAX_TREE_HEIGHT)



In [None]:
# Elite Approach + New Blood Mechanism (2016)
# Target fitness (want to minimize): evolution stops once the best individual
# recorded in the hall of fame scores at or below `target`.
size=100
target = 0.3
gen=0
PR_X = 0.75
PR_M = 0.05


def _evaluate_invalid(individuals):
    """Assign a fitness to every individual whose fitness is not valid."""
    for candidate in individuals:
        if not candidate.fitness.valid:
            candidate.fitness.values = toolbox.evaluate(candidate)


pop = toolbox.population(n=size)
_evaluate_invalid(pop)
hof = tools.HallOfFame(1)
kick_start = toolbox.selectB(pop,1)[0]
hof.insert(kick_start)
# Sliding window (at most 7 entries) of the mean fitness of the selected set.
previous_fitness_df=[]
cleanse = False
while hof[0].fitness.values[0] > target:
    gen+=1
    # copy best individual (elitism: it is appended to the next population)
    best=pop[0]
    for ind in pop:
        if ind.fitness>best.fitness:
            best=ind

    selected = toolbox.select(pop,size, 3, 1.4)
    # List comprehensions instead of map(): identical on Python 2, and still
    # a real list (len(), slicing, +) on Python 3.
    parents = [toolbox.clone(ind) for ind in selected]
    selected_mean = np.average([ind.fitness.values[0] for ind in selected])
    previous_fitness_df.append(selected_mean)
    if len(previous_fitness_df)>7:
        del previous_fitness_df[0]
    # Stagnation: mean fitness moved by less than 1e-4 across the window.
    # Keep the best 10% of the parents and replace the rest with new blood.
    if len(previous_fitness_df)==7 and np.fabs(previous_fitness_df[0]-previous_fitness_df[6])<1e-4:
        print("Reset!!")
        pop1 = toolbox.selectB(parents, int(round(0.1*size)))
        pop2 = toolbox.population(int(round(0.9*size)))
        pop11 = [toolbox.clone(ind) for ind in pop1]
        pop22 = [toolbox.clone(ind) for ind in pop2]
        parents[:] = pop11+pop22
        _evaluate_invalid(parents)
        cleanse = True

    # "%s" formatting prints the same text as the former print statement.
    print("%s %s %s" % (gen, best.fitness.values[0], selected_mean))

    # crossing
    n=PR_X*len(parents)
    n-=n%2
    if cleanse:
        # NOTE(review): random()/2 + 0.5 lies in [0.5, 1.0), so after rounding
        # m is effectively always 1 - a single new-blood individual joins the
        # mating pool. Possibly a fraction of n was intended; confirm.
        m = round((random.random()/2)+0.5)
        # NOTE(review): DEAP documents roulette selection as unsuitable for
        # minimisation problems; confirm this is the intended operator.
        crossing = toolbox.selectR(pop22,int(round(m)))+toolbox.selectR(pop11,int(n)-int(round(m)))
        # Roulette selection samples with replacement, so the same object can
        # appear several times. Clone each pick so the in-place crossover
        # never operates on aliased trees.
        crossing = [toolbox.clone(ind) for ind in crossing]
        random.shuffle(crossing)
        cleanse=False
    else:
        crossing = toolbox.selectSPEA(parents,int(n))

    for ind1,ind2 in zip(crossing[::2],crossing[1::2]):
        toolbox.mate(ind1,ind2)
        del ind1.fitness.values
        del ind2.fitness.values

    offspring = [toolbox.clone(ind) for ind in selected+crossing]

    # mutation
    for ind in offspring:
        if random.random() < PR_M:
            toolbox.mutateU(ind)
            del ind.fitness.values

    # re-evaluate fitness
    _evaluate_invalid(offspring)

    pop[:]=offspring+[best]
    # Keep the hall of fame current; without this the loop condition above
    # would only ever see the initial best individual and never terminate.
    hof.update(pop)
    
    
        

1 0.569472736693 1.2507267618
2 0.56802816906 0.784656670185
3 0.56802816906 0.584721162723
4 0.56802816906 0.569443845341
5 0.56802816906 0.569371616959
6 0.56802816906 0.569198268843
7 0.56802816906 0.569097149109
8 0.56802816906 0.568851572611
9 0.56802816906 0.568692670171
10 0.56802816906 0.568475985026
11 0.56802816906 0.570538782105
12 0.56802816906 0.569377678942
13 0.56802816906 0.568681255486
14 0.56802816906 0.648071156336
15 0.56802816906 0.571425417418
16 0.56802816906 0.570408771018
17 0.56802816906 0.568695701163
18 0.56802816906 0.568681255486
19 0.56802816906 0.568085951765
20 0.56802816906 0.574343540392
21 0.56802816906 0.568071506089
22 0.56802816906 0.569741238915
23 0.56802816906 0.568796820897
24 0.56802816906 0.568071506089
25 0.567314780634 0.570358300105
26 0.567314780634 0.570151257874
Reset!!
27 0.567314780634 0.568057594136
28 0.567314780634 0.588462207905
29 0.567314780634 0.569213065881
30 0.567314780634 0.575857361837
31 0.567314780634 0.569306071975
32 

In [40]:
# Relative difference (a - b) / a between two hard-coded values
# (presumably fitness values copied from a run - TODO confirm their origin).
(0.563850447348-0.56349857308)/0.563850447348

0.0006240560234634785