In [1]:
import numpy as np
import pandas as pd

import math
import random
import operator
import itertools

from deap import algorithms,base,creator,tools,gp
from features1 import fill_age_1,add_title

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_x = train.drop("Survived",axis=1)
train_y = train['Survived']



In [2]:
def MungeData(data):
    # Sex
#    data=add_title(data)   
#    data=fill_age_1(data)
#    data.drop('Title', axis=1, inplace=True)
    data.drop(['Ticket', 'Name', 'PassengerId'], inplace=True, axis=1)
    data.Sex.fillna('0', inplace=True)
    data.loc[data.Sex != 'male', 'Sex'] = 0
    data.loc[data.Sex == 'male', 'Sex'] = 1
    # Cabin
    data.Cabin.fillna('0', inplace=True)
    data.loc[data.Cabin.str[0] == 'A', 'Cabin'] = 1
    data.loc[data.Cabin.str[0] == 'B', 'Cabin'] = 2
    data.loc[data.Cabin.str[0] == 'C', 'Cabin'] = 3
    data.loc[data.Cabin.str[0] == 'D', 'Cabin'] = 4
    data.loc[data.Cabin.str[0] == 'E', 'Cabin'] = 5
    data.loc[data.Cabin.str[0] == 'F', 'Cabin'] = 6
    data.loc[data.Cabin.str[0] == 'G', 'Cabin'] = 7
    data.loc[data.Cabin.str[0] == 'T', 'Cabin'] = 8
    # Embarked
    data.loc[data.Embarked == 'C', 'Embarked'] = 1
    data.loc[data.Embarked == 'Q', 'Embarked'] = 2
    data.loc[data.Embarked == 'S', 'Embarked'] = 3
    data.Embarked.fillna(0, inplace=True)
    data.fillna(-1, inplace=True)

    return data.astype(float)
train_x = MungeData(train_x).values.tolist()
test = MungeData(test).values.tolist()
train_y = train_y.tolist()

In [3]:
def proba(data):
    return (1.-(1./(1.+np.exp(-data))))

def predict(prob):
    return np.round(prob)

def protectedDiv(a,b):
    try:
        return a/b
    except ZeroDivisionError:
        return 1
def protectedLog(a):
    x=np.log(a)
    if np.isnan(x):
        return 728
    return x

pset = gp.PrimitiveSetTyped('MAIN',itertools.repeat(float, 8), float)
pset.addPrimitive(protectedLog, [float], float)
pset.addPrimitive(operator.add, [float,float],float)
pset.addPrimitive(operator.sub, [float,float],float)
pset.addPrimitive(operator.mul, [float,float],float)
pset.addPrimitive(protectedDiv, [float,float],float)
pset.addPrimitive(operator.neg, [float],float)
pset.addPrimitive(np.cos,[float],float)
pset.addPrimitive(np.sin,[float],float)
pset.addPrimitive(np.tanh,[float],float)
pset.addPrimitive(np.minimum, [float,float],float)
pset.addPrimitive(np.maximum, [float,float],float)
# Terminals
pset.addEphemeralConstant("rand1", lambda: random.random()*2-1, float)
#pset.addEphemeralConstant("rand1000", lambda: round(random.random()*1000),float)
pset.addTerminal(np.pi,float)
pset.addTerminal(2.0,float)
pset.renameArguments(ARG0='PClass')
pset.renameArguments(ARG1='Sex')
pset.renameArguments(ARG2='Age')
pset.renameArguments(ARG3='SibSp')
pset.renameArguments(ARG4='Parch')
pset.renameArguments(ARG5='Fare')
pset.renameArguments(ARG6='Cabin')
pset.renameArguments(ARG7='Embarked')

In [4]:
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register('expr', gp.genHalfAndHalf, pset = pset, min_=1, max_=4)
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('compile', gp.compile, pset=pset)

In [5]:
def log_loss(y_true, y_prob):
    n = len(y_true)
    result=0.0
    for i in range(n):
        y_prob[i]=np.minimum(np.maximum(1e-15, y_prob[i]),1-1e-15)
        if y_true[i]:
            result+=np.log(y_prob[i])
        else:
            result+=np.log(1-y_prob[i])
    return 999.0 if np.isnan(result) else -1.0*result/n

def evalFitness(individual):
    func = toolbox.compile(expr=individual)
    return log_loss(train_y,[proba(func(*x)) for x in train_x]),
    
toolbox.register('evaluate', evalFitness)

In [6]:
def staticLimitCrossover(ind1, ind2, heightLimit):
    keepInd1, keepInd2 = toolbox.clone(ind1), toolbox.clone(ind2)
    gp.cxOnePoint(ind1, ind2)
    if ind1.height > heightLimit:
        ind1[:] = keepInd1
    if ind2.height > heightLimit:
        ind2[:] = keepInd2
        
def staticLimitCrossover1(ind1, ind2, heightLimit):
    keepInd1, keepInd2 = toolbox.clone(ind1), toolbox.clone(ind2)
    gp.cxOnePointLeafBiased(ind1, ind2, 0.2)
    if ind1.height > heightLimit:
        ind1[:] = keepInd1
    if ind2.height > heightLimit:
        ind2[:] = keepInd2

def staticLimitMutation(individual, expr, pset, heightLimit):
    keepInd = toolbox.clone(individual)
    gp.mutUniform(individual, expr,pset)
    if individual.height > heightLimit:
        individual[:] = keepInd 

def staticLimitMutation1(individual, heightLimit):
    keepInd = toolbox.clone(individual)
    gp.mutEphemeral(individual, mode='one')
    if individual.height > heightLimit:
        individual[:] = keepInd 

def staticLimitMutation2(individual, pset, heightLimit):
    keepInd = toolbox.clone(individual)
    gp.mutNodeReplacement(individual, pset)
    if individual.height > heightLimit:
        individual[:] = keepInd 

def staticLimitMutation3(individual, pset, heightLimit):
    keepInd = toolbox.clone(individual)
    gp.mutInsert(individual, pset)
    if individual.height > heightLimit:
        individual[:] = keepInd 

def staticLimitMutation4(individual, heightLimit):
    keepInd = toolbox.clone(individual)
    gp.mutShrink(individual)
    if individual.height > heightLimit:
        individual[:] = keepInd 
toolbox.register('selectW', tools.selWorst)
toolbox.register('selectR', tools.selRoulette)
toolbox.register('selectT', tools.selTournament, tournsize=3)
toolbox.register('selectB', tools.selBest)
toolbox.register('selectSPEA', tools.selSPEA2)
toolbox.register('select', tools.selDoubleTournament)


toolbox.register("mate", staticLimitCrossover, heightLimit=17)
toolbox.register("mateL", staticLimitCrossover1, heightLimit=17)
toolbox.register("mutateS",  staticLimitMutation4, heightLimit=17) 
toolbox.register("mutateI",  staticLimitMutation3, pset=pset, heightLimit=17) 
toolbox.register("mutateR",  staticLimitMutation2, pset=pset, heightLimit=17) 
toolbox.register("mutateE",  staticLimitMutation1, heightLimit=17) 
toolbox.register("mutateU", staticLimitMutation, expr=toolbox.expr, pset=pset, heightLimit=17) 


def same_hof(h1,h2):
    for i in range(len(h1)):
        if not h1[i][0]==h2[i][0]:
            return False
    return True    


In [28]:
# Initialize parameters
NGEN = 40000
size=500
MUT_R = 0.2
MUT_I = 0.2
MUT_U = 0.4
MUT_E = 0.4
MUT_S = 0.2

PROB_X_MAX = 0.9
PROB_X_MIN = 0.2

# Initialize containers
pop = toolbox.population(n=size)
hof = tools.HallOfFame(1)


fitnesses  = map(toolbox.evaluate, pop)
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit
current_best = toolbox.selectB(pop,1)
hof.insert(current_best[0])


for g in range(NGEN):
    # clone population
    offspring = map(toolbox.clone, pop)
    
    # replace individuals with invalid fitness
    shitz = [ind for ind in offspring if ind.fitness.values[0]<30.0]
    part1 = map(toolbox.clone, shitz)
    for i in range(len(offspring)-len(part1)):
        x=999.0
        ind = toolbox.population(n=1)
        while x >30:
            ind = toolbox.population(n=1)
            f = map(toolbox.evaluate, ind)
            x=f[0][0]
            ind[0].fitness.values = f[0]
        part1+=ind
    
    fitnesses = [ind.fitness.values[0] for ind in part1]
    f_ave = np.mean(fitnesses)
    f_min = min(fitnesses)
    
    probab = []
    for i in range(len(fitnesses)):
        if fitnesses[i]>f_ave:
            probab.append(PROB_X_MAX)
        else:
            probab.append(((PROB_X_MAX-PROB_X_MIN)/(1+np.exp(.5*(fitnesses[i]-f_ave)/(f_min-f_ave))))+PROB_X_MIN)
    
    for i in range (10):
        print f_min, f_ave, fitnesses[i], probab[i] 
    # Mate
    a = []
    b = []
    for i in range(len(part1)):
        if random.random < fitnesses[i]:
            a.append(part1[i])
        else:
            b.append(part1[i])
    if len(a)%2==1:
        a.pop() 
        
    for c1,c2 in zip(a[::2],a[1::2]):
        toolbox.mate(c1,c2)
        del c1.fitness.values
        del c2.fitness.values
    
    offspring[:] = a+b
    # Mutate
    #b = toolbox.selectW(offspring, size/2)
    for mutant in offspring:
        if random.random()<0.1:
            if random.random()<MUT_I:
                toolbox.mutateI(mutant)
            if random.random()< MUT_R:
                toolbox.mutateR(mutant)
            if random.random()< MUT_U:
                toolbox.mutateU(mutant)
            if random.random()< MUT_E:
                toolbox.mutateE(mutant)
            if random.random()< MUT_S:
                toolbox.mutateS(mutant)
            del mutant.fitness.values

    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]

    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):

        ind.fitness.values = fit
        
    pop[:]=offspring
    hof_t=map(toolbox.clone, hof)
    hof.update(pop)

    result = str(g) + " " + str(hof[0].fitness.values[0])
    print result

#    print str(gp.PrimitiveTree(hof[0]))
#    if same_hof(hof_t,hof):
#        count+=1
#    else:
#        count =0
#    if count>30:
        

0.564092966862 2.98230991581 0.68183457279 0.468295012977
0.564092966862 2.98230991581 0.676896122421 0.468126079677
0.564092966862 2.98230991581 13.2573081112 0.9
0.564092966862 2.98230991581 0.806479587622 0.472571976705
0.564092966862 2.98230991581 0.877604705986 0.475023539875
0.564092966862 2.98230991581 0.621806175814 0.466244328539
0.564092966862 2.98230991581 0.567763532363 0.464403327439
0.564092966862 2.98230991581 1.25251404778 0.488068400173
0.564092966862 2.98230991581 0.848240706701 0.474010461639
0.564092966862 2.98230991581 0.694171031051 0.468717191317
0 0.564092966862
0.564092966862 3.09707121577 0.68183457279 0.468112533356
0.564092966862 3.09707121577 0.676896122421 0.46795129463
0.564092966862 3.09707121577 13.2573081112 0.9
0.564092966862 3.09707121577 0.806479587622 0.472194157271
0.564092966862 3.09707121577 0.877604705986 0.474533318387
0.564092966862 3.09707121577 0.621806175814 0.466155139116
0.564092966862 3.09707121577 0.567763532363 0.464397669956
0.564092

0.569472736693 3.18159177906 1.8836182334 0.506743148122
0.569472736693 3.18159177906 13.2573081112 0.9
0.569472736693 3.18159177906 0.681248983933 0.467807228026
0.569472736693 3.18159177906 0.621806175814 0.465928371603
0.569472736693 3.18159177906 0.70983946274 0.468712706854
0.569472736693 3.18159177906 1.35368469737 0.489386418632
0.569472736693 3.18159177906 0.848240706701 0.473112036458
0.569472736693 3.18159177906 0.694171031051 0.468216334209
0.569472736693 3.18159177906 0.695670244527 0.468263813964
20 0.564092966862
0.569472736693 3.21185760348 0.68183457279 0.467785028762
0.569472736693 3.21185760348 1.8836182334 0.506246724504
0.569472736693 3.21185760348 13.2573081112 0.9
0.569472736693 3.21185760348 0.681248983933 0.467766707763
0.569472736693 3.21185760348 0.621806175814 0.465909451008
0.569472736693 3.21185760348 0.70983946274 0.468661757469
0.569472736693 3.21185760348 1.35368469737 0.489094598271
0.569472736693 3.21185760348 0.848240706701 0.473010246746
0.5694727366

0.569472736693 3.31407635674 5.74856440083 0.9
0.569472736693 3.31407635674 23.9178559394 0.9
0.569472736693 3.31407635674 0.699211534151 0.468177632624
0.569472736693 3.31407635674 0.70983946274 0.468498013666
0.569472736693 3.31407635674 1.45242380748 0.491211551341
0.569472736693 3.31407635674 0.69177643889 0.467953586345
0.569472736693 3.31407635674 13.2573081112 0.9
0.569472736693 3.31407635674 0.695670244527 0.468070911914
46 0.541749323165
0.569472736693 3.32078710453 0.68307484131 0.467683120952
0.569472736693 3.32078710453 1.07135051916 0.479441086132
0.569472736693 3.32078710453 5.74856440083 0.9
0.569472736693 3.32078710453 23.9178559394 0.9
0.569472736693 3.32078710453 0.699211534151 0.468168095495
0.569472736693 3.32078710453 0.70983946274 0.468487690618
0.569472736693 3.32078710453 1.45242380748 0.491144831874
0.569472736693 3.32078710453 0.69177643889 0.467944598618
0.569472736693 3.32078710453 13.2573081112 0.9
0.569472736693 3.32078710453 0.695670244527 0.468061636505


KeyboardInterrupt: 

SyntaxError: invalid syntax (<ipython-input-36-f3a5b0639d3f>, line 1)

In [25]:
str(gp.PrimitiveTree(hof[0]))


'add(add(sub(Sex, 3.141592653589793), PClass), sub(mul(Age, minimum(SibSp, 3.141592653589793)), sub(protectedDiv(maximum(3.141592653589793, Embarked), Age), tanh(3.141592653589793))))'