In [68]:
import pandas as pd
import numpy as np
import xgboost as xgb
from random import randint,uniform
from sklearn.model_selection import train_test_split
from time import time
from sklearn.preprocessing import LabelEncoder

In [3]:
# Genetic-algorithm settings.
# popSize / nGeneration: individuals per generation and number of generations.
popSize, nGeneration = 30, 100
# eliteSize: fraction of the population carried over unchanged.
# mutation_rate: probability used for each mutation trial.
eliteSize, mutation_rate = 0.1, 0.2

In [4]:
# Columns of the population frame: one row per candidate hyper-parameter set.
# 'fitness' must stay last: the mutation step in createNewPopulation picks
# among every column except the final one.
paramList = ['depth','max_bin','eta','gamma','min_child_weight','colsample_bytree','subsample',
            'fitness']

# Build the random initial population in one step instead of allocating a
# zero-filled frame and overwriting it column by column.
population = pd.DataFrame(
    {
        'depth': [randint(6,15) for _ in range(popSize)],
        'max_bin': [randint(80,150) for _ in range(popSize)],
        'eta': [uniform(0.1,1) for _ in range(popSize)],
        'gamma': [uniform(0.01,0.03) for _ in range(popSize)],
        'min_child_weight': [randint(1,20) for _ in range(popSize)],
        'colsample_bytree': [uniform(0.7,0.95) for _ in range(popSize)],
        'subsample': [uniform(0.7,0.95) for _ in range(popSize)],
        # Every individual starts with the same placeholder fitness of 100
        # (previously spelled uniform(100,100), which always yields 100.0).
        'fitness': [100.0]*popSize,
    },
    columns=paramList,
)

In [5]:
# Preview the first rows of the randomly initialised population.
population.head()

Unnamed: 0,depth,max_bin,eta,gamma,min_child_weight,colsample_bytree,subsample,fitness
0,13,85,0.585907,0.020017,13,0.804254,0.822173,100.0
1,13,94,0.992587,0.018321,2,0.864237,0.713242,100.0
2,14,139,0.218783,0.029458,16,0.880375,0.906083,100.0
3,7,137,0.97205,0.028656,9,0.768531,0.830018,100.0
4,13,101,0.540155,0.022369,12,0.870652,0.913449,100.0


In [6]:
# Sanity check: (number of individuals, number of columns in paramList).
population.shape

(30, 8)

In [7]:
def _mutateGene(attr,value):
    """Return ``value`` randomly perturbed and clamped for hyper-parameter ``attr``.

    Unknown attribute names are returned unchanged.
    """
    if attr=='depth':
        return max(3,value+randint(-2,2))
    if attr=='max_bin':
        return max(70,value+randint(-20,20))
    if attr=='eta':
        # XGBoost documents eta as lying in [0, 1]; keep the mutated value inside it.
        return min(max(0.1,value+uniform(-0.05,0.05)),1.0)
    if attr=='gamma':
        # Floor at 0: the former floor of 0.1 (copied from eta) forced every
        # mutated gamma to 0.1, because the perturbation is only +/-0.005.
        return max(0.0,value+uniform(-0.005,0.005))
    if attr=='min_child_weight':
        return max(0,value+randint(-2,2))
    if attr in ('colsample_bytree','subsample'):
        return min(max(0.6,value+uniform(-0.05,0.05)),1.0)
    return value


def createNewPopulation(population,eliteSize=eliteSize,mutation_rate=mutation_rate,popSize=popSize):
    """Breed the next generation from ``population``.

    The rows are sorted by ascending ``fitness`` (lowest first). The first
    ``round(eliteSize * n)`` rows are carried over unchanged; every other row
    is replaced by a child whose columns are each copied from one of two
    parents drawn at random from rows ``0 .. n//2`` of the sorted frame.
    After crossover the child undergoes mutation trials.

    Parameters
    ----------
    population : pandas.DataFrame
        Current generation; must contain a ``fitness`` column, and the last
        column is excluded from mutation.
    eliteSize : float
        Fraction of rows kept unchanged.
    mutation_rate : float
        Probability that a single mutation trial fires.
    popSize : int
        Ignored; the size is always taken from ``population``. Kept so that
        existing callers passing it keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and columns. The input frame is not
        modified.
    """
    # Sort into a new frame so the caller's DataFrame is not reordered as a side effect.
    population = population.sort_values(['fitness'],ascending=True).reset_index(drop=True)
    popSize = population.shape[0]
    nElite = int(round(eliteSize*popSize))

    new_population = population.copy(deep=True)
    columns = list(new_population.columns.values)
    # Every column except the last one is eligible for mutation.
    mutable = columns[:-1]

    for i in range(nElite,popSize):
        # create father and mother
        p1 = randint(0,int(popSize/2))
        p2 = randint(0,int(popSize/2))

        # Crossover: each column comes from one parent or the other.
        for attr in columns:
            donor = p1 if uniform(0,1)>0.5 else p2
            new_population.loc[i,attr] = population.loc[donor,attr]

        # Mutation runs only after crossover has finished; when the two were
        # interleaved, a mutated gene could be overwritten by a later
        # crossover assignment. The number of trials (one per column, each
        # hitting a randomly chosen gene) is unchanged.
        if mutable:
            for _ in columns:
                if uniform(0,1)<mutation_rate:
                    gene = mutable[randint(0,len(mutable)-1)]
                    new_population.loc[i,gene] = _mutateGene(gene,new_population.loc[i,gene])
    return new_population

In [42]:
def testInstance(population,i,dtrain,num_class=None):
    """Cross-validate the hyper-parameters in row ``i`` and return a fitness to minimise.

    Parameters
    ----------
    population : pandas.DataFrame
        Frame with columns ``eta``, ``depth``, ``subsample``,
        ``colsample_bytree``, ``max_bin``, ``gamma`` and ``min_child_weight``.
    i : int
        Index label of the row to evaluate.
    dtrain : xgboost.DMatrix
        Training data passed to ``xgb.cv``.
    num_class : int, optional
        Forwarded to XGBoost as ``num_class`` only when given. It defaults to
        ``None`` so that callers passing three arguments keep working.

    Returns
    -------
    float
        ``1 - AUC`` where AUC is the mean test AUC of the last boosting round
        (3-fold CV, 30 rounds). The genetic algorithm sorts fitness ascending
        and treats row 0 as the best individual, so the score must be
        "lower is better"; returning the raw AUC made it select the worst models.
    """
    params = {
        'objective':'binary:logitraw',
        'tree_method':'hist',
        'eta':float(population.loc[i,'eta']),
        # Cast to int: these are integer-valued settings and the frame may
        # hold them as floats.
        'max_depth':int(population.loc[i,'depth']),
        'subsample':float(population.loc[i,'subsample']),
        'colsample_bytree':float(population.loc[i,'colsample_bytree']),
        'max_bin':int(population.loc[i,'max_bin']),
        'gamma':float(population.loc[i,'gamma']),
        'min_child_weight':float(population.loc[i,'min_child_weight']),
        # NOTE(review): 'silent' may be deprecated in favour of 'verbosity'
        # in newer XGBoost releases -- confirm against the installed version.
        'silent':1,
        'seed':401,
    }
    if num_class is not None:
        params['num_class'] = num_class
    history = xgb.cv(
        params,
        dtrain,
        num_boost_round = 30,
        nfold=3,
        verbose_eval = False,
        metrics = 'auc'
        )
    return 1.0 - history['test-auc-mean'].iloc[-1]

In [10]:
def printResult(filename,population,i,generation):
    """Append the fitness and hyper-parameters of row ``i`` to ``filename``.

    Parameters
    ----------
    filename : str
        Path of the text file; it is opened in append mode.
    population : pandas.DataFrame
        Frame holding the ``fitness`` column and the hyper-parameter columns.
    i : int
        Index label of the row to report.
    generation : int
        Generation number written in the header line.

    One header line is written, followed by one ``"name":value`` line per
    hyper-parameter. ``gamma`` is appended after the previously logged
    parameters so the record contains every tuned value.
    """
    # (label written to the file, column name in the frame)
    fields = [
        ('eta','eta'),
        ('max_depth','depth'),
        ('subsample','subsample'),
        ('colsample_bytree','colsample_bytree'),
        ('min_child_weight','min_child_weight'),
        ('max_bin','max_bin'),
        ('gamma','gamma'),
    ]
    # Format everything before opening the file so that a missing column
    # cannot leave a half-written record behind.
    lines = ['Generation %d Best fitness %f\n' % (generation,population.loc[i,'fitness'])]
    lines.extend('"%s":%f\n' % (label,population.loc[i,col]) for label,col in fields)
    # The context manager closes the file even if a write fails.
    with open(filename,'a') as f1:
        f1.writelines(lines)

In [12]:
def evolve(dtrain,nGeneration=nGeneration,popSize=popSize,eliteSize=eliteSize,population=population,num_class=None):
    """Run the genetic search and return the final population.

    For each generation a new population is bred with
    ``createNewPopulation``, every non-elite row is scored with
    ``testInstance``, the frame is sorted by ascending fitness and row 0 is
    appended to ``xgb_result.txt``.

    Parameters
    ----------
    dtrain : xgboost.DMatrix
        Training data forwarded to ``testInstance``.
    nGeneration : int
        Number of generations to run.
    popSize : int
        Ignored; the size is taken from ``population`` so the loop bounds
        always match the frame. Kept for backward compatibility.
    eliteSize : float
        Fraction of rows carried over without re-evaluation; also forwarded
        to ``createNewPopulation`` so both steps agree on the elite count.
    population : pandas.DataFrame
        Starting population.
    num_class : int, optional
        Forwarded to ``testInstance``, which requires this argument.

    Returns
    -------
    pandas.DataFrame
        The population after the last generation, sorted by ascending
        fitness (previously nothing was returned and the result was lost).
    """
    for gen in range(nGeneration):
        print('Generation %d\n' %gen)
        population = createNewPopulation(population,eliteSize=eliteSize)
        nRows = population.shape[0]
        nElite = int(round(eliteSize*nRows))
        for i in range(nElite,nRows):
            print('testing instance %d' %i)
            population.loc[i,'fitness'] = testInstance(population,i,dtrain,num_class)
            print('--Fitness %f \n' %population.fitness[i])
        population = population.sort_values(['fitness'],ascending=True).reset_index(drop=True)
        printResult('xgb_result.txt',population,0,gen)
        # The old message claimed "5-fold mae cv", which is not what testInstance computes.
        print('Generation %d Best fitness: %f' %(gen,population.fitness[0]))
    return population

In [44]:
# Load the Titanic passenger table from a CSV file in the working directory.
data = pd.read_csv('titanic_data.csv')

In [45]:
# Preview the raw data as loaded.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
# Report how many missing values each column contains.
for col, n_missing in data.isnull().sum().items():
    print(col, n_missing)

PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2


In [63]:
# Impute missing ages with the mean age. The previous call passed the function
# object np.mean as the fill value, which put that object into the missing
# cells instead of a number. Assigning the result back also avoids the
# chained inplace call on an attribute-accessed column.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [64]:
# Replace missing cabins with 0. The result is assigned back explicitly because
# an inplace fillna on an attribute-accessed column is chained assignment and
# is not guaranteed to modify `data`.
# NOTE(review): 0 is an integer placed in a column of strings; consider a
# string placeholder if this column is encoded later -- confirm.
data['Cabin'] = data['Cabin'].fillna(0)

In [65]:
# Replace missing embarkation ports with 0. The result is assigned back
# explicitly because an inplace fillna on an attribute-accessed column is
# chained assignment and is not guaranteed to modify `data`.
# NOTE(review): 0 is an integer placed in a column of strings; consider a
# string placeholder if this column is encoded later -- confirm.
data['Embarked'] = data['Embarked'].fillna(0)

In [66]:
# Re-check the per-column missing-value counts after imputation.
for col, n_missing in data.isnull().sum().items():
    print(col, n_missing)

PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0


In [67]:
# Preview the data after the missing values have been filled.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,0,S


In [69]:
# Create a label encoder; it is not fitted or applied in this cell.
# NOTE(review): the name suggests it is meant for the 'Sex' column -- confirm.
sex_encoder = LabelEncoder()

In [27]:
# Hold out 10% of the rows for testing, with a fixed random_state for a repeatable split.
# NOTE(review): `iris` is not defined in any visible cell of this notebook, and
# this cell's execution count predates the Titanic cells -- confirm where it
# comes from, otherwise this fails on a fresh kernel.
xtrain,xtest,ytrain,ytest = train_test_split(iris.drop('target',axis=1),iris.target,test_size=0.1,random_state=401)

In [28]:
# Wrap each split (features plus labels) in an XGBoost DMatrix.
dtrain = xgb.DMatrix(data=xtrain,label=ytrain)
dtest = xgb.DMatrix(data=xtest,label=ytest)

In [40]:
# List the distinct target labels.
# NOTE(review): `iris` is not defined in any visible cell -- confirm its origin.
set(iris.target)

{0, 1, 2}