In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from random import randint,uniform
from sklearn.model_selection import train_test_split
from time import time

In [2]:
popSize = 30
eliteSize = 0.1
mutation_rate = 0.2
nGeneration = 100

In [3]:
paramList = ['depth','max_bin','eta','gamma','min_child_weight','lamda','alpha','colsample_bytree','subsample',\
            'fitness']
population = pd.DataFrame(np.zeros(shape=(popSize,len(paramList))),columns=paramList)
population.depth = [randint(6,15) for p in range(0,popSize)]
population.max_bin = [randint(80,150) for p in range(0,popSize)]
population.eta = [uniform(0.1,1) for p in range(0,popSize)]
population.gamma = [uniform(0.01,0.03) for p in range(0,popSize)]
population.min_child_weight = [randint(1,20) for p in range(0,popSize)]
population.lamda = [uniform(0.1,0.95) for p in range(0,popSize)]
population.alpha = [uniform(0.1,0.95) for p in range(0,popSize)]
population.colsample_bytree = [uniform(0.7,0.95) for p in range(0,popSize)]
population.subsample = [uniform(0.7,0.95) for p in range(0,popSize)]
population.fitness = [uniform(100,100) for p in range(0,popSize)]

In [4]:
population.head()

Unnamed: 0,depth,max_bin,eta,gamma,min_child_weight,lamda,alpha,colsample_bytree,subsample,fitness
0,13,139,0.799857,0.026391,16,0.214907,0.150815,0.87982,0.784375,100.0
1,9,104,0.344592,0.015329,1,0.283432,0.204148,0.865601,0.86765,100.0
2,6,86,0.826849,0.012918,3,0.406222,0.513127,0.786941,0.715817,100.0
3,9,108,0.354926,0.018907,3,0.216761,0.744156,0.814789,0.721447,100.0
4,12,132,0.31228,0.010006,15,0.265752,0.597981,0.711948,0.868156,100.0


In [5]:
population.shape

(30, 10)

In [36]:
def createNewPopulation(population,eliteSize=eliteSize,mutation_rate=mutation_rate,popSize=popSize):
    population.sort_values(['fitness'],ascending=True,inplace=True)
    population.reset_index(drop=True,inplace=True)
    popSize = population.shape[0]
    nElite = int(round(eliteSize*popSize))
    
    new_population = population.copy(deep=True)
    for i in range(nElite,popSize):
        # create father and mother
        p1 = randint(0,int(popSize/2))
        p2 = randint(0,int(popSize/2))
        
        for attr in list(new_population.columns.values):
            if(uniform(0,1)>0.5):
                new_population.loc[i,attr] = population.loc[p1,attr]
            else:
                new_population.loc[i,attr] = population.loc[p2,attr]
                
            if(uniform(0,1)<mutation_rate):
                attr = list(new_population.columns.values)[randint(0,new_population.shape[1]-2)]
                if(attr=='depth'):
                    new_population.loc[i,attr] = max(3,new_population.loc[i,attr]+randint(-2,2))
                elif(attr=='max_bin'):
                    new_population.loc[i,attr] = max(70,new_population.loc[i,attr]+randint(-20,20))
                elif(attr=='eta'):
                    new_population.loc[i,attr] = max(0.1,new_population.loc[i,attr]+uniform(-0.05,0.05))
                elif(attr=='gamma'):
                    new_population.loc[i,attr] = max(0.1,new_population.loc[i,attr]+uniform(-0.005,0.005))
                elif(attr=='min_child_weight'):
                    new_population.loc[i,attr] = max(0,new_population.loc[i,attr]+randint(-2,2))
                elif(attr=='lamda'):
                    new_population.loc[i,attr] = min(max(0.1,new_population.loc[i,attr]+uniform(-0.05,0.05)),1)
                elif(attr=='alpha'):
                    new_population.loc[i,attr] = min(max(0.1,new_population.loc[i,attr]+uniform(-0.05,0.05)),1)
                elif(attr=='colsample_bytree'):
                    new_population.loc[i,attr] = min(max(0.6,new_population.loc[i,attr]+uniform(-0.05,0.05)),1)
                elif(attr=='subsample'):
                    new_population.loc[i,attr] = min(max(0.6,new_population.loc[i,attr]+uniform(-0.05,0.05)),1)
    return new_population

In [38]:
def testInstance(population,i,dtrain):
    params = {
        'objective':'reg:linear',
        'tree_method':'hist',
        'eta':population.eta[i],
        'max_depth':population.depth[i],
        'subsample':population.subsample[i],
        'colsample_bytree':population.colsample_bytree[i],
        'max_bin':population.max_bin[i],
        'lambda':population.lamda[i],
        'alpha':population.alpha[i],
        'gamma':population.gamma[i],
        'min_child_weight':population.min_child_weight[i],
        'silent':1,
        'seed':401
    }
    history = xgb.cv(
        params,
        dtrain,
        num_boost_round = 30,
        nfold=3,
        verbose_eval = False,
        metrics = 'mae'
        )
    return history['test-mae-mean'].iloc[-1]

In [18]:
def printResult(filename,population,i,generation):
    f1 = open(filename,'a')
    f1.write('Generation %d Best fitness %f\n' % (generation,population.fitness[i]))
    f1.write('"eta":%f\n' % population.eta[i])
    f1.write('"max_depth":%f\n' %population.depth[i])
    f1.write('"subsample":%f\n' %population.subsample[i])
    f1.write('"colsample_bytree":%f\n' %population.colsample_bytree[i])
    f1.write('"lamdba":%f\n' %population.lamda[i])
    f1.write('"alpha":%f\n' %population.alpha[i])
    f1.write('"min_child_weight":%f\n' %population.min_child_weight[i])
    f1.write('"max_bin":%f\n' %population.max_bin[i])
    f1.close()

In [39]:
def evolve(dtrain,nGeneration=nGeneration,popSize=popSize,eliteSize=eliteSize,population=population):
    for gen in range(nGeneration):
        print('Generation %d\n' %gen)
        population = createNewPopulation(population)
        nElite = int(round(eliteSize*popSize))
        for i in range(nElite,popSize):
            print('testing instance %d' %i)
            population.loc[i,'fitness'] = testInstance(population,i,dtrain)
            print('--Fitness %f \n' %population.fitness[i])
        population.sort_values(['fitness'],ascending=True,inplace=True)
        population.reset_index(drop=True,inplace=True)
        printResult('xgb_result.txt',population,0,gen)
        print('Generation %d Best fitness (3-fold mae cv): %f' %(gen,population.fitness[0]))

In [20]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [21]:
train_data.head()

Unnamed: 0,StateHoliday,Store,DayOfWeek,Open,Promo,SchoolHoliday,Year,Month,DayOfMonth,DayOfYear,...,ThisWeekStholi,NextWeekStholi,LastWeekScholi,ThisWeekScholi,NextWeekScholi,PrevQSalesMed,PrevYSalesMed,Assortment,PromoInterval,Sales
0,0,1,5,1.0,1,1,2015,7,31,212,...,0,0,0,0,0,4111.0,4327.0,0,3,5263.0
1,0,1,4,1.0,1,1,2015,7,30,211,...,0,0,0,0,0,4111.0,4327.0,0,3,5020.0
2,0,1,3,1.0,1,1,2015,7,29,210,...,0,0,0,0,0,4111.0,4327.0,0,3,4782.0
3,0,1,2,1.0,1,1,2015,7,28,209,...,0,0,0,1,0,4111.0,4327.0,0,3,5011.0
4,0,1,1,1.0,1,1,2015,7,27,208,...,0,0,0,0,1,4111.0,4327.0,0,3,6102.0


In [22]:
test_data.head()

Unnamed: 0,Id,StateHoliday,Store,DayOfWeek,Open,Promo,SchoolHoliday,Year,Month,DayOfMonth,...,LastWeekStholi,ThisWeekStholi,NextWeekStholi,LastWeekScholi,ThisWeekScholi,NextWeekScholi,PrevQSalesMed,PrevYSalesMed,Assortment,PromoInterval
0,1,0,1,4,1.0,1,0,2015,9,17,...,0,0,1,0,0,1,4111.0,4327.0,0,3
1,2,0,3,4,1.0,1,0,2015,9,17,...,0,0,1,0,0,1,6253.0,5675.0,0,1
2,3,0,7,4,1.0,1,0,2015,9,17,...,0,0,1,0,0,1,8168.0,7925.0,2,3
3,4,0,8,4,1.0,1,0,2015,9,17,...,0,0,1,0,0,1,5752.0,4888.0,0,3
4,5,0,9,4,1.0,1,0,2015,9,17,...,0,0,1,0,0,1,6937.0,6109.0,2,3


In [52]:
no_zeros = train_data[train_data.Sales!=0]
no_zeros.shape

(844338, 28)

In [55]:
X = train_data[train_data.Sales!=0].drop('Sales',axis=1)
y = np.log1p(train_data[train_data.Sales!=0].Sales)

In [57]:
xtrain,xval,ytrain,yval = train_test_split(X,y,test_size=0.049,random_state=40)
xtrain.shape,xval.shape

((802965, 27), (41373, 27))

In [64]:
def rmspe(yvalid,preds):
    sums = 0
    for y,yhat in zip(yvalid,preds):
        sums+=((y-yhat)/y)**2
    return np.sqrt(sums/len(yvalid))

In [65]:
dtrain = xgb.DMatrix(xtrain,ytrain)
dvalid = xgb.DMatrix(xval,yval)

In [67]:
param = {
        'eta':0.35,'max_depth':16,'subsample':0.9,'colsample_bytree':0.9,'lamdba':0.3,'alpha':0.8,\
        'min_child_weight':3,'num_boost_round':100,'objective':'reg:linear','booster':'gbtree',\
        'tree_method':'hist','max_bin':150
    }
start = time()
bst = xgb.train(param,dtrain,30)
preds = np.expm1(bst.predict(dvalid))
print(rmspe(np.expm1(yval),preds))
time()-start

0.512388575074


19.34966468811035

In [53]:
start = time()
print(testInstance(population,1,dtrain))
time() - start

0.0794623333333


25.085363149642944

In [28]:
xgb.__version__

'0.7'

In [42]:
evolve(dtrain)

Generation 0

testing instance 3
--Fitness 0.076392 

testing instance 4
--Fitness 0.097412 

testing instance 5
--Fitness 0.075875 

testing instance 6
--Fitness 0.080374 

testing instance 7
--Fitness 0.116978 

testing instance 8
--Fitness 0.092855 

testing instance 9
--Fitness 0.085979 

testing instance 10
--Fitness 0.091703 

testing instance 11
--Fitness 0.084807 

testing instance 12
--Fitness 0.090659 

testing instance 13
--Fitness 0.086307 

testing instance 14
--Fitness 0.079978 

testing instance 15
--Fitness 0.089943 

testing instance 16
--Fitness 0.086818 

testing instance 17
--Fitness 0.083096 

testing instance 18
--Fitness 0.077412 

testing instance 19
--Fitness 0.090244 

testing instance 20
--Fitness 0.073672 

testing instance 21
--Fitness 0.083982 

testing instance 22
--Fitness 0.067914 

testing instance 23
--Fitness 0.079584 

testing instance 24
--Fitness 0.070516 

testing instance 25
--Fitness 0.101354 

testing instance 26
--Fitness 0.091831 

testing i

KeyboardInterrupt: 

In [43]:
ytrain.head()

720672    8.718991
857309    8.819961
185503    9.447623
964085    8.682369
794950    8.875706
Name: Sales, dtype: float64

In [44]:
ytrain.any(0)

True

In [45]:
from collections import Counter

In [49]:
c = Counter(ytrain)
c[0]

165592

In [51]:
ytrain.shape[0]-c[0]

809911