In [13]:
from multiprocessing.dummy import Pool as ThreadPool
import numpy as np
import pandas as pd
import pickle
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings and
# metric warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [18]:
class GeneticAlgSelect():
    """Genetic-algorithm feature selector.

    Every individual in the pool is a ``GeneticAlgLocalModel`` whose binary
    ``gene`` marks which columns of ``data_in`` its model is trained on.
    Each iteration trains the pool, records the best-scoring individuals as
    elites, then applies crossover and mutation to the non-elite individuals
    and pickles the pool.

    Parameters
    ----------
    data_in : array indexed as ``data_in[:, mask]`` (one column per feature).
    data_out : targets, handed unchanged to every local model.
    mdl_type : callable; the local model instantiates it as ``mdl_type(**mdl_para)``.
    mdl_para : dict of keyword arguments for ``mdl_type``.
    **para : optional settings, each passed as its own keyword:
        ``num_worker`` (4), ``max_iter`` (5), ``pool_size`` (150),
        ``mutateLR`` (0.1), ``mutateUR`` (0.2), ``mutateBIT`` (0.1),
        ``crossLR`` (0.1), ``crossUR`` (0.2), ``elite_rate`` (0.02),
        ``savefile`` ('GeneticAlgResult.p').
        Note that a single ``para={...}`` keyword is stored under the key
        ``'para'`` and therefore does not override any of these settings.
    """

    def __init__(self, data_in, data_out, mdl_type, mdl_para, **para):
        # input/output data and raw option dict
        self.data_in = data_in
        self.data_out = data_out
        self.para = para

        # model class and its constructor arguments
        self.mdl_type = mdl_type
        self.mdl_para = mdl_para

        # model pool and indices of the current elites
        self.mdl_pool = []
        self.elite_list = []

        # resolve options against their defaults
        self.__set_constant()

    def __set_constant(self):
        """Copy every supported option from ``self.para``, falling back to its default."""
        opt = self.para.get
        self.num_worker = opt('num_worker', 4)
        self.max_iter = opt('max_iter', 5)
        self.pool_size = opt('pool_size', 150)
        self.mutateLR = opt('mutateLR', 0.1)
        self.mutateUR = opt('mutateUR', 0.2)
        self.mutateBIT = opt('mutateBIT', 0.1)
        self.crossLR = opt('crossLR', 0.1)
        self.crossUR = opt('crossUR', 0.2)
        self.eliteR = opt('elite_rate', 0.02)
        self.savefileName = opt('savefile', 'GeneticAlgResult.p')

    def _perform_iter(self):
        """Run ``max_iter`` generations: train, select elites, cross over, mutate, save.

        Models are trained sequentially; ``num_worker`` is currently not used
        by this method.
        """
        self.__gen_pool()
        for iteration in range(self.max_iter):
            print("Iteration :", iteration)

            # Distinct loop variable: the original reused `i` for both loops.
            for mdl_idx in tqdm.tqdm(range(self.pool_size)):
                self._train_mdl(mdl_idx)

            # eliticism, cross_over and mutation
            self.__eliticism()
            self.__cross_over()
            self.__mutation()
            self._save_mdl()

    def __gen_pool(self):
        """Fill the pool up to ``pool_size`` with random, non-empty genes.

        Only the missing individuals are created, so calling ``_perform_iter``
        again continues with the existing pool instead of appending another
        ``pool_size`` unused models.

        Raises
        ------
        ValueError
            If ``data_in`` has no feature columns (no non-empty gene exists).
        """
        shape = self.data_in.shape
        self.gene_len = shape[0] if len(shape) == 1 else shape[1]
        if self.gene_len == 0:
            raise ValueError("data_in has no feature columns to select from")

        for _ in range(len(self.mdl_pool), self.pool_size):
            # randint's upper bound is exclusive, so (0, 2) draws from {0, 1};
            # this replaces the deprecated np.random.random_integers(0, 1).
            gene = np.random.randint(0, 2, size=self.gene_len)
            while not gene.any():
                gene = np.random.randint(0, 2, size=self.gene_len)
            self.mdl_pool.append(GeneticAlgLocalModel(gene, self.mdl_type, self.mdl_para))

    def _train_mdl(self, i):
        """Train pool member ``i`` on the columns its gene selects.

        A gene with no selected column (possible after crossover or mutation)
        cannot be trained; it is given score 0 and marked as up to date.
        """
        mdl = self.mdl_pool[i]
        mask = mdl.gene == 1
        if not mask.any():
            mdl.score = 0
            mdl.changed = False
            return
        mdl._train_mdl(self.data_in[:, mask], self.data_out)

    def _non_elite_indices(self):
        """Return the pool indices that are not in ``elite_list``."""
        elites = np.asarray(self.elite_list, dtype=int)
        return np.setdiff1d(np.arange(self.pool_size), elites)

    @staticmethod
    def _swap_segment(gene_a, gene_b, start, stop):
        """Exchange ``[start:stop]`` between two gene arrays in place.

        NumPy slices are views, so a tuple swap of slices would overwrite
        ``gene_a`` first and then copy the overwritten values back into
        ``gene_b``; one side is copied explicitly to avoid that.
        """
        saved_a = gene_a[start:stop].copy()
        gene_a[start:stop] = gene_b[start:stop]
        gene_b[start:stop] = saved_a

    def __eliticism(self):
        """Store the indices of the top ``round(pool_size * eliteR)`` models in ``elite_list``."""
        score_arr = np.array(
            [self.mdl_pool[i].score for i in range(self.pool_size)], dtype=float
        )
        num_elite = int(round(self.pool_size * self.eliteR))
        num_elite = max(0, min(num_elite, self.pool_size))
        print("No of elites :", num_elite)
        print("max score :", max(score_arr))

        # Slice from an explicit start: `[-0:]` would select the whole pool.
        self.elite_list = np.argsort(score_arr)[self.pool_size - num_elite:]

    def __cross_over(self):
        """Swap a random gene segment between random pairs of non-elite models."""
        min_c, max_c = round(self.pool_size*self.crossLR), round(self.pool_size*self.crossUR+1)
        num_cross = np.random.randint(min_c, high=max_c)

        candidates = self._non_elite_indices()
        if candidates.size < 2:
            # Fewer than two non-elite models: no pair can be formed.
            return

        for _ in tqdm.tqdm(range(num_cross)):
            # two distinct non-elite models, chosen uniformly
            idxA, idxB = np.random.choice(candidates, size=2, replace=False)

            # cross-over segment [pt_s, pt_e); may be empty when both draws coincide
            pt_s = np.random.randint(0, high=self.gene_len)
            pt_e = np.random.randint(0, high=self.gene_len)
            if pt_s > pt_e:
                pt_s, pt_e = pt_e, pt_s

            self._swap_segment(self.mdl_pool[idxA].gene, self.mdl_pool[idxB].gene, pt_s, pt_e)
            self.mdl_pool[idxA].changed = True
            self.mdl_pool[idxB].changed = True

    def __mutation(self):
        """Flip randomly chosen bits in randomly chosen non-elite models."""
        min_m, max_m = round(self.pool_size*self.mutateLR), round(self.pool_size*self.mutateUR+1)
        num_mute = np.random.randint(min_m, high=max_m)
        print("No. of mutation ",num_mute)
        bit_mute = round(self.mutateBIT*self.gene_len)

        candidates = self._non_elite_indices()
        if candidates.size == 0:
            # Every model is elite: nothing may be mutated.
            return

        for _ in range(num_mute):
            idx = np.random.choice(candidates)
            # Positions are drawn with replacement; a repeated position is
            # still flipped only once by the fancy-index assignment below.
            pts = np.random.randint(0, high=self.gene_len, size=bit_mute)
            gene = self.mdl_pool[idx].gene
            gene[pts] = 1 - gene[pts]
            self.mdl_pool[idx].changed = True

    def print_best_mdl(self):
        """Print the highest score among the current elite models."""
        if len(self.elite_list) == 0:
            print("No elite models available; run _perform_iter() first.")
            return
        score_arr = np.array(
            [self.mdl_pool[idx].score for idx in self.elite_list], dtype=float
        )
        print("Max score ", max(score_arr))

    def _save_mdl(self):
        """Pickle the whole model pool to ``savefileName``."""
        # Context manager guarantees the file is flushed and closed.
        with open(self.savefileName, "wb") as out_file:
            pickle.dump(self.mdl_pool, out_file)

In [4]:
# local model structure for genetic algorithm that stores local model and gene structure
class GeneticAlgLocalModel():
    """One individual of the genetic pool: a feature mask plus its own model.

    Attributes
    ----------
    gene : the mask array given by the caller.
    mdl : model instance created as ``mdl_type(**mdl_para)``.
    changed : True when the gene was modified since the last training.
    score : AUC from the last training, 0 before any training.
    """

    def __init__(self, gene, mdl_type, mdl_para):
        self.gene = gene
        self.mdl = mdl_type(**mdl_para)
        self.changed = True
        self.score = 0

    def _train_mdl(self, data_in, data_out):
        """Fit the model and score it, but only if the gene changed.

        Only the first split of a 5-fold ``KFold`` is used: the model is fit
        on that split's training rows and scored by the AUC of the ROC curve
        built from ``predict`` on its test rows.

        Parameters
        ----------
        data_in : feature matrix already restricted to the selected columns.
        data_out : targets aligned row by row with ``data_in``.
        """
        if not self.changed:
            return

        # Positional arrays: indexing a pandas Series with fold indices is
        # label-based and breaks for any index other than 0..n-1.
        data_in = np.asarray(data_in)
        data_out = np.asarray(data_out)

        # Take just the first fold instead of iterating over all five.
        train_idx, test_idx = next(KFold(n_splits=5).split(data_in))

        self.mdl.fit(data_in[train_idx, :], data_out[train_idx])
        fpr, tpr, _ = roc_curve(data_out[test_idx], self.mdl.predict(data_in[test_idx]))
        score = auc(fpr, tpr)

        # A NaN score would sort last in np.argsort and be picked as an elite
        # by the selector, so treat it as the worst score.
        self.score = 0 if np.isnan(score) else score
        self.changed = False

In [5]:
import zipfile

# Read train.csv straight out of the archive; the context managers close both
# the archive and the member file once the frame has been loaded.
with zipfile.ZipFile('./data/train.csv.zip') as zf:
    with zf.open('train.csv') as train_csv:
        train_df = pd.read_csv(train_csv)


In [6]:
# Feature matrix: every column except the label, as a NumPy array.
x_train = train_df.drop(columns=["TARGET"]).values
# Label column, kept as a pandas Series.
y_train = train_df["TARGET"]

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
# Model class each pool member instantiates, and the keyword arguments it is
# constructed with (the class is called as mdl(**mdl_para)).
mdl = DecisionTreeClassifier # alternatives: LogisticRegression, RandomForestClassifier
mdl_para = {} # e.g. {"n_estimators":100,"max_depth":5} for RandomForestClassifier

In [42]:
# Settings are read from individual keyword arguments (collected by **para).
# Passing them wrapped in a single `para={...}` dict stores them under the key
# 'para', so 'num_worker' was never found and the default was used instead.
ga = GeneticAlgSelect(x_train[:2000], y_train[:2000], mdl, mdl_para=mdl_para, num_worker=-1)

In [43]:
# Run the search: build the pool, then for each of `max_iter` iterations train
# the models, pick elites, apply crossover and mutation, and pickle the pool.
ga._perform_iter()

Iteration : 0




  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

  1%|█                                                                                 | 2/150 [00:00<00:08, 17.10it/s]

  3%|██▏                                                                               | 4/150 [00:00<00:08, 17.33it/s]

  5%|███▊                                                                              | 7/150 [00:00<00:07, 18.59it/s]

  7%|█████▍                                                                           | 10/150 [00:00<00:07, 19.94it/s]

  9%|███████                                                                          | 13/150 [00:00<00:06, 21.28it/s]

 11%|████████▋                                                                        | 16/150 [00:00<00:05, 22.53it/s]

 13%|██████████▎                                                                      | 19/150 [00:00<00:05, 23.17it/s]

 15%|███████████▉             

No of elites : 3
max score : 0.6503609276608816




  0%|                                                                                           | 0/18 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 3001.77it/s]

No. of mutation  21
Iteration : 1




  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

  3%|██▏                                                                               | 4/150 [00:00<00:04, 32.54it/s]

  7%|█████▍                                                                           | 10/150 [00:00<00:03, 37.30it/s]

 11%|████████▋                                                                        | 16/150 [00:00<00:03, 37.65it/s]

 13%|██████████▊                                                                      | 20/150 [00:00<00:03, 33.43it/s]

 15%|████████████▍                                                                    | 23/150 [00:00<00:04, 29.73it/s]

 19%|███████████████                                                                  | 28/150 [00:00<00:03, 33.18it/s]

 21%|█████████████████▎                                                               | 32/150 [00:00<00:03, 34.52it/s]

 29%|███████████████████████▏ 

No of elites : 3
max score : 0.6503609276608816




  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 3502.06it/s]

No. of mutation  27
Iteration : 2




  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

  5%|████▎                                                                             | 8/150 [00:00<00:02, 53.01it/s]

 15%|████████████▍                                                                    | 23/150 [00:00<00:02, 62.57it/s]

 19%|███████████████                                                                  | 28/150 [00:00<00:02, 55.30it/s]

 22%|█████████████████▊                                                               | 33/150 [00:00<00:03, 34.17it/s]

 25%|███████████████████▉                                                             | 37/150 [00:00<00:03, 35.36it/s]

 28%|██████████████████████▋                                                          | 42/150 [00:00<00:02, 38.50it/s]

 31%|█████████████████████████▍                                                       | 47/150 [00:01<00:03, 33.81it/s]

 34%|█████████████████████████

No of elites : 3
max score : 0.6503609276608816




  0%|                                                                                           | 0/15 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 2144.11it/s]

No. of mutation  23
Iteration : 3




  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

  3%|██▏                                                                               | 4/150 [00:00<00:03, 37.76it/s]

  7%|█████▍                                                                           | 10/150 [00:00<00:03, 42.13it/s]

 11%|████████▋                                                                        | 16/150 [00:00<00:03, 43.55it/s]

 15%|███████████▉                                                                     | 22/150 [00:00<00:03, 39.34it/s]

 17%|██████████████                                                                   | 26/150 [00:00<00:03, 33.57it/s]

 23%|██████████████████▎                                                              | 34/150 [00:00<00:03, 38.26it/s]

 25%|████████████████████▌                                                            | 38/150 [00:00<00:03, 37.16it/s]

 31%|████████████████████████▊

No of elites : 3
max score : 0.6555828597757642




  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 2308.96it/s]

No. of mutation  24
Iteration : 4




  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

  1%|█                                                                                 | 2/150 [00:00<00:11, 13.08it/s]

  3%|██▏                                                                               | 4/150 [00:00<00:10, 13.67it/s]

  7%|█████▉                                                                           | 11/150 [00:00<00:07, 17.85it/s]

 10%|████████                                                                         | 15/150 [00:00<00:06, 19.45it/s]

 12%|█████████▋                                                                       | 18/150 [00:00<00:06, 19.42it/s]

 14%|███████████▎                                                                     | 21/150 [00:00<00:06, 19.00it/s]

 17%|██████████████                                                                   | 26/150 [00:01<00:05, 21.31it/s]

 23%|██████████████████▎      

No of elites : 3
max score : 0.6555828597757642




  0%|                                                                                           | 0/17 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 5671.13it/s]

No. of mutation  16


In [50]:
from functools import reduce

In [44]:
# Print the highest score among the elite models of the last iteration.
ga.print_best_mdl()

Max score  0.6555828597757642


In [45]:
# Pool indices of the elite models, ordered by ascending score (from np.argsort).
ga.elite_list

array([101,  88,  89], dtype=int64)

In [46]:
# Collect the feature mask (gene) of every elite model.
gene_list = [ga.mdl_pool[elite_idx].gene for elite_idx in ga.elite_list]
    

In [59]:
# Union of the elite masks: 1 where at least one elite model uses the feature.
reduce(np.logical_or, gene_list).astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,