"""
This file holds all of the model not related to neural networks. New models can be added with relative ease by
making a new function and passing in use_previous_model and params. Params is a dictionary which can store any parameters
to be fed into the model. Currently, only classifiers are included, but regressors could easily be added in time.
Variables:
params: Dictionary - Holds parameters to set into models
use_previous_model: Boolean - Variable set in modgen_main. This boolean lets the program know if you are grabbing from
a previous index or if you are making new models.
is_classifier: Boolean - Determines if a classifier model will be made or a regressor
Classifier / Regressor Models:
Logistic Regression
Ridge
Lasso
Decision Tree
KNN
SVM
Ensembles:
Random Forest
Boosting Models:
AdaBoost
GradientBoost
LightGBM
XGBoost
"""
import pandas as pd
import numpy as np
from modgen_utils import *
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron, Ridge, Lasso
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
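
# The random-sampling helpers used throughout come from modgen_utils (wildcard
# import above). Their behavior is assumed here from how they are called in this
# file, not guaranteed by it:
#   getRandomNumber(low, high)                           -> random int in [low, high]
#   getRandomNumber(low, high, random_type='int')        -> random int in [low, high]
#   getRandomNumber(low, high, random_type='float')      -> random float in [low, high]
#   getRandomNumber(low, high, random_type='exp')        -> 10**k for an integer exponent k in [low, high]
#   getRandomNumber(low, high, random_type='exp_random') -> 10**k for a float exponent k in [low, high]
#   getRandomFromList(options)                           -> one element drawn from options
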
def getModelRidge(use_previous_model, params):
"""
Ridge model used as L2 Regularization. The model itself will keep the coef from blowing up or shrinking
down to a superlow or superhigh number causing massive overfitting of the data. Generally alpha will be
between 1e-15 and 1e-3. Can use all features at once, but computationally heavy as you add more features.
"""
if not use_previous_model:
params = {}
params['Model_type'] = 'ridge'
params['alpha'] = getRandomNumber(-15,2, random_type = 'exp')
    model = Ridge(alpha = params['alpha'], max_iter = 1000000, random_state = 0)
return params, model
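
# Illustrative call pattern shared by every getModel* function in this file
# (X_train / y_train are placeholders; the real call sites live in modgen_main):
#   params, model = getModelRidge(use_previous_model = False, params = {})
#   model.fit(X_train, y_train)
#   # Rebuild the identical model later from the stored dictionary:
#   params, model = getModelRidge(use_previous_model = True, params = params)
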
def getModelLasso(use_previous_model, params):
"""
Lasso model used as L1 regularization. The model will create more and more sparse coef by deciding which
features are important and ignorning the rest. The term lasso means lassoing up all the good
features and leaving out the bad ones.
"""
if not use_previous_model:
params = {}
params['Model_type'] = 'lasso'
params['alpha'] = getRandomNumber(-15,2, random_type = 'exp')
    model = Lasso(alpha = params['alpha'], max_iter = 1000000, random_state = 0)
return params, model
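
# The module docstring lists Logistic Regression among the models; a minimal
# sketch of that function in the same pattern is below. The C search range is an
# assumption, not a tuned value.
def getModelLogisticRegression(use_previous_model, params):
    """
    Logistic Regression classifier following the same (use_previous_model, params)
    convention as the other models in this file.
    """
    if not use_previous_model:
        params = {}
        params['Model_type'] = 'logisticregression'
        params['C'] = getRandomNumber(-4, 4, random_type = 'exp_random')
    model = LogisticRegression(C = params['C'], max_iter = 1000000, random_state = 0)
    return params, model
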
def getModelKNN(use_previous_model, params, is_classifier):
"""
K Nearest Neighbor Model used as an unsupervised classifier. Generating decision boundaries based off the
probability of each points to its nearest neighbors to determine class boundaries.
"""
if not use_previous_model:
params = {}
params['Model_type'] = 'knn'
params['n_neighbors'] = getRandomNumber(1,30)
params['p'] = getRandomNumber(1,2)
if is_classifier:
model = KNeighborsClassifier(n_neighbors = params['n_neighbors'], p = params['p'])
else:
model = KNeighborsRegressor(n_neighbors = params['n_neighbors'], p = params['p'])
return params, model

def getModelGradientBoosting(use_previous_model, params, is_classifier):
    """
    Gradient Boosting model that sequentially fits weak regression trees, each one
    correcting the errors of the ensemble built so far. Serves as the premise for
    many gradient-boosting-based algorithms.
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'gradboost'
params['learning_rate'] = getRandomNumber(-3,2, random_type = 'exp_random')
params['n_estimators'] = getRandomNumber(1,500)
params['max_depth'] = getRandomNumber(1,32)
params['min_samples_split'] = getRandomNumber(0.1,1, random_type = 'float')
params['min_samples_leaf'] = getRandomNumber(0.1,0.5, random_type = 'float')
if is_classifier:
model = GradientBoostingClassifier(learning_rate = params['learning_rate'],
n_estimators = params['n_estimators'],
max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'],
random_state = 0)
else:
model = GradientBoostingRegressor(learning_rate = params['learning_rate'],
n_estimators = params['n_estimators'],
max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'],
random_state = 0)
return params, model
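
# Example of a params dictionary this function can generate, given the ranges
# above (values illustrative):
#   {'Model_type': 'gradboost', 'learning_rate': 0.032, 'n_estimators': 240,
#    'max_depth': 7, 'min_samples_split': 0.35, 'min_samples_leaf': 0.21}
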
def getModelSVM(use_previous_model, params, is_classifier):
    """
    Support Vector Machine model for linear / kernel modeling. The parameters
    relevant to each kernel are handled per kernel choice. The kernel trick
    (rbf / poly) maps the data into a higher-dimensional space to separate
    non-linearly separable data.
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'svm'
        params['kernel'] = getRandomFromList(['linear', 'rbf', 'poly'])  # 'rbf' and 'poly' are the non-linear kernels
if params['kernel'] == 'rbf' or params['kernel'] == 'poly':
params['gamma'] = getRandomNumber(1,25) # For non-linear only [0.1 - 100]
else:
params['gamma'] = 'auto'
params['C'] = getRandomNumber(1,50) # [0.1 - 1000]
if params['kernel'] == 'poly':
params['degree'] = getRandomNumber(1,4) # Only used on POLY - [1,2,3,4]
else:
params['degree'] = 3
if is_classifier:
        model = SVC(kernel = params['kernel'], gamma = params['gamma'], C = params['C'], degree = params['degree'],
                    max_iter = 1000000, random_state = 0)
    else:
        model = SVR(kernel = params['kernel'], gamma = params['gamma'], C = params['C'], degree = params['degree'],
                    max_iter = 1000000)
return params, model

def getModelAdaBoostTree(use_previous_model, params, is_classifier):
    """
    AdaBoost with a Decision Tree base estimator. Uses the AdaBoost technique to
    boost weak decision-tree classifiers. Has shown good results in past models.
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'adaboost'
params['learning_rate'] = getRandomNumber(-3,1, random_type = 'exp_random')
params['n_estimators'] = getRandomNumber(2,300)
params['max_depth'] = getRandomNumber(1,100)
params['min_samples_split'] = getRandomNumber(0.1,1, random_type = 'float')
params['min_samples_leaf'] = getRandomNumber(0.1,0.5, random_type = 'float')
if is_classifier:
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0),
learning_rate = params['learning_rate'], n_estimators = params['n_estimators'],
random_state = 0)
else:
model = AdaBoostRegressor(DecisionTreeRegressor(max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0),
learning_rate = params['learning_rate'], n_estimators = params['n_estimators'],
random_state = 0)
return params, model

def getModelDecisionTree(use_previous_model, params, is_classifier):
    """
    Decision Tree model. Computationally very fast. Results have been promising
    (in the top 90%).
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'decisiontree'
params['max_depth'] = getRandomNumber(1,50)
params['min_samples_split'] = getRandomNumber(0.1,1, random_type = 'float')
params['min_samples_leaf'] = getRandomNumber(0,0.5, random_type = 'float')
if is_classifier:
model = DecisionTreeClassifier(max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0)
else:
model = DecisionTreeRegressor(max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0)
return params, model

def getModelRandomForest(use_previous_model, params, is_classifier):
    """
    Random Forest model: an ensemble of weaker decision tree classifiers (with
    large max_depth) combined into one strong classifier. n_estimators determines
    how many trees will be built.
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'randomforest'
params['n_estimators'] = getRandomNumber(2,500)
params['max_depth'] = getRandomNumber(1,50)
params['min_samples_split'] = getRandomNumber(0.1,1, random_type = 'float')
params['min_samples_leaf'] = getRandomNumber(0,0.5, random_type = 'float')
if is_classifier:
model = RandomForestClassifier(n_estimators = params['n_estimators'],
max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0)
else:
model = RandomForestRegressor(n_estimators = params['n_estimators'],
max_depth = params['max_depth'],
min_samples_split = params['min_samples_split'],
min_samples_leaf = params['min_samples_leaf'], random_state = 0)
return params, model

def getModelXGBoost(use_previous_model, params, is_classifier):
    """
    XGBoost gradient boosting model. The best results for this program have come
    from XGBoost. This is the sklearn-style wrapper, as opposed to the native API
    (which uses different parameter names).
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'xgboost'
params['learning_rate'] = getRandomNumber(-3,-1, random_type = 'exp_random')
params['min_child_weight'] = getRandomNumber(1,10)
params['max_depth'] = getRandomNumber(1,10)
params['gamma'] = getRandomNumber(0,20, random_type = 'int')
params['subsample'] = getRandomNumber(0.5,1, random_type = 'float')
params['colsample_bytree'] = getRandomNumber(0.5,1, random_type = 'float')
params['n_estimators'] = getRandomNumber(2,100)
if is_classifier:
objective_ = 'binary:logistic'
model = XGBClassifier(learning_rate = params['learning_rate'], min_child_weight = params['min_child_weight'],
max_depth = params['max_depth'], gamma = params['gamma'], subsample = params['subsample'],
colsample_bytree = params['colsample_bytree'], objective = objective_,
n_estimators = params['n_estimators'], random_state = 0)
else:
        objective_ = 'reg:squarederror'  # 'reg:linear' is the deprecated alias of this objective
model = XGBRegressor(learning_rate = params['learning_rate'], min_child_weight = params['min_child_weight'],
max_depth = params['max_depth'], gamma = params['gamma'], subsample = params['subsample'],
colsample_bytree = params['colsample_bytree'], objective = objective_,
n_estimators = params['n_estimators'], random_state = 0)
return params, model

def getModelLightGBM(use_previous_model, params, is_classifier):
    """
    Microsoft's LightGBM model via its sklearn API. Chooses one of three boosting
    types at random; to restrict the search to one type, remove the unwanted
    entries from the boosting-type list. The available types are 'dart' (dropout),
    'gbdt' (gradient boosting), and 'rf' (random forest). Early stopping is
    enabled, so n_estimators is set to 10k, but training usually stops well before
    10k iterations.
    """
if not use_previous_model:
params = {}
params['boosting_type'] = getRandomFromList(['dart', 'gbdt', 'rf'])
params['Model_type'] = 'lightgbm_' + params['boosting_type']
params['learning_rate'] = getRandomNumber(-3,-1, random_type = 'exp_random')
params['num_leaves'] = getRandomNumber(2,100)
params['min_child_samples'] = getRandomNumber(2,100)
params['max_depth'] = getRandomNumber(1,10)
params['reg_lambda'] = getRandomNumber(-9,3, random_type = 'exp_random')
params['colsample_bytree'] = getRandomNumber(0.5,1, random_type = 'float')
params['subsample'] = getRandomNumber(0.5,1, random_type = 'float')
params['subsample_freq'] = getRandomNumber(1,10)
params['n_estimators'] = 10000
if is_classifier:
objective_ = 'binary'
params['metric'] = ['binary_logloss','auc']
model = LGBMClassifier(boosting_type = params['boosting_type'], num_leaves = params['num_leaves'],
max_depth = params['max_depth'], learning_rate = params['learning_rate'],
n_estimators = params['n_estimators'], objective = objective_,
reg_lambda = params['reg_lambda'],colsample_bytree = params['colsample_bytree'],
min_child_samples = params['min_child_samples'], subsample = params['subsample'],
subsample_freq = params['subsample_freq'], random_state = 0)
else:
objective_ = 'regression'
params['metric'] = ['l2']
model = LGBMRegressor(boosting_type = params['boosting_type'], num_leaves = params['num_leaves'],
max_depth = params['max_depth'], learning_rate = params['learning_rate'],
n_estimators = params['n_estimators'], objective = objective_,
reg_lambda = params['reg_lambda'],colsample_bytree = params['colsample_bytree'],
min_child_samples = params['min_child_samples'], subsample = params['subsample'],
subsample_freq = params['subsample_freq'], random_state = 0)
return params, model
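
# The early stopping mentioned in the docstring has to be wired up at fit time.
# A minimal sketch, assuming a held-out validation split (X_val / y_val are
# placeholders; LightGBM >= 4.0 moved this keyword into callbacks):
#   model.fit(X_train, y_train, eval_set = [(X_val, y_val)],
#             eval_metric = params['metric'], early_stopping_rounds = 100)
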
def getModelNeuralNetwork(use_previous_model, params, is_classifier):
    """
    Sets the parameters for how many layers to make and the number of activation
    nodes per layer. The model is returned as None because NNModelSelector builds
    the network from these parameters.
    """
if not use_previous_model:
params = {}
params['Model_type'] = 'neuralnetwork'
params['epoch'] = getRandomNumber(15, 50, 'int')
params['kernel_initializer'] = getRandomFromList(['random_uniform', 'random_normal', 'glorot_normal',
'glorot_uniform'])
params['optimizer'] = getRandomFromList(['adam', 'rmsprop', 'sgd', 'adagrad'])
params['learning_rate'] = getRandomNumber(-3, 1, 'exp_random')
params['batch_size'] = getRandomFromList([2,4,8,16,32,64])
params['n_layers'] = getRandomNumber(1, 4, 'int')
for i in range(1, params['n_layers'] + 1):
params['n_layers_neurons_' + str(i)] = getRandomNumber(2, 12, 'int')
if is_classifier:
params['final_activation'] = 'sigmoid'
params['loss_optimize'] = 'binary_crossentropy'
else:
params['final_activation'] = None
params['loss_optimize'] = 'mean_squared_error'
# Model not instantiated like other models
model = None
return params, model
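
# A minimal sketch of how a builder such as NNModelSelector might consume these
# parameters (hypothetical; the real implementation lives in another module):
#   layer_sizes = [params['n_layers_neurons_' + str(i)]
#                  for i in range(1, params['n_layers'] + 1)]
#   # e.g. n_layers = 3 could give layer_sizes == [8, 4, 10]: one dense layer per
#   # entry, finished with params['final_activation'] and compiled against
#   # params['loss_optimize'] with params['optimizer'].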