In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [11]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

In [12]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.5142857142857142


In [13]:
lgb_params = {
    'boosting_type': 'dart',
    'objective': 'multiclass',
    'metric':'multi_logloss',
    'learning_rate': 0.3,
    'max_bin':255,
    'num_leaves': 31,  
    'max_depth': -1,  
    'subsample': 0.7,  
    'colsample_bytree': 0.7,  
    'min_split_gain': 0,  
    'verbose': 0,
    'nthread': 32,
    'num_class': 10
}

In [6]:
xgtrain = lgb.Dataset(train.values, y_train.values)
xgvalid = lgb.Dataset(test.values,y_test.values)

In [14]:
evals_results = {}
bst = lgb.train(lgb_params, 
                 xgtrain, 
                 valid_sets=[xgvalid], 
                 valid_names=['valid'], 
                 evals_result=evals_results, 
                 num_boost_round=10000000,
                 early_stopping_rounds=50,
                 verbose_eval=10, 
                 feval=None)

Training until validation scores don't improve for 50 rounds.
[10]	valid's multi_logloss: 1.06707
[20]	valid's multi_logloss: 0.954299
[30]	valid's multi_logloss: 0.920779
[40]	valid's multi_logloss: 0.907438
[50]	valid's multi_logloss: 0.899887
[60]	valid's multi_logloss: 0.892363
[70]	valid's multi_logloss: 0.88277
[80]	valid's multi_logloss: 0.884876
[90]	valid's multi_logloss: 0.87855
[100]	valid's multi_logloss: 0.875236
[110]	valid's multi_logloss: 0.870639
[120]	valid's multi_logloss: 0.871007
[130]	valid's multi_logloss: 0.871669
[140]	valid's multi_logloss: 0.868134
[150]	valid's multi_logloss: 0.870127
[160]	valid's multi_logloss: 0.864329
[170]	valid's multi_logloss: 0.876368
[180]	valid's multi_logloss: 0.876699
[190]	valid's multi_logloss: 0.882322
[200]	valid's multi_logloss: 0.887006
[210]	valid's multi_logloss: 0.883306
Early stopping, best iteration is:
[160]	valid's multi_logloss: 0.864329


In [15]:
pred = bst.predict(test.values)
print('accuracy score with lgb',accuracy_score(y_test,np.argmax(pred,axis=1)))

accuracy score with lgb 0.6724489795918367


In [16]:
from bayes_opt import BayesianOptimization

In [29]:
def lgb_bayes(num_leaves,min_child_samples,max_bin,subsample,colsample_bytree,
              min_child_weight,reg_alpha,reg_lambda):    

    lgb_params = {
        'boosting_type': 'dart',
        'objective': 'multiclass',
        'metric':'multi_logloss',
        'learning_rate': 0.3,
        'num_leaves': int(num_leaves),  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': int(min_child_samples),  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': int(max_bin),  # Number of bucketed bin for feature values
        'subsample': subsample,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': colsample_bytree,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': int(min_child_weight),  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': reg_alpha,  # L1 regularization term on weights
        'reg_lambda': reg_lambda,  # L2 regularization term on weights
        'verbose': 0,
        'nthread': 32,
        'num_class': 10
    }
    

    evals_results = {}
    bst = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgvalid], 
                     valid_names=['valid'], 
                     evals_result=evals_results, 
                     num_boost_round=10000000,
                     early_stopping_rounds=20,
                     verbose_eval=0, 
                     feval=None)
    pred = bst.predict(test.values)
    score = accuracy_score(y_test,np.argmax(pred,axis=1))
    return score

In [31]:
tun = BayesianOptimization(lgb_bayes, {'num_leaves': (2, 120),

                                            'min_child_samples': (1, 200),
                                            'max_bin': (50, 300),
                                            'subsample': (0.4, 0.95),
                                            'colsample_bytree': (0.4, 0.95),
                                            'min_child_weight': (0.0, 200),
                                            'reg_alpha' : (0., 2.),
                                            'reg_lambda' : (0., 2.)

                                            })

tun.maximize(init_points=5, n_iter=1000)


[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_samples |   min_child_weight |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
    1 | 02m11s | [35m   0.58367[0m | [32m            0.4088[0m | [32m 175.6634[0m | [32m             3.7142[0m | [32m           48.6943[0m | [32m     88.8684[0m | [32m     1.2705[0m | [32m      0.8544[0m | [32m     0.4977[0m | 
    2 | 00m54s |    0.57041 |             0.7573 |  130.7221 |            174.3827 |            91.0525 |      21.5459 |      1.6258 |       0.0213 |      0.4896 | 
    3 | 01m32s |    0.54898 |             0.7021 |  236.4664 |            105.4245 |           107.2157 |      27.4664 |      1.4100 |       1.4442 |      0.5360 | 
    4 | 00m51s |    0.52755 |             0.6419 |  158.0225 |

   44 | 00m28s |    0.66633 |             0.6418 |  136.2431 |             23.5774 |             2.3626 |     119.7165 |      0.9820 |       0.0363 |      0.5126 | 
   45 | 00m28s |    0.63367 |             0.4699 |   57.2817 |             84.6860 |             0.9764 |      67.0502 |      0.4290 |       0.8368 |      0.9236 | 
   46 | 00m25s |    0.52449 |             0.5876 |   50.2915 |             90.8556 |           195.2913 |     110.4419 |      0.0164 |       0.3062 |      0.4443 | 
   47 | 00m31s |    0.68061 |             0.7376 |  105.2942 |              3.4096 |             4.8677 |      61.8661 |      0.1827 |       0.6878 |      0.5159 | 
   48 | 00m27s |    0.50918 |             0.6726 |  299.4394 |            134.7921 |           132.7859 |       2.0763 |      0.1597 |       1.4336 |      0.7769 | 
   49 | 00m26s |    0.56735 |             0.9089 |  169.6888 |            196.6905 |             7.9890 |       8.7655 |      0.1931 |       0.9641 |      0.5211 | 
   50 | 00

   94 | 00m43s |    0.66531 |             0.4622 |  268.2311 |              2.5965 |            21.5733 |      58.1658 |      0.0176 |       0.5708 |      0.8552 | 
   95 | 00m42s |    0.52449 |             0.8440 |  133.5485 |            198.9215 |           197.7766 |      40.8125 |      0.0907 |       0.6835 |      0.4166 | 
   96 | 00m43s |    0.68163 |             0.4000 |  130.5368 |              1.2666 |             0.0000 |      53.2642 |      0.0000 |       0.0497 |      0.4000 | 
   97 | 00m45s |    0.62551 |             0.8868 |  181.5568 |             67.3524 |            35.7761 |      54.7846 |      0.0336 |       0.4728 |      0.4634 | 
   98 | 00m49s | [35m   0.68980[0m | [32m            0.5167[0m | [32m 282.4470[0m | [32m             1.8754[0m | [32m            0.0804[0m | [32m     52.9685[0m | [32m     0.3356[0m | [32m      0.2302[0m | [32m     0.4322[0m | 
   99 | 00m46s |    0.65102 |             0.5249 |  233.1116 |             69.3481 |          

  143 | 01m13s |    0.53980 |             0.4160 |  107.1205 |            196.3733 |           136.1609 |     117.7411 |      0.1676 |       1.8246 |      0.5085 | 
  144 | 01m13s |    0.55510 |             0.4800 |  180.5419 |            199.4399 |           123.2394 |      55.0945 |      0.2795 |       0.5796 |      0.4020 | 
  145 | 01m05s |    0.63571 |             0.6951 |  117.9178 |              1.3366 |            25.9573 |     119.4089 |      0.5377 |       1.0252 |      0.5527 | 
  146 | 01m09s |    0.62245 |             0.4902 |  295.3740 |            112.4883 |             1.5596 |      35.3287 |      0.1241 |       0.0795 |      0.5863 | 
  147 | 01m18s |    0.66122 |             0.4046 |   67.6867 |             56.9573 |             0.3304 |      77.0342 |      0.1242 |       0.7851 |      0.6029 | 
  148 | 01m09s |    0.55816 |             0.6551 |   50.9356 |            115.1749 |             1.5735 |       6.7842 |      0.4319 |       0.1622 |      0.4562 | 
  149 | 01

  193 | 01m27s |    0.55408 |             0.5757 |  299.8345 |             88.4034 |           126.7781 |      60.4478 |      0.2403 |       0.7971 |      0.5086 | 
  194 | 01m27s |    0.52041 |             0.7557 |  299.5545 |            108.5816 |           199.3161 |       3.3173 |      0.4146 |       0.7024 |      0.4238 | 
  195 | 01m35s |    0.66020 |             0.8362 |  237.5168 |             38.4652 |            22.1700 |      59.4551 |      0.0486 |       0.4658 |      0.6317 | 
  196 | 01m31s |    0.63878 |             0.5658 |  227.7923 |            102.2656 |             2.1116 |      87.9206 |      0.0044 |       0.5546 |      0.8396 | 
  197 | 01m31s |    0.54490 |             0.4467 |   83.3216 |            111.8644 |           114.5913 |      71.4654 |      0.2646 |       0.8048 |      0.7659 | 
  198 | 01m38s |    0.52347 |             0.4460 |   91.3627 |            121.7750 |            58.4924 |       3.2316 |      0.1273 |       0.2789 |      0.6733 | 
  199 | 01

  243 | 01m47s |    0.51429 |             0.4235 |   54.3819 |              3.7577 |           199.7066 |      64.1017 |      1.0644 |       0.4243 |      0.5376 | 
  244 | 01m46s |    0.59388 |             0.9493 |   54.5552 |            199.1517 |            45.7771 |      46.4377 |      0.3668 |       1.2729 |      0.9223 | 
  245 | 01m46s |    0.58469 |             0.4715 |   50.5176 |            155.4568 |            34.7913 |      19.2778 |      0.0123 |       0.5636 |      0.7126 | 
  246 | 01m46s |    0.51939 |             0.7179 |  227.0920 |            199.9284 |           197.2610 |      33.7002 |      0.1248 |       1.0443 |      0.5270 | 
  247 | 01m54s |    0.64388 |             0.5328 |  191.6202 |             16.1798 |            18.3663 |      62.7060 |      0.1851 |       1.6416 |      0.7480 | 
  248 | 01m48s |    0.55408 |             0.4919 |  182.9764 |            102.6925 |           130.4522 |     119.6609 |      0.0071 |       1.8034 |      0.5723 | 
  249 | 01

  293 | 02m27s |    0.55714 |             0.4868 |  194.8352 |            101.4116 |            96.0027 |      54.4962 |      0.1676 |       1.9388 |      0.5427 | 
  294 | 02m21s |    0.68571 |             0.9136 |  268.5901 |              1.8069 |             0.1214 |      60.0085 |      0.0142 |       1.8984 |      0.7758 | 
  295 | 02m18s |    0.66327 |             0.8996 |  283.2564 |             43.6850 |             1.9334 |     118.4479 |      0.4726 |       0.9057 |      0.4035 | 
  296 | 02m33s |    0.58265 |             0.8230 |  206.0934 |             40.4404 |            65.2279 |     117.5125 |      0.0597 |       1.8311 |      0.8639 | 
  297 | 02m25s |    0.67755 |             0.4573 |  107.7342 |             26.5684 |             0.6830 |      35.9204 |      0.1357 |       1.2528 |      0.7455 | 
  298 | 02m47s |    0.60306 |             0.6372 |  151.6490 |            159.5078 |             4.8972 |      57.4313 |      0.0406 |       1.9045 |      0.8876 | 
  299 | 02

  343 | 03m03s |    0.59490 |             0.4357 |  137.3010 |            123.5105 |            32.5009 |     119.3899 |      0.3628 |       1.4449 |      0.6947 | 
  344 | 02m47s |    0.50918 |             0.9467 |  299.5411 |              5.6516 |           154.3074 |       2.3055 |      0.6919 |       1.9702 |      0.4841 | 
  345 | 02m59s |    0.68061 |             0.6124 |  176.9386 |              1.0561 |             2.3173 |      48.9502 |      0.0189 |       1.5737 |      0.6614 | 
  346 | 03m17s |    0.51939 |             0.6673 |   52.6622 |             44.3975 |           199.6081 |      11.6896 |      0.1331 |       0.0261 |      0.9343 | 
  347 | 03m07s |    0.68776 |             0.9328 |  179.8444 |             17.3682 |             2.2036 |      63.9757 |      0.0472 |       0.1403 |      0.4489 | 
  348 | 03m15s |    0.65714 |             0.7083 |  204.7182 |             40.5090 |            28.2270 |     117.9528 |      0.0383 |       0.8580 |      0.4118 | 
  349 | 03

  393 | 03m47s |    0.56224 |             0.7017 |  218.2412 |            198.8615 |             0.7682 |      27.5117 |      0.2417 |       1.6693 |      0.6014 | 
  394 | 04m07s |    0.51020 |             0.4000 |  175.3709 |            200.0000 |           200.0000 |       2.0000 |      2.0000 |       2.0000 |      0.4000 | 
  395 | 04m07s |    0.68061 |             0.7130 |  247.7232 |             13.4073 |             1.1671 |      91.5782 |      0.5218 |       0.9059 |      0.5953 | 


KeyboardInterrupt: 

In [16]:
original_score = 0.5163265306122449
best_score = 0.69286
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement compare to  logistic regression is {} %'.format(improvement))

overall improvement compare to  logistic regression is 34.19 %
