In [36]:
import pandas as pd
import numpy as np
import warnings
from hyperopt import tpe, hp, fmin
import hyperopt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import confusion_matrix, classification_report
from functools import reduce
import lightgbm as lgbm
warnings.filterwarnings("ignore")

In [19]:
data = pd.read_csv(r'C:\Users\админ\Desktop\bitw/data/winequality-red.csv', delimiter = ';')
target = data.quality.copy()
data.drop('quality', axis = 1, inplace = True)

In [20]:
def polynom(data, degree = 2):
    pf = PolynomialFeatures(degree = degree)
    poly = pf.fit_transform(data)
    return poly

In [21]:
def binning(data, bin = 4):
    matrix_binning = np.zeros_like(data)
    for i, row in enumerate(data.columns):
        bins = np.linspace(data[row].min(), data[row].max(), bin)
        matrix_binning[:, i] = np.digitize(data[row], bins = bins)
    
    encoder = OneHotEncoder(sparse = False)
    one_hot = encoder.fit_transform(matrix_binning)
    angle = np.hstack([one_hot * data[row][:,np.newaxis] for row in data.columns]) 
    return one_hot, angle

In [22]:
data['total sulfur dioxide'] = np.log(np.log(data['total sulfur dioxide']))
data['alcohol'] = np.log(data['alcohol'])
data['chlorides'] = np.log(data['chlorides'])
data['free sulfur dioxide'] = np.log(data['free sulfur dioxide'])
data['residual sugar'] = np.log(data['residual sugar'])

In [129]:
data.drop(['free sulfur dioxide', 'fixed acidity'], axis = 1, inplace = True)

In [130]:
#anomaly = [data[data['sulphates'] > 1.2], data[data['chlorides'] < -3]]
#anomaly_index = map(lambda x: list(x.index), anomaly)
#anomaly_index = reduce(lambda x, y: x + y, anomaly_index)
#anomaly_index = list(set(anomaly_index))

In [23]:
one_hot, angle = binning(data) 
#one_hot = np.delete(one_hot, anomaly_index, axis = 0)
#angle = np.delete(angle, anomaly_index, axis = 0)

In [24]:
poly = polynom(data)
#poly = np.delete(poly, anomaly_index, axis = 0)

In [133]:
#data.drop(anomaly_index, axis = 0, inplace = True)
#target.drop(anomaly_index, axis = 0, inplace = True)

In [25]:
data = np.hstack((data, one_hot, poly, angle))

In [26]:
kf = StratifiedKFold(n_splits = 6, shuffle = True, random_state = 1)
ss = StandardScaler()
data = ss.fit_transform(data)

In [None]:
sgd = SGDClassifier(random_state = 1)
parameters_grid = [
    {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty':['none']
    },
    {
     'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],   
    'penalty':['l1', 'l2'],
    'alpha': np.arange(0, .1, .01)[1:]
    }, 
    {
     'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],   
    'penalty':['elasticnet'],
    'alpha': np.arange(0, .1, .01)[1:],
    'l1_ratio': np.arange(0, .1, .01)[1:], 
    'learning_rate':['constant','optimal', 'invscaling'],
    'eta0':np.arange(0, .15, .05)[1:]
    }
    ]
grid_cv = GridSearchCV(sgd, parameters_grid, scoring = 'f1_macro', cv = kf, verbose = 1, n_jobs = -1, )
grid_cv.fit(data, target)

cla = grid_cv.best_estimator_
print(cla)

for ind1, ind2 in kf.split(data, target):
    cla.fit(data[ind1], np.array(target)[ind1])
    #print(np.array(target)[ind2], cla.predict(data[ind2]))
    print(pd.DataFrame(confusion_matrix(np.array(target)[ind2], cla.predict(data[ind2])), index = [3, 4, 5, 6, 7, 8],
                      columns = [3, 4, 5, 6, 7, 8])) 
    print(classification_report(np.array(target)[ind2], cla.predict(data[ind2])))

Fitting 6 folds for each of 2525 candidates, totalling 15150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 22.1min


In [None]:
scoring = ['accuracy', 'f1_macro']
score_cv = cross_validate(grid_cv.best_estimator_, data, target, cv  = 7, n_jobs = -1, verbose = 1, 
                        scoring = scoring)

score_kf6 = cross_validate(grid_cv.best_estimator_, data, target, cv  = kf, n_jobs = -1, verbose = 1, 
                        scoring = scoring)


score_kf7 = cross_validate(grid_cv.best_estimator_, data, target, 
                           cv  = StratifiedKFold(n_splits = 7, shuffle = True, random_state = 1), 
                           n_jobs = -1, verbose = 1, 
                        scoring = scoring)


print('metrics on KFold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_cv.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))


print('metrics on StKfold = 6 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf6.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

print('metrics on StKfold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf7.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

In [None]:
def objective(space):
    rf = RandomForestClassifier(random_state = 1, n_jobs = -1, verbose = 1,
                                **space)
    val = cross_val_score(rf, data, target, cv  = kf, n_jobs = -1, verbose = 1, 
                          scoring = 'f1_macro')
    return np.mean(1 - val)


space = {
    'class_weight': hp.choice('class_weight', [None,'balanced']),
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'n_estimators': hp.choice('n_estimators', np.arange(10, 100, 10, dtype = int)),
    'max_depth': hp.choice('max_depth', np.arange(5, 25, 5, dtype = int)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 11, 1, dtype = int)),
    'max_features': hp.choice('max_features', np.arange(2, data.shape[1], 1, dtype = int))
    }

best_classifier = fmin(objective, space,
                        algo = tpe.suggest, max_evals = 100)

rf = RandomForestClassifier(random_state = 1, n_jobs = -1, verbose = 1,
                                **hyperopt.pyll.stochastic.sample(space))



print(rf)


for ind1, ind2 in kf.split(data, target):
    rf.fit(data[ind1], np.array(target)[ind1])
    print(pd.DataFrame(confusion_matrix(np.array(target)[ind2], rf.predict(data[ind2])), 
                       index = [3, 4, 5, 6, 7, 8],
                      columns = [3, 4, 5, 6, 7, 8])) 
    print(classification_report(np.array(target)[ind2], rf.predict(data[ind2])))

In [29]:
scoring = ['accuracy', 'f1_macro']
score_cv = cross_validate(rf, data, target, cv  = 7, n_jobs = -1, verbose = 1, 
                        scoring = scoring)

score_kf6 = cross_validate(rf, data, target, cv  = kf, n_jobs = -1, verbose = 1, 
                        scoring = scoring)


score_kf7 = cross_validate(rf, data, target, 
                           cv  = StratifiedKFold(n_splits = 7, shuffle = True, random_state = 1), 
                           n_jobs = -1, verbose = 1, 
                        scoring = scoring)


print('metrics on KFold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_cv.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))


print('metrics on StKfold = 6 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf6.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

print('metrics on StKfold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf7.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:   23.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   18.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   18.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


metrics on KFold = 7 
 accuracy: 0.565625 
  f1: 0.273777
metrics on StKfold = 6 
 accuracy: 0.675272 
  f1: 0.370649
metrics on StKfold = 7 
 accuracy: 0.672208 
  f1: 0.371172


[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:   23.7s finished


In [34]:
rf.fit(data, target)
rf.feature_importances_ 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.1s finished


array([9.53738185e-04, 1.23009542e-04, 7.47916603e-04, 1.70906731e-04,
       3.78669836e-04, 1.24482997e-04, 1.04185445e-03, 5.81679865e-04,
       2.05634109e-04, 1.75269850e-04, 1.83931170e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.40891453e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.56144252e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.34312797e-05, 0.00000000e+00, 0.00000000e+00, 1.78852456e-05,
       0.00000000e+00, 7.71315244e-06, 0.00000000e+00, 0.00000000e+00,
       3.06072000e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.34122747e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.17372295e-05, 1.55084436e-05, 0.00000000e+00, 0.00000000e+00,
      

In [31]:
def objective(space):
    params = {
        'num_leaves': space['num_leaves'],
        'n_estimators': space['n_estimators'],
        'max_depth': space['max_depth'],
        'min_child_samples': space['min_child_samples'],
        'max_bin': space['max_bin']
    }
    clf = lgbm.LGBMClassifier(
        random_state = 1,
        n_jobs = -1,
        class_weight = 'balanced',
        **params
    )
    val = cross_val_score(clf, data, target, cv  = kf, n_jobs = -1, verbose = 1, 
                          scoring = 'f1_macro')
    return np.mean(1 - val)

space = {
    'num_leaves': hp.choice('num_leaves', np.arange(8, 128, 2, dtype = int)),
    'n_estimators':  hp.choice('n_estimators', np.arange(10, 100, 10, dtype = int)),
    'max_depth': hp.choice('max_depth', np.arange(5, 50, 5, dtype = int)),
    'min_child_samples': hp.choice('min_child_samples', np.arange(5, 25, 5, dtype = int)),
    'max_bin': hp.choice('max_bin', np.arange(200, 300, 10, dtype = int))
}

best_classifier = fmin(objective, space,
                        algo = tpe.suggest, max_evals = 100)
                              

lgb = lgbm.LGBMClassifier(random_state = 1, verbose = 1, class_weight = 'balanced',
                               **hyperopt.pyll.stochastic.sample(space))
print(lgb)

for ind1, ind2 in kf.split(data, target):
    lgb.fit(data[ind1], np.array(target)[ind1])
    print(pd.DataFrame(confusion_matrix(np.array(target)[ind2], lgb.predict(data[ind2])), 
                       index = [3, 4, 5, 6, 7, 8],
                      columns = [3, 4, 5, 6, 7, 8])) 
    print(classification_report(np.array(target)[ind2], lgb.predict(data[ind2])))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   48.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   48.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   47.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   47.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   38.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   38.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   29.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   29.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   33.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   33.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   22.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   22.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   34.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   34.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   38.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   38.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    7.0s remaining:    0.0s
[Parall

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    5.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    5.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   20.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   20.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   54.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   54.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   27.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   27.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   49.4s remaining:    0.0s
[Parall

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, learning_rate=0.1, max_bin=210, max_depth=35,
        min_child_samples=5, min_child_weight=5, min_split_gain=0.0,
        n_estimators=40, n_jobs=-1, num_leaves=118, objective=None,
        random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=50000, subsample_freq=1,
        verbose=1)
   3  4    5   6   7  8
3  0  0    2   0   0  0
4  0  1    5   3   0  0
5  0  0  100  14   0  0
6  0  0   23  77   7  0
7  0  0    2  13  19  0
8  0  0    0   1   1  1
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       1.00      0.11      0.20         9
           5       0.76      0.88      0.81       114
           6       0.71      0.72      0.72       107
           7       0.70      0.56      0.62        34
           8       1.00      0.33      0.50         3

   micro avg       0.74 

In [32]:
scoring = ['accuracy', 'f1_macro']
score_cv = cross_validate(lgb, data, target, cv  = 7, n_jobs = -1, verbose = 1, 
                        scoring = scoring)

score_kf6 = cross_validate(lgb, data, target, cv  = kf, n_jobs = -1, verbose = 1, 
                        scoring = scoring)


score_kf7 = cross_validate(lgb, data, target, 
                           cv  = StratifiedKFold(n_splits = 7, shuffle = True, random_state = 1), 
                           n_jobs = -1, verbose = 1, 
                        scoring = scoring)


print('metrics on KFold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_cv.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))


print('metrics on StKfold = 6 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf6.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

print('metrics on StKfold = 7 \n accuracy: {:f} \n  f1: {:f}'.format(*[np.mean(score_kf7.get(key)) 
                                             for key in ['test_accuracy', 'test_f1_macro']]))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:   37.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   33.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   33.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


metrics on KFold = 7 
 accuracy: 0.544858 
  f1: 0.259968
metrics on StKfold = 6 
 accuracy: 0.685321 
  f1: 0.365803
metrics on StKfold = 7 
 accuracy: 0.685997 
  f1: 0.358411


[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:   39.0s finished


In [33]:
np.where(lgb.feature_importances_ != 0)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  36,  44,
         68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  79,  80,  81,
         82,  83,  84,  85,  86,  87,  89,  90,  91,  92,  93,  94,  95,
         96,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
        110, 111, 113, 114, 115, 116, 117, 119, 120, 121, 122, 124, 125,
        126, 128, 129, 131, 133, 134, 137, 138, 141, 142, 145, 146, 150,
        151, 154, 155, 157, 158, 159, 161, 162, 165, 166, 169, 173, 174,
        175, 177, 178, 181, 182, 185, 186, 189, 190, 194, 195, 197, 198,
        199, 201, 202, 203, 205, 206, 207, 209, 210, 213, 217, 218, 221,
        222, 225, 226, 229, 230, 233, 234, 238, 242, 243, 246, 247, 249,
        250, 253, 254, 257, 261, 262, 263, 265, 266, 269, 270, 273, 274,
        277, 278, 279, 282, 283, 286, 287, 289, 290, 291, 293, 294, 295,
        297, 298, 301, 302, 305, 306, 309, 310, 313, 314, 317, 318, 321,
        322, 326, 329, 330, 331, 333, 334, 335, 338