In [1]:
import pandas as pd
import numpy as np
import joblib

import lightgbm as lgb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# Load the red-wine quality dataset (semicolon-separated CSV) and preview the first rows
df = pd.read_csv('data/winequality-red.csv', sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


В наборе данных есть следующие признаки:

* fixed acidity - фиксированная кислотность 
* volatile acidity - летучая кислотность 
* citric acid - лимонная кислота 
* residual sugar - остаточный сахар 
* chlorides - хлориды 
* free sulfur dioxide - свободный диоксид серы 
* total sulfur dioxide - общий диоксид серы 
* density - плотность 
* pH - водородный показатель 
* sulphates - сульфаты 
* alcohol - алкоголь
* quality - качество (целевая переменная, целочисленная оценка)

In [3]:
# Check for missing values and inspect the dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
# Display the rows that exactly repeat an earlier row
df[df.duplicated()]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
11,7.5,0.500,0.36,6.10,0.071,17.0,102.0,0.99780,3.35,0.80,10.5,5
27,7.9,0.430,0.21,1.60,0.106,10.0,37.0,0.99660,3.17,0.91,9.5,5
40,7.3,0.450,0.36,5.90,0.074,12.0,87.0,0.99780,3.33,0.83,10.5,5
65,7.2,0.725,0.05,4.65,0.086,4.0,11.0,0.99620,3.41,0.39,10.9,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1563,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5
1564,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5
1567,7.2,0.695,0.13,2.00,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,5
1581,6.2,0.560,0.09,1.70,0.053,24.0,32.0,0.99402,3.54,0.60,11.3,5


240 полностью идентичных записей, удалим их.

In [5]:
# Remove exact duplicate rows and rebind df to the de-duplicated frame
df = df.drop_duplicates()
df.shape

(1359, 12)

In [6]:
# Summary statistics, transposed so that each column of df becomes one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1359.0,8.310596,1.73699,4.6,7.1,7.9,9.2,15.9
volatile acidity,1359.0,0.529478,0.183031,0.12,0.39,0.52,0.64,1.58
citric acid,1359.0,0.272333,0.195537,0.0,0.09,0.26,0.43,1.0
residual sugar,1359.0,2.5234,1.352314,0.9,1.9,2.2,2.6,15.5
chlorides,1359.0,0.088124,0.049377,0.012,0.07,0.079,0.091,0.611
free sulfur dioxide,1359.0,15.893304,10.44727,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1359.0,46.825975,33.408946,6.0,22.0,38.0,63.0,289.0
density,1359.0,0.996709,0.001869,0.99007,0.9956,0.9967,0.99782,1.00369
pH,1359.0,3.309787,0.155036,2.74,3.21,3.31,3.4,4.01
sulphates,1359.0,0.658705,0.170667,0.33,0.55,0.62,0.73,2.0


In [7]:
# Optional binarisation (currently disabled): scores <= 5 would become 0 ("bad"),
# scores > 5 would become 1 ("good"). While the line below stays commented out,
# the raw quality score is used as the target.
#df['quality'] = df.quality.apply(lambda x: 0 if x <= 5 else 1)

# Separate the target column from the feature matrix
y = df['quality']
X = df.drop(columns='quality')

In [8]:
# Number of rows per quality score (class balance of the target)
df['quality'].value_counts()

quality
5    577
6    535
7    167
4     53
8     17
3     10
Name: count, dtype: int64

In [9]:
# Streaming-style reader: serves the training data in shuffled mini-batches
def read_streaming(X_train, y_train, batch_size=500):
    """Yield shuffled ``(X, y)`` mini-batches as NumPy arrays.

    The data is shuffled once with a fixed seed (``random_state=13``) and then
    emitted in consecutive chunks of ``batch_size`` rows. The final chunk may
    hold fewer than ``batch_size`` rows; it is yielded as well so that no
    training rows are silently discarded.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature rows (e.g. a pandas DataFrame or a 2-D array).
    y_train : array-like of shape (n_samples,)
        Targets aligned with ``X_train``.
    batch_size : int, default 500
        Maximum number of rows per yielded batch; must be at least 1.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Feature batch and the matching target batch.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. Because this is a generator, the
        error surfaces when the first batch is requested.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    features, labels = shuffle(X_train, y_train, random_state=13)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Step through the shuffled rows chunk by chunk; the last slice is simply
    # shorter when the row count is not a multiple of batch_size.
    for start in range(0, len(features), batch_size):
        stop = start + batch_size
        yield features[start:stop], labels[start:stop]

In [10]:
def get_lgbm(X, y, batch_size=500, num_boost_round=1000, test_size=0.15,
             random_state=13):
    """Incrementally train a LightGBM multiclass model on mini-batches.

    The data is served by ``read_streaming`` in batches. Every batch is split
    into a training and a validation part, and boosting continues from the
    model fitted on the previous batches (``init_model``), so the returned
    model has seen all batches.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Integer class labels; they are used directly as class indices and
        must be smaller than ``num_class`` (10).
    batch_size : int, default 500
        Number of rows requested per batch from ``read_streaming``.
    num_boost_round : int, default 1000
        Boosting rounds added for each batch.
    test_size : float or int, default 0.15
        Share (or count) of each batch held out as the validation set.
    random_state : int, default 13
        Seed for the per-batch train/validation split.

    Returns
    -------
    The booster returned by the last ``lgb.train`` call.

    Raises
    ------
    ValueError
        If no batch could be used for training (for example when the input
        is empty), instead of silently returning ``None``.
    """
    params = {
        'task': 'train',
        'objective': 'multiclass',  # canonical name; 'application' is an alias
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'tree_learner': 'serial',
        'metric': ['multi_logloss'],
        'max_bin': 255,
        # Labels double as class indices, so this must exceed the largest label.
        'num_class': 10
    }

    gbm = None
    batches = read_streaming(X, y, batch_size=batch_size)

    for i, (X_batch, y_batch) in enumerate(batches):
        # A batch with fewer than two rows cannot be split into train/validation.
        if len(X_batch) < 2:
            continue

        X_train, X_test, y_train, y_test = train_test_split(
            X_batch, y_batch, test_size=test_size, random_state=random_state
        )
        lgb_train = lgb.Dataset(X_train, y_train.ravel())
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

        # init_model=gbm continues boosting from the previous batches' model;
        # keep_training_booster=True keeps it usable for eval_train() below
        # and for further training on the next batch.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgb_eval],
                        init_model=gbm,
                        keep_training_booster=True)

        print(f"{i} time")
        score_train = {score[1]: score[2] for score in gbm.eval_train()}
        print('The score of the current model in the training set is: logloss=%.4f \n'
              % (score_train['multi_logloss']))

    if gbm is None:
        raise ValueError("no training batches were produced; cannot fit a model")

    return gbm

In [11]:
# Hold out 20% of the rows for the final evaluation. The fixed seed makes the
# split, and therefore the reported metrics, reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=13
)
gbm = get_lgbm(train_X, train_y)

# One score per class for every row; the position of the largest score is taken
# as the predicted label (labels are used directly as class indices).
pred_y = gbm.predict(test_X)
pred_classes = np.argmax(pred_y, axis=1)
print(f'F1 score: {f1_score(test_y, pred_classes, average="weighted")}')
print('------------------------------------------')
print(f'Precision: {precision_score(test_y, pred_classes, average="weighted")}')
print('------------------------------------------')
print(f'Recall: {recall_score(test_y, pred_classes, average="weighted")}')

# Persist the model and load it back as a round-trip check.
# NOTE(review): the file name says "loan" although this notebook works with
# wine-quality data; it looks like a copy-paste leftover. Confirm nothing else
# loads this path before renaming it.
joblib.dump(gbm, 'loan_model_2.pkl')
gbm = joblib.load('loan_model_2.pkl')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 675
[LightGBM] [Info] Number of data points in the train set: 425, number of used features: 11
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -4.665795
[LightGBM] [Info] Start training from score -3.279500
[LightGBM] [Info] Start training from score -0.958339
[LightGBM] [Info] Start training from score -0.910426
[LightGBM] [Info] Start training from score -1.877702
[LightGBM] [Info] Start training from score -4.260330
[LightGBM] [Info] Start training from score -34.538776
0 time
The score of the current model in the training set is: logloss=0.0000 

[LightGBM] [Info] Auto-choo