# Uploading files to Colab

In [None]:
from google.colab import files

In [None]:
# Open the Colab upload widget. The return value is bound to a name so that
# the uploaded contents are not echoed back as the cell's output.
uploaded = files.upload()

In [None]:
# List the working directory to check which files are present after the upload.
!ls

In [None]:
import os

# Optional cleanup of a previously uploaded copy of the dataset.
# Run top-to-bottom, an unconditional delete would remove the very file that
# the "Working with the table" cell reads, so the cleanup is opt-in: set the
# flag to True, run this cell, then upload the file again.
REMOVE_PREVIOUS_UPLOAD = False
if REMOVE_PREVIOUS_UPLOAD and os.path.exists('data_for_ML_unn.xlsx'):
    os.remove('data_for_ML_unn.xlsx')

In [None]:
!pip install catboost

In [None]:
!pip install shap

In [None]:
!pip install scikit-optimize

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.class_weight import compute_sample_weight
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from xgboost import XGBClassifier
from xgboost import XGBRegressor

# ML

## 1. Preparation

### 1.1. Working with the table

In [None]:
# Load the dataset and reorder its columns: the predicted column
# ('Sugar_encoded') goes to the very end and 'Wine_strength' to the front.
db_2 = pd.read_excel('data_for_ML_unn.xlsx')
db_1 = db_2.drop(columns=['Sugar_encoded'])
# Insert at the actual end of the table instead of a hard-coded position,
# so the cell keeps working if the number of columns changes.
db_1.insert(len(db_1.columns), 'Sugar_encoded', db_2['Sugar_encoded'])
db_final = db_1.drop(columns=['Wine_strength'])
db_final.insert(0, 'Wine_strength', db_1['Wine_strength'])

In [None]:
# Palette of colors shared by the plots in this notebook
colors = [
    '#a5678e',
    '#e8b7d4',
    '#beb7d9',
    '#7eabd4',
    '#31539d',
]

In [None]:
# Summary statistics of the reordered table (db_final) as a quick sanity
# check of the loaded data.
db_final.describe()

### 1.2. Splitting data into test and train

In [None]:
# Helper that pre-computes the train/test index pairs used as `cv` in this notebook
def separation(x, y, n_splits, test_size):
    """Return a list of (train_indices, test_indices) pairs.

    The pairs come from sklearn's ShuffleSplit with a fixed random_state of 42,
    so repeated calls with the same arguments give the same splits.

    x, y      -- features and target passed to ShuffleSplit.split
    n_splits  -- number of index pairs to generate
    test_size -- test_size argument forwarded to ShuffleSplit

    NOTE(review): no group information is passed to the splitter, so nothing
    in this function keeps a given "system" on only one side of a split --
    confirm whether a group-aware splitter was intended.
    """
    splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
    return [(train_idx, test_idx) for train_idx, test_idx in splitter.split(x, y)]

In [None]:
# Hold-out split: a single shuffled split with test_size=0.2.
# y is the predicted column; x is every column from '1_vec_1' to the end of
# the table (a later cell drops 'Sugar_encoded' from the training features).
y = db_final['Sugar_encoded']
x = db_final.loc[:, '1_vec_1':]
cv = separation(x, y, 1, 0.2)

# Only one split was requested, so take its index pair directly.
train_idx, val_idx = cv[-1]
x_train = x.iloc[train_idx]
x_test = x.iloc[val_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[val_idx]

### 1.3. Functions for plotting and calculating metrics

In [None]:
# Metric helper shared by all cross-validation cells
def metrics(regr, x_train, y_train, y_test, y_pred, y1_pred):
    """Print and return micro-averaged F1 and balanced accuracy.

    y_test / y_pred   -- true and predicted labels of the validation part
    y_train / y1_pred -- true and predicted labels of the training part
    regr, x_train     -- accepted but not used by the body

    Returns [F1_train, F1, Accuracy_train, Accuracy].
    """
    # Keys are the exact labels printed below; values are the scores.
    scores = {
        'f1_score:': f1_score(y_test, y_pred, average='micro'),
        'Accuracy:': balanced_accuracy_score(y_test, y_pred),
        'f1_train:': f1_score(y_train, y1_pred, average='micro'),
        'Accuracy_train:': balanced_accuracy_score(y_train, y1_pred),
    }
    for label, value in scores.items():
        print(label, value)
    return [
        scores['f1_train:'],
        scores['f1_score:'],
        scores['Accuracy_train:'],
        scores['Accuracy:'],
    ]

## 2. Random Forest Regression

In [None]:
# Bayesian hyperparameter search for the random forest classifier.

# Training features and target for model selection. They are defined here so
# this cell does not depend on a later cell having been run first.
x_model = x_train.drop(columns=['Sugar_encoded'])
y_model = y_train.copy()

regr_RFR = RandomForestClassifier()

# 'criterion' must be a classification criterion: the regression criteria
# ('squared_error', 'absolute_error') are rejected by RandomForestClassifier.
search_space = {"n_estimators": Integer(50, 400),
    "criterion": Categorical(['gini', 'entropy']),
    "min_samples_split": Real(0.01, 0.6),
    "min_samples_leaf": Real(0.01, 0.5),
    "max_depth": Integer(1, 12),
    "max_features": Integer(3, 24)}

# Scaling the data
# NOTE(review): the scaler is fitted on all of x_model before the search
# splits it, so validation folds influence the scaling -- confirm this is acceptable.
sc = MinMaxScaler(feature_range=(0, 1))
x_train01 = sc.fit_transform(x_model)
opt_RFR = BayesSearchCV(estimator=regr_RFR, search_spaces=search_space,
                        cv=separation(x_model, y_model, 5, 0.2),
                        n_iter=32, verbose=2, n_jobs=-1)
opt_RFR.fit(x_train01, y_model)

# Hyperparameters of the best estimator found by the search
n_esti = opt_RFR.best_estimator_.n_estimators
criterion = opt_RFR.best_estimator_.criterion
min_leaf = opt_RFR.best_estimator_.min_samples_leaf
min_split = opt_RFR.best_estimator_.min_samples_split
depth = opt_RFR.best_estimator_.max_depth
max_feat = opt_RFR.best_estimator_.max_features

print('n_esti:', n_esti,'depth:', depth, 'criterion:', criterion, 'min_leaf:', min_leaf,'min_split:', min_split,'max_feat:', max_feat)

In [None]:
# Cross-validated evaluation of the random forest classifier.
# This cell also creates the shared results table (df_error_train) and the
# training features/target/splits reused by the following model sections.
df_error_train = pd.DataFrame()
x_model = x_train.drop(columns=['Sugar_encoded'])
y_model = y_train.copy()
cv = separation(x_model, y_model, 5, 0.2)

# Table with the true labels and the per-fold predictions.
# An explicit copy keeps the new columns out of x_model.
RFR_df = x_model.copy()
RFR_df['Sugar'] = y_model

for count, (train_idx, val_idx) in enumerate(cv):
    x_train_mod, x_test_mod = x_model.iloc[train_idx], x_model.iloc[val_idx]
    y_train_mod, y_test_mod = y_model.iloc[train_idx], y_model.iloc[val_idx]

    # Scaling the data (the scaler is fitted on the training part of the fold only)
    sc = MinMaxScaler(feature_range=(0, 1))
    x_train_mod = sc.fit_transform(x_train_mod)
    x_test_mod = sc.transform(x_test_mod)

    # Random forest with default hyperparameters. To evaluate the tuned model use:
    # regr_RFR = RandomForestClassifier(n_estimators=n_esti, max_depth=depth, min_samples_leaf=min_leaf,
    #                              min_samples_split=min_split, criterion=criterion, max_features=max_feat)
    regr_RFR = RandomForestClassifier()
    regr_RFR.fit(x_train_mod, y_train_mod)
    y_pred = regr_RFR.predict(x_test_mod)
    y1_pred = regr_RFR.predict(x_train_mod)

    # Store the predictions of this fold. A single .iloc call on the frame is
    # used because chained assignment (df[col].iloc[rows] = ...) writes to a
    # temporary object and is not written back under pandas Copy-on-Write.
    train_col = 'Sugar_train' + str(count)
    test_col = 'Sugar_test' + str(count)
    RFR_df[train_col] = None
    RFR_df[test_col] = None
    RFR_df.iloc[train_idx, RFR_df.columns.get_loc(train_col)] = y1_pred
    RFR_df.iloc[val_idx, RFR_df.columns.get_loc(test_col)] = y_pred

    # Calculating metrics
    df_error_train['Random Forest Regression' + str(count)] = metrics(regr_RFR, x_train_mod, y_train_mod, y_test_mod, y_pred, y1_pred)
df_error_train

## 3. Gradient Boosting Regression

In [None]:
# Bayesian hyperparameter search for the gradient boosting classifier
regr_GBR = GradientBoostingClassifier()

# Ranges explored by the search
search_space = dict(
    learning_rate=Real(0.05, 0.15),
    n_estimators=Integer(50, 400),
    max_depth=Integer(1, 6),
    min_samples_leaf=Real(0.05, 0.5),
    min_samples_split=Real(0.02, 0.6),
)

# Scale the features to [0, 1] before the search
sc = MinMaxScaler(feature_range=(0, 1))
x_train01 = sc.fit_transform(x_model)

opt_GBR = BayesSearchCV(
    estimator=regr_GBR,
    search_spaces=search_space,
    cv=separation(x_model, y_model, 5, 0.2),
    n_iter=32,
    verbose=2,
    n_jobs=-1,
)
opt_GBR.fit(x_train01, y_model)

# Read the winning hyperparameters off the best estimator
best_GBR = opt_GBR.best_estimator_
learning_rate = best_GBR.learning_rate
n_estimators = best_GBR.n_estimators
max_depth = best_GBR.max_depth
min_samples_leaf = best_GBR.min_samples_leaf
min_samples_split = best_GBR.min_samples_split

print('learning_rate:', learning_rate,'n_estimators:', n_estimators, 'max_depth:', max_depth, 'min_samples_leaf:', min_samples_leaf,'min_samples_split:', min_samples_split)

In [None]:
# Cross-validated evaluation of the gradient boosting classifier.
# Reuses x_model, y_model, cv and df_error_train created in the random forest section.

# Table with the true labels and the per-fold predictions.
# An explicit copy keeps the new columns out of x_model.
GBR_df = x_model.copy()
GBR_df['Sugar'] = y_model

for count, (train_idx, val_idx) in enumerate(cv):
    x_train_mod, x_test_mod = x_model.iloc[train_idx], x_model.iloc[val_idx]
    y_train_mod, y_test_mod = y_model.iloc[train_idx], y_model.iloc[val_idx]

    # Scaling the data (the scaler is fitted on the training part of the fold only)
    sc = MinMaxScaler(feature_range=(0, 1))
    x_train_mod = sc.fit_transform(x_train_mod)
    x_test_mod = sc.transform(x_test_mod)

    # Gradient boosting with default hyperparameters. To evaluate the tuned model use:
    # regr_GBR = GradientBoostingClassifier(learning_rate = learning_rate, n_estimators = n_estimators, max_depth= max_depth, min_samples_leaf= min_samples_leaf, min_samples_split= min_samples_split)
    regr_GBR = GradientBoostingClassifier()
    regr_GBR.fit(x_train_mod, y_train_mod)
    y_pred = regr_GBR.predict(x_test_mod)
    y1_pred = regr_GBR.predict(x_train_mod)

    # Store the predictions of this fold. A single .iloc call on the frame is
    # used because chained assignment (df[col].iloc[rows] = ...) writes to a
    # temporary object and is not written back under pandas Copy-on-Write.
    train_col = 'Sugar_train' + str(count)
    test_col = 'Sugar_test' + str(count)
    GBR_df[train_col] = None
    GBR_df[test_col] = None
    GBR_df.iloc[train_idx, GBR_df.columns.get_loc(train_col)] = y1_pred
    GBR_df.iloc[val_idx, GBR_df.columns.get_loc(test_col)] = y_pred

    # Calculating metrics
    df_error_train['Gradient Boosting Regression' + str(count)] = metrics(regr_GBR, x_train_mod, y_train_mod, y_test_mod, y_pred, y1_pred)
df_error_train

## 4. Cat Boosting Regression

Did not work; the run failed with:

ValueError: shape mismatch: value array of shape (9208,1) could not be broadcast to indexing result of shape (9208,)

In [None]:
# Bayesian hyperparameter search for the CatBoost classifier
regr_CBR = CatBoostClassifier()

search_space = {"iterations": Integer(100, 600),
                "learning_rate": Real(0.05, 0.4),
                "depth": Integer(1, 6)}

# Scaling the data
sc = MinMaxScaler(feature_range=(0, 1))
x_train01 = sc.fit_transform(x_model)

# The splits come from the notebook's separation() helper
# (the original call misspelled it, which raised NameError).
opt_CBR = BayesSearchCV(estimator=regr_CBR, search_spaces=search_space,
                        cv=separation(x_model, y_model, 5, 0.2),
                        n_iter=32, verbose=2, n_jobs=-1)
opt_CBR.fit(x_train01, y_model)

# Best hyperparameters found by the search
iterations = opt_CBR.best_params_['iterations']
learning_rate = opt_CBR.best_params_['learning_rate']
depth = opt_CBR.best_params_['depth']


print('iterations:', iterations,'learning_rate:', learning_rate, 'depth:', depth)

In [None]:
# Cross-validated evaluation of the CatBoost classifier.
# Reuses x_model, y_model, cv and df_error_train created in the random forest
# section; df_error_train is deliberately NOT re-created here, otherwise the
# results of the previous models would be discarded.

# Table with the true labels and the per-fold predictions.
# An explicit copy keeps the new columns out of x_model.
CBR_df = x_model.copy()
CBR_df['Sugar'] = y_model

for count, (train_idx, val_idx) in enumerate(cv):
    x_train_mod, x_test_mod = x_model.iloc[train_idx], x_model.iloc[val_idx]
    y_train_mod, y_test_mod = y_model.iloc[train_idx], y_model.iloc[val_idx]

    # Scaling the data (the scaler is fitted on the training part of the fold only)
    sc = MinMaxScaler(feature_range=(0, 1))
    x_train_mod = sc.fit_transform(x_train_mod)
    x_test_mod = sc.transform(x_test_mod)

    # CatBoost with default hyperparameters. To evaluate the tuned model use:
    # regr_CBR = CatBoostClassifier(iterations = iterations, learning_rate = learning_rate, depth = depth)
    regr_CBR = CatBoostClassifier()
    regr_CBR.fit(x_train_mod, y_train_mod)
    # The predictions can come back as a column of shape (n, 1); flatten them
    # to shape (n,) so they can be stored in a table column (this was the
    # "shape mismatch" ValueError) and passed to the metric functions.
    y_pred = np.ravel(regr_CBR.predict(x_test_mod))
    y1_pred = np.ravel(regr_CBR.predict(x_train_mod))

    # Store the predictions of this fold. A single .iloc call on the frame is
    # used because chained assignment (df[col].iloc[rows] = ...) writes to a
    # temporary object and is not written back under pandas Copy-on-Write.
    train_col = 'Sugar_train' + str(count)
    test_col = 'Sugar_test' + str(count)
    CBR_df[train_col] = None
    CBR_df[test_col] = None
    CBR_df.iloc[train_idx, CBR_df.columns.get_loc(train_col)] = y1_pred
    CBR_df.iloc[val_idx, CBR_df.columns.get_loc(test_col)] = y_pred

    # Calculating metrics
    df_error_train['Cat Boosting Regression' + str(count)] = metrics(regr_CBR, x_train_mod, y_train_mod, y_test_mod, y_pred, y1_pred)
df_error_train

## 5. Extreme Gradient Boosting (XGBoost)

In [None]:
# Bayesian hyperparameter search for the XGBoost classifier
regr_XGB = XGBClassifier()

# Ranges explored by the search
search_space = dict(
    n_estimators=Integer(50, 350),
    max_depth=Integer(2, 30),
    subsample=Real(0.05, 1.0),
    colsample_bytree=Real(0.05, 1.0),
)

# Scale the features to [0, 1] before the search
sc = MinMaxScaler(feature_range=(0, 1))
x_train01 = sc.fit_transform(x_model)

opt_XGB = BayesSearchCV(
    estimator=regr_XGB,
    search_spaces=search_space,
    cv=separation(x_model, y_model, 5, 0.2),
    n_iter=32,
    verbose=2,
    n_jobs=-1,
)
opt_XGB.fit(x_train01, y_model)

# Read the winning hyperparameters off the best estimator
best_XGB = opt_XGB.best_estimator_
n_estimators = best_XGB.n_estimators
max_depth = best_XGB.max_depth
subsample = best_XGB.subsample
colsample_bytree = best_XGB.colsample_bytree

print('n_estimators:', n_estimators,'max_depth:', max_depth, 'subsample:', subsample,'colsample_bytree:', colsample_bytree)

In [None]:
# Cross-validated evaluation of the XGBoost classifier.
# Reuses x_model, y_model, cv and df_error_train created in the random forest section.

# Table with the true labels and the per-fold predictions.
# An explicit copy keeps the new columns out of x_model.
XGB_df = x_model.copy()
XGB_df['Sugar'] = y_model

for count, (train_idx, val_idx) in enumerate(cv):
    x_train_mod, x_test_mod = x_model.iloc[train_idx], x_model.iloc[val_idx]
    y_train_mod, y_test_mod = y_model.iloc[train_idx], y_model.iloc[val_idx]

    # Scaling the data (the scaler is fitted on the training part of the fold only)
    sc = MinMaxScaler(feature_range=(0, 1))
    x_train_mod = sc.fit_transform(x_train_mod)
    x_test_mod = sc.transform(x_test_mod)

    # XGBoost with default hyperparameters. To evaluate the tuned model use:
    # regr_XGB = XGBClassifier(n_estimators = n_estimators, max_depth = max_depth, subsample = subsample, colsample_bytree = colsample_bytree)
    regr_XGB = XGBClassifier()
    regr_XGB.fit(x_train_mod, y_train_mod)
    y_pred = regr_XGB.predict(x_test_mod)
    y1_pred = regr_XGB.predict(x_train_mod)

    # Store the predictions of this fold. A single .iloc call on the frame is
    # used because chained assignment (df[col].iloc[rows] = ...) writes to a
    # temporary object and is not written back under pandas Copy-on-Write.
    train_col = 'Sugar_train' + str(count)
    test_col = 'Sugar_test' + str(count)
    XGB_df[train_col] = None
    XGB_df[test_col] = None
    XGB_df.iloc[train_idx, XGB_df.columns.get_loc(train_col)] = y1_pred
    XGB_df.iloc[val_idx, XGB_df.columns.get_loc(test_col)] = y_pred

    # Calculating metrics
    df_error_train['XGBoosting Regression' + str(count)] = metrics(regr_XGB, x_train_mod, y_train_mod, y_test_mod, y_pred, y1_pred)
df_error_train

## 6. General graphs

In [None]:
# General table obtained after cross-validation.
# Row labels follow the order of the list returned by metrics():
# [F1_train, F1, Accuracy_train, Accuracy].
metric_rows = ['F1_train', 'F1_test', 'Accuracy_train', 'Accuracy_test']
df_error_train.index = metric_rows
df_error_train

In [None]:
# Table with averages and deviations for metrics
df_error_train_1 = df_error_train.transpose()


def f_get_name(x):
    """Return a result label without its trailing fold number.

    Labels are built as '<model name>' + str(fold), so all trailing digits are
    stripped; this also works when the fold number has more than one digit.
    """
    return x.rstrip('0123456789')


# One row per (model, fold); 'Regr' holds the model name used for grouping
df_error_train_1['Regr'] = df_error_train_1.index
df_error_train_1['Regr'] = df_error_train_1['Regr'].apply(f_get_name)
df_error_train_1 = df_error_train_1.groupby('Regr').agg(['mean', 'std'])
# Names for the new columns: '<metric>_<statistic>', e.g. 'F1_train_mean'
df_error_train_1.columns = ['_'.join(col).rstrip('_') for col in df_error_train_1.columns.values]
df_error_train_1

In [None]:
import matplotlib.patches as mpatches
# Let's build a graph that will help compare the methods by the main metrics

barWidth = 0.4

# One group of bars per model present in the summary table, so the chart
# keeps working when a model is added to or removed from the comparison.
n_models = len(df_error_train_1)
br1 = np.arange(n_models)
br2 = [x + barWidth for x in br1]
br3 = np.arange(n_models)
br4 = [x + barWidth for x in br3]

fig, ((ax1, ax2)) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(20, 6))

# Left panel: F1 on the training and validation parts (error bars = std over folds)
ax1.bar(br1, df_error_train_1['F1_train_mean'], color =colors[0], width = barWidth, label ='F1_train', yerr = df_error_train_1['F1_train_std'])
ax1.bar(br2, df_error_train_1['F1_test_mean'], color =colors[1], width = barWidth, label ='F1_validation', yerr = df_error_train_1['F1_test_std'])

# Right panel: balanced accuracy on the training and validation parts
ax2.bar(br3, df_error_train_1['Accuracy_train_mean'], color =colors[4], width = barWidth, label ='Accuracy_train', yerr = df_error_train_1['Accuracy_train_std'])
ax2.bar(br4, df_error_train_1['Accuracy_test_mean'], color =colors[3], width = barWidth, label ='Accuracy_validation', yerr = df_error_train_1['Accuracy_test_std'])

# Tick labels are taken from the table index so they always match the bar
# order; known model names are shortened, unknown ones are shown in full.
short_names = {'Cat Boosting Regression': 'CBR',
               'Gradient Boosting Regression': 'GBR',
               'Random Forest Regression': 'RFR',
               'XGBoosting Regression': 'XGB'}
tick_labels = [short_names.get(name, name) for name in df_error_train_1.index]
plt.xticks(br1, tick_labels, fontsize = 12)

ax1.grid(color='#C3C6BA', linewidth=0.3)
ax2.grid(color='#C3C6BA', linewidth=0.3)


ax1.legend(fontsize = 12)
ax2.legend(fontsize = 12)
