In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from joblib import Parallel, delayed
# from mlxtend.plotting import plot_decision_regions

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [78]:
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Функции

In [30]:
class Stacking:
    """Two-level ensemble: base-model predictions are fed to a meta-model.

    With ``blending=False`` (stacking) the training meta-features are the
    predictions returned by ``cross_val_predict`` on the training set.
    With ``blending=True`` the training set is split once with
    ``train_test_split``; base models are fitted on one part and their
    predictions on the held-out part become the training meta-features.

    Parameters
    ----------
    estimators : sequence
        Base models exposing ``fit(X, y)`` and ``predict(X)``.
    final_estimator : object
        Meta-model exposing ``fit`` / ``predict``. It is fitted in place.
    blending : bool, default False
        ``False`` selects stacking, ``True`` selects blending.
    cv : default 5
        Forwarded as ``cv`` to ``cross_val_predict`` (stacking only).
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``; -1 uses all available cores.
    val_size : default None
        Forwarded as ``test_size`` to ``train_test_split`` (blending only).
        ``None`` keeps that function's default split size.
    random_state : default 0
        Seed of the blending hold-out split.
    """

    def __init__(self, estimators, final_estimator, blending=False, cv=5, n_jobs=-1,
                 val_size=None, random_state=0):
        self.estimators = estimators            # list of base models (estimators)
        self.final_estimator = final_estimator  # meta-model that aggregates the base-model predictions
        self.blending = blending                # stacking (False) or blending (True)
        self.cv = cv                            # number of cross-validation splits used for the base models
        self.n_jobs = n_jobs                    # number of parallel jobs (-1 = all available cores)
        self.val_size = val_size                # hold-out size for blending (None = train_test_split default)
        self.random_state = random_state        # seed of the blending hold-out split

    def _X_pred(self, estimator, data):
        """Return one base model's predictions used as training meta-features.

        ``data`` is ``[X_fit, y_fit, X_val]`` in blending mode and
        ``[X_train, y_train]`` in stacking mode.
        """
        if self.blending:
            X_train_v, y_train_v, X_val = data
            return estimator.fit(X_train_v, y_train_v).predict(X_val)
        else:
            X_train, y_train = data
            return cross_val_predict(estimator, X_train, y_train, cv=self.cv)

    def _X_test_pred(self, estimator, data):
        """Fit ``estimator`` on ``(X_train, y_train)`` and return its predictions for ``X_test``."""
        X_train, y_train, X_test = data
        return estimator.fit(X_train, y_train).predict(X_test)

    def _meta_data(self, X_train, y_train, X_test):
        """Build ``(meta_X_train, meta_y_train, meta_X_test)`` for the meta-model.

        Both meta-feature frames have one column per base estimator.

        Raises
        ------
        ValueError
            If no base estimator was supplied.
        """
        base_models = list(self.estimators)
        if not base_models:
            raise ValueError("Stacking requires at least one base estimator")

        if self.blending:
            # Hold-out validation: the meta-model is trained on predictions for X_val.
            X_train_v, X_val, y_train_v, y_val = train_test_split(
                X_train, y_train, test_size=self.val_size, random_state=self.random_state)
            train_data = [X_train_v, y_train_v, X_val]
            test_data = [X_train_v, y_train_v, X_test]
            meta_y_train = y_val
        else:
            train_data = [X_train, y_train]
            test_data = [X_train, y_train, X_test]
            meta_y_train = y_train

        # Fit clones, not the caller's objects: whether an in-place fit is visible
        # to the caller would otherwise depend on the joblib backend in use.
        # safe=False falls back to a deep copy for objects that are not
        # scikit-learn estimators.
        cv_X_train_preds = (delayed(self._X_pred)(clone(est, safe=False), train_data)
                            for est in base_models)
        X_test_preds = (delayed(self._X_test_pred)(clone(est, safe=False), test_data)
                        for est in base_models)

        # Each Parallel call yields one prediction vector per estimator (rows);
        # the transpose below turns them into one column per estimator.
        meta_X_train = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(cv_X_train_preds))
        meta_X_test = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(X_test_preds))

        return meta_X_train.T, meta_y_train, meta_X_test.T

    def fit_predict(self, X_train, y_train, X_test):
        """Train the ensemble on ``(X_train, y_train)`` and return predictions for ``X_test``.

        ``final_estimator`` (the meta learner / blender) is fitted in place on
        the meta-features and then used to predict from the test meta-features.
        """
        meta_X_train, meta_y_train, meta_X_test = self._meta_data(X_train, y_train, X_test)

        return self.final_estimator.fit(meta_X_train, meta_y_train).predict(meta_X_test)

In [3]:
def decision_boundary_plot(X, y, X_train, y_train, clf, feature_indexes, title=None):
    """Fit ``clf`` on two selected features and draw its decision regions.

    ``feature_indexes`` must select exactly two columns: their names label the
    axes. ``clf`` is fitted in place on the selected columns of ``X_train``;
    the regions are drawn over the selected columns of ``X`` with labels ``y``.

    NOTE(review): ``plot_decision_regions`` is not imported by an active line
    in this notebook (the mlxtend import is commented out), so calling this
    function raises NameError unless the name is defined elsewhere.
    """
    x_label, y_label = X.columns[feature_indexes]
    all_points = X.values[:, feature_indexes]
    clf.fit(X_train.values[:, feature_indexes], y_train.values)

    plot_decision_regions(X=all_points, y=y.values, clf=clf)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

In [44]:
def start(X_train, X_test, y_train, y_test, estimators_1, model_LogisticRegression_1):
    """Run the stacking and the blending ensemble and print test accuracy and F1.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``Stacking.fit_predict``.
    X_test, y_test : test features and the true labels the predictions are scored against.
    estimators_1 : list of base estimators handed to ``Stacking``.
    model_LogisticRegression_1 : meta-model (final estimator); despite the
        parameter name, any estimator with ``fit`` / ``predict`` is accepted.

    Returns
    -------
    None. One line per ensemble flavour is printed.
    """
    # (label, blending flag) for the two ensemble flavours, reported in this order.
    for label, use_blending in (('stacking', False), ('blending', True)):
        ensemble = Stacking(estimators=estimators_1,
                            final_estimator=model_LogisticRegression_1,
                            blending=use_blending)
        y_pred = ensemble.fit_predict(X_train, y_train, X_test)
        # scikit-learn metrics expect (y_true, y_pred); the order matters for
        # F1 as soon as a non-binary average or pos_label is involved.
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        print(f'{label}   acc: {(acc * 100):.2f}   f1: {f1:.2f}')

# Загрузка данных

In [4]:
# Read the semicolon-separated CSV and discard the 'direction' column in one chain.
file_name_dum = './datasets/all_dataset_dum.csv'
data = (
    pd.read_csv(file_name_dum, sep=";", encoding='utf8')
    .drop(columns=['direction'])
)
data.head(2)


Unnamed: 0,id,distance,len,Кол-во пиков,Сред.зн.пика,Min,Max,Медиана,Смещение низ,Смещение верх,...,Верх.квартиль,Дисперсия,Асимметрия,Куртозис,Std откл.,Коэф.вариации,Std ошибка,X0,X1,X2
0,1,6656,1077,82,6.616,3.836,7.216,5.428,1.592,1.788,...,6.157,0.66,0.033,-1.112,0.812,0.149,0.025,1,0,0
1,1,6625,1077,74,5.802,3.226,6.143,4.719,1.493,1.424,...,5.323,0.555,0.024,-1.114,0.745,0.159,0.023,1,0,0


In [5]:
# Read the "balance" variant of the CSV and discard the 'direction' column in one chain.
file_name_dum_balance = './datasets/all_dataset_dum_balance.csv'
data_balance = (
    pd.read_csv(file_name_dum_balance, sep=";", encoding='utf8')
    .drop(columns=['direction'])
)
data_balance.head(2)


Unnamed: 0,id,distance,len,Кол-во пиков,Сред.зн.пика,Min,Max,Медиана,Смещение низ,Смещение верх,...,Верх.квартиль,Дисперсия,Асимметрия,Куртозис,Std откл.,Коэф.вариации,Std ошибка,X0,X1,X2
0,1,6656,1077,82,6.616,3.836,7.216,5.428,1.592,1.788,...,6.157,0.66,0.033,-1.112,0.812,0.149,0.025,1,0,0
1,1,6625,1077,74,5.802,3.226,6.143,4.719,1.493,1.424,...,5.323,0.555,0.024,-1.114,0.745,0.159,0.023,1,0,0


# Const

In [6]:
# Candidate feature subsets, given as column names of the loaded frames.
# Every list also contains 'id', which the split cells use as the target.
# In the cells shown in this notebook only f_0 is actually used.
f_0 = ['id', 'distance', 'Кол-во пиков', 'Сред.зн.пика', 'Медиана', 'Смещение', 
       'Ниж.квартиль', 'Верх.квартиль', 'Дисперсия', 'Куртозис', 'X0', 'X1', 'X2']
# f_1: f_0 without 'Ниж.квартиль'.
f_1 = ['id', 'distance', 'Кол-во пиков', 'Сред.зн.пика', 'Медиана', 'Смещение', 
       'Верх.квартиль', 'Дисперсия', 'Куртозис', 'X0', 'X1', 'X2']
# Disabled subset, kept for reference:
# f_2 = ['Max', 'Mean', 'Std откл.', 'Асимметрия', 'Куртозис', 'id']
# f_3: f_0 without 'Сред.зн.пика'.
f_3 = ['id', 'distance', 'Кол-во пиков', 'Медиана', 'Смещение', 
        'Ниж.квартиль', 'Верх.квартиль', 'Дисперсия', 'Куртозис', 'X0', 'X1', 'X2']
# NOTE(review): f_4 is identical to f_3 — confirm whether a different subset was intended.
f_4 = ['id', 'distance', 'Кол-во пиков', 'Медиана', 'Смещение', 
       'Ниж.квартиль', 'Верх.квартиль', 'Дисперсия', 'Куртозис', 'X0', 'X1', 'X2']

# Subsets built around 'Max', 'Куртозис' and 'Гарм.сред.' plus varying extra columns.
# Disabled subset, kept for reference:
# feature_3 = ['Max', 'Куртозис', 'Гарм.сред.', 'Std откл.', 'Асимметрия', 'Ниж.квартиль', 'Верх.квартиль', 'id'] 
feature_4 = ['Max', 'Куртозис', 'Гарм.сред.', 'Std откл.', 'Асимметрия', 'X0', 'X1', 'X2', 'id'] 
feature_5 = ['Max', 'Куртозис', 'Гарм.сред.', 'Std откл.', 'Асимметрия', 'distance', 'id'] 
feature_6 = ['Max', 'Куртозис', 'Гарм.сред.', 'Std откл.', 'Асимметрия', 'Кол-во пиков', 'Сред.зн.пика', 'id'] 
feature_7 = ['Max', 'Куртозис', 'Гарм.сред.', 'X0', 'X1', 'X2', 'distance', 'Кол-во пиков', 'Сред.зн.пика', 'id'] 
feature_8 = ['Max', 'Куртозис', 'Гарм.сред.', 'Std откл.', 'Асимметрия', 'X0', 'X1', 'X2', 'distance', 'id'] 

In [61]:
# Reference list of candidate estimators (nothing in this cell is executed):
# LogisticRegression(max_iter=5000),
# SVC(probability=True, random_state=0),                     # support vector machine
# RandomForestClassifier(n_estimators=300, random_state=42), # random forest
# ExtraTreesClassifier(n_estimators=300, random_state=42),    # extremely randomized trees
# GradientBoostingClassifier(n_estimators=200, random_state=42),
# KNeighborsClassifier(),                                    # k-nearest neighbours
# AdaBoostClassifier()

# XGBClassifier() # gradient boosting
# MLPClassifier() # multilayer perceptron

In [98]:
# Default meta-model (final estimator) passed to start() by the single-run cells.
model_LogisticRegression_1 = LogisticRegression(random_state=42)

# Candidate meta-models; the comparison loops pass each one to start() as the
# final estimator. random_state is fixed wherever the constructor call sets it.
models = [
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    RandomForestClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42),
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

In [102]:
# Base-model (level-0) line-ups handed to Stacking via start().
# Stacking only calls predict() on base models, so SVC(probability=True)
# is not used for its probabilities by the code in this notebook.

# Smallest line-up: two base models.
estimators_1 = [
    KNeighborsClassifier(),  
    ExtraTreesClassifier(n_estimators=300, random_state=42)  
]

# Six-model line-up with random_state=42 wherever a seed is set.
estimators_2 = [
    LogisticRegression(max_iter=5000, random_state=42),
    SVC(probability=True, random_state=42),
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42),
    KNeighborsClassifier(), 
    # AdaBoostClassifier(),
    # XGBClassifier(),
    # MLPClassifier()
]

# Like estimators_2 but without ExtraTreesClassifier; LogisticRegression has no
# random_state here and SVC uses random_state=0.
estimators_3 = [
    LogisticRegression(max_iter=5000),
    SVC(probability=True, random_state=0),                     
    RandomForestClassifier(n_estimators=300, random_state=42), 
    # ExtraTreesClassifier(n_estimators=300, random_state=42),    
    GradientBoostingClassifier(n_estimators=200, random_state=42),
    KNeighborsClassifier(), 
]


# Same as estimators_2 except for the LogisticRegression settings (C=0.1, max_iter=1000).
estimators_21 = [
    LogisticRegression(C=0.1, max_iter=1000, random_state=42),
    SVC(probability=True, random_state=42),
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42),
    KNeighborsClassifier(), 
    # AdaBoostClassifier(),
    # XGBClassifier(),
    # MLPClassifier()
]

# Main

### data_balance[f_0]

In [106]:
# Restrict data_balance to the f_0 columns, separate the 'id' target from the
# features and hold out 20% of the rows for testing.
df = data_balance[f_0]
y = df['id'].to_numpy()                 # target variable
X = df.drop(columns='id').to_numpy()    # feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape)

(3385, 12)


In [58]:
# estimators_1 base models + logistic-regression meta-model, scored on the
# X_train/X_test split currently held in the kernel.
start(X_train, X_test, y_train, y_test, estimators_1, model_LogisticRegression_1)

stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74


In [59]:
# Same run with the larger estimators_2 line-up of base models.
start(X_train, X_test, y_train, y_test, estimators_2, model_LogisticRegression_1)

stacking   acc: 76.03   f1: 0.77
blending   acc: 74.26   f1: 0.75


In [107]:
# Try every candidate meta-model on top of the estimators_1 base models.
# NOTE(review): the recorded output shows the same scores for almost every
# meta-model — presumably because two class-label predictions give the
# meta-model very few distinct input rows; confirm.
for model in models:
    print(type(model).__name__)
    start(X_train, X_test, y_train, y_test, estimators_1, model)

LogisticRegression
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
SVC
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
RandomForestClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
ExtraTreesClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
GradientBoostingClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
KNeighborsClassifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


stacking   acc: 74.38   f1: 0.75


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


blending   acc: 63.99   f1: 0.57
AdaBoostClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
XGBClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74
MLPClassifier
stacking   acc: 74.38   f1: 0.75
blending   acc: 73.67   f1: 0.74


### data_balance

In [64]:
# Use every column of data_balance: 'id' is the target, all other columns are
# features; 20% of the rows are held out for testing.
df = data_balance
y = df['id'].to_numpy()                 # target variable
X = df.drop(columns='id').to_numpy()    # feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape)

(3385, 26)


In [51]:
# estimators_1 base models + logistic-regression meta-model on the current split.
start(X_train, X_test, y_train, y_test, estimators_1, model_LogisticRegression_1)

stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77


In [90]:
# estimators_2 base models + logistic-regression meta-model on the current split.
start(X_train, X_test, y_train, y_test, estimators_2, model_LogisticRegression_1)

stacking   acc: 80.28   f1: 0.82
blending   acc: 78.98   f1: 0.80


In [103]:
# estimators_21: the estimators_2 line-up with LogisticRegression(C=0.1, max_iter=1000).
start(X_train, X_test, y_train, y_test, estimators_21, model_LogisticRegression_1)

stacking   acc: 80.28   f1: 0.82
blending   acc: 78.98   f1: 0.80


In [85]:
# estimators_3: line-up without ExtraTreesClassifier.
start(X_train, X_test, y_train, y_test, estimators_3, model_LogisticRegression_1)

stacking   acc: 79.81   f1: 0.81
blending   acc: 78.98   f1: 0.80


In [99]:
# Try every candidate meta-model on top of the estimators_1 base models.
# NOTE(review): as recorded, the scores are the same for almost every
# meta-model — presumably a consequence of having only two class-label
# meta-features; confirm.
for model in models:
    print(type(model).__name__)
    start(X_train, X_test, y_train, y_test, estimators_1, model)

LogisticRegression
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
SVC
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
RandomForestClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
ExtraTreesClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
GradientBoostingClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
KNeighborsClassifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


stacking   acc: 77.80   f1: 0.79


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


blending   acc: 62.57   f1: 0.55
AdaBoostClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
XGBClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77
MLPClassifier
stacking   acc: 77.80   f1: 0.79
blending   acc: 75.91   f1: 0.77


In [97]:
# Try every candidate meta-model on top of the six estimators_2 base models.
for model in models:
    print(type(model).__name__)
    start(X_train, X_test, y_train, y_test, estimators_2, model)

LogisticRegression
stacking   acc: 80.28   f1: 0.82
blending   acc: 78.98   f1: 0.80
SVC
stacking   acc: 79.81   f1: 0.81
blending   acc: 79.34   f1: 0.81
RandomForestClassifier
stacking   acc: 79.69   f1: 0.81
blending   acc: 77.80   f1: 0.79
ExtraTreesClassifier
stacking   acc: 79.57   f1: 0.81
blending   acc: 77.45   f1: 0.79
GradientBoostingClassifier
stacking   acc: 79.93   f1: 0.81
blending   acc: 78.04   f1: 0.80
KNeighborsClassifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


stacking   acc: 79.46   f1: 0.81


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


blending   acc: 76.74   f1: 0.79
AdaBoostClassifier
stacking   acc: 80.28   f1: 0.82
blending   acc: 79.22   f1: 0.81
XGBClassifier
stacking   acc: 79.57   f1: 0.81
blending   acc: 77.45   f1: 0.79
MLPClassifier
stacking   acc: 79.93   f1: 0.81
blending   acc: 79.22   f1: 0.81




### data

In [104]:
# Use every column of data: 'id' is the target, all other columns are
# features; 20% of the rows are held out for testing.
df = data
y = df['id'].to_numpy()                 # target variable
X = df.drop(columns='id').to_numpy()    # feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape)

(3972, 26)


In [56]:
# estimators_2 base models + logistic-regression meta-model on the current split.
start(X_train, X_test, y_train, y_test, estimators_2, model_LogisticRegression_1)

stacking   acc: 77.14   f1: 0.73
blending   acc: 75.23   f1: 0.70


In [105]:
# Try every candidate meta-model on top of the estimators_1 base models.
# NOTE(review): as recorded, the scores are the same for almost every
# meta-model — presumably a consequence of having only two class-label
# meta-features; confirm.
for model in models:
    print(type(model).__name__)
    start(X_train, X_test, y_train, y_test, estimators_1, model)

LogisticRegression
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
SVC
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
RandomForestClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
ExtraTreesClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
GradientBoostingClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
KNeighborsClassifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


stacking   acc: 76.33   f1: 0.72


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


blending   acc: 64.65   f1: 0.45
AdaBoostClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
XGBClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
MLPClassifier
stacking   acc: 76.33   f1: 0.72
blending   acc: 75.73   f1: 0.71
