In [109]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [110]:
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [111]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [133]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """    Эта функция подсчитывает признаки для мета-классификатора.   
    Они являются вероятностями классов при решении задачи многоклассовой классификации. 
    :arg clf: классификатор    
    :args X_train, y_train: обучающая выборка   
    :arg X_test: признаки тестовой выборки   
    :arg cv: класс, генерирующий фолды (KFold) 
    :returns X_meta_train, X_meta_test: новые признаки для обучающей и тестовой выборок    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)

    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)

    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [98]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    """
    Эта функция подсчитывает признаки для мета-классификатора. 
    Они являются вероятностями классов при решении задачи многоклассовой классификации.

    :arg clf: классификатор
    :args X_train, y_train: обучающая выборка
    :arg X_test: признаки тестовой выборки
    :arg cv: класс, генерирующий фолды (KFold)

    :returns X_meta_train, X_meta_test: новые признаки для обучающей и тестовой выборок
    """
# Напишите ваш код ниже
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        X_meta_test += folded_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test/cv.n_splits

In [99]:
dtc = DecisionTreeClassifier()

In [100]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [101]:
accuracy_score(dtc.predict(X_test), y_test)

0.8611111111111112

In [113]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [114]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [104]:
models = [
    LogisticRegression(penalty='l1', C=0.001, solver='saga', multi_class='ovr', max_iter=2000, random_state=42, n_jobs=-1),
    LogisticRegression(penalty='l2', C=0.001, solver='saga', multi_class='multinomial', max_iter=2000, random_state=42, n_jobs=-1),
    RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    GradientBoostingClassifier(n_estimators=200, random_state=42)
]

In [66]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  7.16it/s][A
 50%|█████     | 2/4 [00:01<00:02,  1.02s/it][A
 75%|███████▌  | 3/4 [00:06<00:02,  2.93s/it][A
100%|██████████| 4/4 [01:50<00:00, 27.63s/it][A


In [67]:
total_X_train = np.hstack([X_train, stacked_features_train])
total_X_test = np.hstack([X_test, stacked_features_test])

In [90]:
lr = LogisticRegression(multi_class='auto', solver='lbfgs', random_state=42)

In [69]:
compute_metric(lr, total_X_train, y_train, total_X_test)

0.984559

In [115]:
models = [
    RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
]

In [116]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:05<00:05,  5.17s/it][A
100%|██████████| 2/2 [00:07<00:00,  3.94s/it][A


In [117]:
total_X_train = np.hstack([X_train, stacked_features_train])
total_X_test = np.hstack([X_test, stacked_features_test])

In [118]:
compute_metric(lr, total_X_train, y_train, total_X_test)

0.973967

In [119]:
models = [
    KNeighborsClassifier(n_jobs=-1),
    ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
]

In [120]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  5.33it/s][A
100%|██████████| 2/2 [00:02<00:00,  1.48s/it][A


In [121]:
total_X_train = np.hstack([X_train, stacked_features_train])
total_X_test = np.hstack([X_test, stacked_features_test])

In [123]:
compute_metric(lr, stacked_features_train, y_train, stacked_features_test)

0.98502

In [124]:
models = [
    LogisticRegression(penalty='l1', C=0.001, solver='saga', multi_class='ovr', max_iter=2000, random_state=42, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    AdaBoostClassifier(random_state=42),
    KNeighborsClassifier(n_jobs=-1)
]

In [125]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  6.93it/s][A
 50%|█████     | 2/4 [00:04<00:04,  2.43s/it][A
 75%|███████▌  | 3/4 [00:05<00:01,  1.99s/it][A
100%|██████████| 4/4 [00:05<00:00,  1.49s/it][A


In [126]:
compute_metric(lr, stacked_features_train, y_train, stacked_features_test)

0.98502

In [139]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [141]:
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)

In [145]:
knc = KNeighborsClassifier(n_jobs=-1)

In [148]:
gbc = GradientBoostingClassifier(random_state=42)

In [153]:
etc = ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)

In [132]:
models = [
    RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1)
]

In [142]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.82s/it][A
100%|██████████| 2/2 [00:05<00:00,  2.52s/it][A


In [149]:
compute_metric(gbc, stacked_features_train, y_train, stacked_features_test)

0.984925

In [154]:
models = [
    RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, random_state=42, n_jobs=-1),
    ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1)
]

In [155]:
stacked_features_train, stacked_features_test = generate_meta_features(models, X_train, X_test, y_train, cv)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.94s/it][A
100%|██████████| 2/2 [00:05<00:00,  2.60s/it][A


In [156]:
compute_metric(gbc, stacked_features_train, y_train, stacked_features_test)

0.984925

In [157]:
new_rfc = RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, random_state=42, n_jobs=-1).fit(X_train, y_train)

In [158]:
new_etc = ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1).fit(X_train, y_train)

In [159]:
new_lr = LogisticRegression(random_state=42).fit(X_train, y_train)

In [162]:
pred1 = new_rfc.predict(X_test)
pred2 = new_etc.predict(X_test)
pred3 = new_lr.predict(X_test)

In [189]:
np.vstack([pred1, pred2, pred3]).argmax(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [191]:
(pred1+pred2+pred3).argmax(axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1