In [None]:
!pip install scikit-learn==0.24.2

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [2]:
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [3]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [4]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [5]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [6]:
# 1

stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(penalty='l1', C=0.001, solver='saga', multi_class='ovr', max_iter=2000, random_state=42),
    LogisticRegression(penalty='l2', C=0.001, solver='saga', multi_class='multinomial', max_iter=2000, random_state=42),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42)
], X_train, X_test, y_train, cv)

clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 4/4 [03:00<00:00, 45.04s/it]


0.978082

In [7]:
import sklearn

sklearn.__version__

'0.24.2'

In [8]:
# 2

stacked_features_train, stacked_features_test = generate_meta_features([ 
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=42)], 
    X_train, X_test, y_train, cv)

clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:17<00:00,  8.84s/it]


0.982421

In [10]:
# 3

stacked_features_train, stacked_features_test = generate_meta_features([ 
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)], 
    X_train, X_test, y_train, cv)

clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:08<00:00,  4.44s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.989904

In [11]:
# 4 

stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(penalty='l1', C=0.001, solver='saga', multi_class='ovr', max_iter=2000, random_state=42),
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42), 
    AdaBoostClassifier()
], X_train, X_test, y_train, cv)

clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 4/4 [00:52<00:00, 13.11s/it]


0.987404

In [12]:
# 5

def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features([ 
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)], 
    X_train, X_test, y_train, cv)

clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:19<00:00,  9.74s/it]


0.983918