In [1]:
import pandas as pd
import numpy as np
import re
import time

In [2]:
from catboost import Pool, CatBoostRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor, StackingClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, StratifiedKFold, cross_val_predict, RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR, LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor
from tqdm import tqdm
from xgboost import XGBRegressor

In [3]:
class Base:
    """Shared constructor and fitting logic for the model-collection classes.

    A subclass is expected to assign ``self.all_estimators`` (a mapping of
    method name -> unfitted estimator) before calling ``Base.__init__``,
    unless ``methods`` is itself such a mapping.
    """

    def __init__(self, methods='all'):
        """Resolve which estimators this collection works with.

        Parameters
        ----------
        methods : 'all', str, dict or iterable of str, default 'all'
            * ``'all'`` -- use every key of ``self.all_estimators``.
            * any other string -- use that single method name.
            * dict -- replaces ``self.all_estimators``; all its keys are used.
            * any other iterable (list, tuple, ``pd.Index``, NumPy array) --
              the method names to use, in iteration order.
        """
        # Test for ``str`` *before* comparing with 'all': comparing an
        # array-like (e.g. ``val_scores.index[:3]``) with a string yields an
        # element-wise boolean array whose truth value is ambiguous.
        if isinstance(methods, str):
            if methods == 'all':
                self.methods = list(self.all_estimators.keys())
            else:
                self.methods = [methods]

        elif isinstance(methods, dict):
            self.methods = list(methods.keys())
            self.all_estimators = methods

        else:
            self.methods = list(methods)

    def fit(self, X, y):
        """Fit a fresh clone of every selected estimator on ``(X, y)``.

        The templates in ``self.all_estimators`` are never fitted themselves;
        each one is cloned first.

        Returns
        -------
        dict
            Method name -> fitted estimator. The same dict is stored as
            ``self.estimators_``.

        Raises
        ------
        KeyError
            If a selected method name is not a key of ``self.all_estimators``.
        """
        self.estimators_ = dict()
        for method in tqdm(self.methods):
            est = clone(self.all_estimators[method])
            est.fit(X, y)
            self.estimators_[method] = est

        return self.estimators_

In [4]:
class Clf:
    """Mixin that cross-validates the host's estimators with ROC AUC.

    The host object must provide ``self.methods`` (method names) and
    ``self.all_estimators`` (method name -> unfitted estimator).
    """

    def val(self, X, y):
        """Score every selected method with repeated 4-fold cross-validation.

        Each estimator is cloned and evaluated with ``scoring='roc_auc'`` on a
        ``RepeatedKFold`` (4 splits, 2 repeats, fixed seed 13). The mean score,
        rounded to 4 decimals, is recorded per method and the elapsed time is
        printed.

        Returns
        -------
        pandas.Series
            Scores named ``'val_scores'``, sorted from highest to lowest; also
            stored as ``self.val_scores``.

        Notes
        -----
        The ``return`` sits in a ``finally`` clause, so any exception raised
        while validating (``KeyboardInterrupt`` included) is discarded and the
        scores gathered so far are returned instead.
        """
        collected = {}
        try:
            progress = tqdm(self.methods)
            for method in progress:
                progress.set_description(desc=f'Validate {method}',
                                         refresh=False)
                started = time.time()

                splitter = RepeatedKFold(n_splits=4,
                                         n_repeats=2,
                                         random_state=13)
                fold_scores = cross_val_score(
                    clone(self.all_estimators[method]),
                    X,
                    y,
                    n_jobs=-1,
                    cv=splitter,
                    scoring='roc_auc')

                collected[method] = round(np.mean(fold_scores), 4)
                progress.set_postfix(score=collected[method], refresh=False)

                elapsed = time.time() - started
                print(f'{method} validated in', elapsed)
        finally:
            ranking = pd.Series(collected, name='val_scores', dtype='float64')
            self.val_scores = ranking.sort_values(ascending=False)
            return self.val_scores

In [5]:
class Reg:
    """Mixin adding RMSE cross-validation and batch prediction for regressors.

    The host object must provide ``self.methods`` and ``self.all_estimators``
    for ``val``, and ``self.estimators_`` (method name -> fitted estimator)
    for ``predict``.
    """

    def val(self, X, y):
        """Score every selected method with repeated 4-fold cross-validation.

        Each estimator is cloned and evaluated with
        ``scoring='neg_root_mean_squared_error'`` on a ``RepeatedKFold``
        (4 splits, 2 repeats, fixed seed 13). The sign is flipped so the stored
        value is a positive RMSE, rounded to 4 decimals.

        Returns
        -------
        pandas.Series
            RMSE per method, named ``'val_scores'`` and sorted in descending
            order (largest error first); also stored as ``self.val_scores``.

        Notes
        -----
        The ``return`` sits in a ``finally`` clause, so any exception raised
        while validating (``KeyboardInterrupt`` included) is discarded and the
        scores gathered so far are returned instead.
        """
        try:
            scores = dict()
            pb = tqdm(self.methods, leave=False)
            for method in pb:
                pb.set_description(desc=f'Validate {method}', refresh=False)
                start_time = time.time()
                cv = cross_val_score(clone(self.all_estimators[method]),
                                     X,
                                     y,
                                     n_jobs=-1,
                                     cv=RepeatedKFold(n_splits=4,
                                                      n_repeats=2,
                                                      random_state=13),
                                     scoring='neg_root_mean_squared_error')

                # The scorer reports negated RMSE; negate back before averaging.
                scores[method] = round(np.mean(-cv), 4)
                pb.set_postfix(score=scores[method], refresh=False)
                # Elapsed seconds rounded to 3 decimals. The ``3`` used to sit
                # outside ``round(...)``, which rounded to a whole number and
                # printed a stray "3" after it.
                print(f'{method} validated in',
                      round(time.time() - start_time, 3))
        finally:
            self.val_scores = pd.Series(
                scores, name='val_scores',
                dtype='float64').sort_values(ascending=False)
            return (self.val_scores)

    def predict(self, X, transform=None):
        """Predict with every fitted estimator and collect the results.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature rows; its index becomes the index of the result.
        transform : {'exp', 'expm1'} or None, default None
            Inverse transform applied to the raw predictions (``np.exp`` or
            ``np.expm1``). Any other value leaves predictions untouched.

        Returns
        -------
        pandas.DataFrame
            One column per method in ``self.methods``.
        """
        if transform == 'exp':
            inverse = np.exp
        elif transform == 'expm1':
            inverse = np.expm1
        else:
            inverse = None

        predicts = pd.DataFrame(index=X.index)

        for method in tqdm(self.methods):
            raw = self.estimators_[method].predict(X)
            predicts[method] = raw if inverse is None else inverse(raw)

        return (predicts)

In [113]:
class Lin_Clf(Base, Clf):
    """Collection of single-model classifiers (linear, SVM and single-tree)."""

    def __init__(self, methods='all', **prms):
        """Build the estimator registry and select the methods to use.

        Parameters
        ----------
        methods : str, dict or iterable, default 'all'
            Passed to ``Base.__init__``. Registered names are 'log', 'ridge',
            'dt', 'et', 'passagr', 'lsvm', 'svm', 'perceptron' and 'sgd';
            note that 'dt' and 'et' are single decision/extra trees rather
            than linear models.
        **prms
            Extra keyword arguments forwarded to *every* estimator
            constructor, so each one must be a keyword all of them accept.
        """
        seed = 13
        patience = 10
        many_iters = 1000000

        registry = dict()
        registry['log'] = LogisticRegression(random_state=seed,
                                             n_jobs=-1,
                                             **prms)
        registry['ridge'] = RidgeClassifier(solver='saga',
                                            max_iter=many_iters,
                                            random_state=seed,
                                            **prms)
        registry['dt'] = DecisionTreeClassifier(random_state=seed, **prms)
        registry['et'] = ExtraTreeClassifier(random_state=seed, **prms)
        registry['passagr'] = PassiveAggressiveClassifier(
            max_iter=many_iters,
            n_jobs=-1,
            random_state=seed,
            n_iter_no_change=patience,
            **prms)
        registry['lsvm'] = LinearSVC(max_iter=100, random_state=seed, **prms)
        registry['svm'] = SVC(max_iter=100, random_state=seed, **prms)
        registry['perceptron'] = Perceptron(n_jobs=-1,
                                            random_state=seed,
                                            n_iter_no_change=patience,
                                            **prms)
        registry['sgd'] = SGDClassifier(n_jobs=-1,
                                        random_state=seed,
                                        n_iter_no_change=patience,
                                        **prms)
        # Currently disabled:
        # registry['knn'] = KNeighborsClassifier(n_jobs=-1, **prms)

        self.all_estimators = registry
        Base.__init__(self, methods)

In [114]:
class Ensembles_Clf(Base, Clf):
    """Collection of tree-ensemble classifiers."""

    def __init__(self, methods='all', **prms):
        """Build the estimator registry and select the methods to use.

        Parameters
        ----------
        methods : str, dict or iterable, default 'all'
            Passed to ``Base.__init__``. Registered names are 'rf', 'et',
            'gbm' and 'lgbm'.
        **prms
            Extra keyword arguments forwarded to *every* estimator
            constructor, so each one must be a keyword all of them accept.
        """
        # Settings every ensemble shares, and the ones only the parallel
        # implementations take.
        common = {'n_estimators': 100, 'random_state': 13}
        parallel = {'n_jobs': -1}

        self.all_estimators = {
            'rf': RandomForestClassifier(**common, **parallel, **prms),
            'et': ExtraTreesClassifier(**common, **parallel, **prms),
            'gbm': GradientBoostingClassifier(n_iter_no_change=10,
                                              **common,
                                              **prms),
            'lgbm': LGBMClassifier(**common, **parallel, **prms),
        }

        Base.__init__(self, methods)

In [8]:
class Lin_Reg(Reg, Base):
    """Collection of linear, kernel and SVM regressors."""

    def __init__(self, methods='all', **prms):
        """Build the estimator registry and select the methods to use.

        Parameters
        ----------
        methods : str, dict or iterable, default 'all'
            Passed to ``Base.__init__``. Registered names are 'linreg',
            'ridge', 'lasso', 'elastic', 'huber', 'bayes', 'kridge', 'lsvm'
            and 'svm'.
        **prms
            Extra keyword arguments forwarded to *every* estimator
            constructor, so each one must be a keyword all of them accept.
        """
        # BayesianRidge's iteration-cap keyword is ``max_iter`` in newer
        # scikit-learn releases and ``n_iter`` in older ones; pick whichever
        # the installed version exposes as a constructor parameter.
        bayes_iter_kw = ('max_iter'
                         if 'max_iter' in BayesianRidge().get_params()
                         else 'n_iter')

        self.all_estimators = {
            'linreg': LinearRegression(n_jobs=-1,
                                       **prms),
            'ridge': Ridge(random_state=13,
                           **prms),
            'lasso': Lasso(max_iter=1000000,
                           random_state=13,
                           **prms),
            'elastic': ElasticNet(max_iter=1000000,
                                  random_state=13,
                                  **prms),
            'huber': HuberRegressor(max_iter=1000,
                                    **prms),
            'bayes': BayesianRidge(**{bayes_iter_kw: 1000000},
                                   **prms),
            'kridge': KernelRidge(**prms),
            'lsvm': LinearSVR(max_iter=1000,
                              random_state=13, **prms),
            'svm': SVR(max_iter=1000, **prms),
            # Currently disabled:
            # 'knn': KNeighborsRegressor(n_jobs=-1, **prms),
        }

        # ``Base.__init__`` only takes ``methods``; forwarding ``**prms`` to it
        # raised TypeError whenever any extra keyword was supplied.
        Base.__init__(self, methods)

In [None]:
class Ensembles_Reg(Base, Reg):
    """Collection of boosting, forest, single-tree and MLP regressors."""

    def __init__(self, methods='all', **prms):
        """Build the estimator registry and select the methods to use.

        Parameters
        ----------
        methods : str, dict or iterable, default 'all'
            Passed to ``Base.__init__``. Registered names are 'gbm', 'lgbm',
            'cat', 'xgb', 'rf', 'dt', 'et', 'ets' and 'mlp'.
        **prms
            Extra keyword arguments forwarded to *every* estimator
            constructor, so each one must be a keyword all of them accept.

        Notes
        -----
        ``RandomForestRegressor`` (``sklearn.ensemble``) and ``MLPRegressor``
        (``sklearn.neural_network``) must be imported at module level;
        otherwise building the registry fails with ``NameError``.
        """
        self.all_estimators = {
            'gbm': GradientBoostingRegressor(n_estimators=10000,
                                             random_state=13,
                                             n_iter_no_change=10,
                                             **prms),
            'lgbm': LGBMRegressor(n_estimators=1000,
                                  random_state=13,
                                  n_jobs=-1,
                                  **prms),
            'cat': CatBoostRegressor(n_estimators=300,
                                     random_state=13,
                                     early_stopping_rounds=10,
                                     **prms),
            'xgb': XGBRegressor(n_estimators=500,
                                verbosity=0,
                                n_jobs=-1,
                                random_state=13,
                                **prms),
            'rf': RandomForestRegressor(n_estimators=300,
                                        n_jobs=-1,
                                        random_state=13,
                                        **prms),
            'dt': DecisionTreeRegressor(random_state=13,
                                        **prms),
            # 'et' is a single extra tree; 'ets' is the extra-trees ensemble.
            'et': ExtraTreeRegressor(random_state=13, **prms),
            'ets': ExtraTreesRegressor(n_estimators=300,
                                       n_jobs=-1,
                                       random_state=13,
                                       **prms),
            'mlp': MLPRegressor(max_iter=500,
                                early_stopping=True,
                                n_iter_no_change=10,
                                random_state=13,
                                **prms)
        }

        Base.__init__(self, methods)

In [None]:
class StackingAveragedModels():
    """Stacking ensemble trained on out-of-fold base-model predictions.

    ``fit`` trains a clone of every base model on each K-fold training split,
    records its predictions for the held-out rows, and fits the meta model on
    those out-of-fold predictions. ``predict`` averages the fold clones of
    each base model and feeds the averages to the meta model.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        """
        Parameters
        ----------
        base_models : dict or iterable of estimators
            A dict is used as-is (name -> estimator). Any other iterable
            (list, tuple, ...) is keyed by position ``0..k-1``.
        meta_model : estimator or dict
            The final estimator. If a dict is given, its most recently
            inserted value is used; the dict itself is left unmodified.
        n_folds : int, default 5
            Number of shuffled K-fold splits (fixed seed 13).

        Raises
        ------
        KeyError
            If ``meta_model`` is an empty dict.
        """
        if isinstance(base_models, dict):
            self.base_models = base_models
        else:
            self.base_models = dict(enumerate(base_models))

        if isinstance(meta_model, dict):
            if not meta_model:
                raise KeyError('meta_model dictionary is empty')
            # Same element ``popitem()`` would return, without removing it
            # from the caller's dict.
            meta_model = list(meta_model.values())[-1]
        self.meta_model = clone(meta_model)
        self._kf = KFold(n_splits=n_folds, shuffle=True, random_state=13)

        # One independent list per base model. ``dict.fromkeys(keys, list())``
        # would make every key point at the *same* list object.
        self.all_models = {name: [] for name in self.base_models}

    def fit(self, X, y):
        """Fit the fold clones of every base model, then the meta model.

        Parameters
        ----------
        X : pandas.DataFrame
        y : pandas.Series
            Both are sliced positionally with ``.iloc``.

        Returns
        -------
        self
        """
        # Start clean so that refitting does not pile new fold models on top
        # of those from a previous call.
        self.all_models = {name: [] for name in self.base_models}

        # Float frame (not object dtype) so that meta models which reject
        # object columns can consume it.
        preds = pd.DataFrame(index=range(len(X)),
                             columns=list(self.base_models),
                             dtype='float64')
        for name, model in tqdm(self.base_models.items()):
            for train_index, holdout_index in self._kf.split(X, y):
                instance = clone(model)
                instance.fit(X.iloc[train_index], y.iloc[train_index])
                self.all_models[name].append(instance)
                # Row labels of ``preds`` are 0..n-1, so positional hold-out
                # indices can be used as labels here.
                preds.loc[holdout_index, name] = instance.predict(
                    X.iloc[holdout_index])

        self.meta_model.fit(preds, y)
        return self

    def predict(self, X):
        """Predict with the meta model on fold-averaged base predictions.

        The meta features are passed as a DataFrame with the same column
        labels (and order) the meta model saw during ``fit``.
        """
        meta_features = pd.DataFrame({
            name: np.column_stack([model.predict(X)
                                   for model in fold_models]).mean(axis=1)
            for name, fold_models in self.all_models.items()
        })
        return self.meta_model.predict(meta_features)