In [14]:
from abc import ABC, abstractmethod
from typing import Optional, Tuple
import warnings

import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, check_X_y, check_X
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator

from sklearn.linear_model import LinearRegression

ImportError: cannot import name 'check_X' from 'sklearn.utils' (C:\Users\rdas6\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\__init__.py)

In [2]:
nsamples = 100
data = pd.DataFrame(
    columns=["v1", "v2", "t"],
    index=pd.date_range(start="2023-01-01", freq="D", periods=nsamples),
)

data["v1"] = np.arange(1, nsamples + 1)
data["v2"] = data["v1"] + 0.1
data["t"] = data["v1"] + 0.01

train_data = data.head(80)
test_data = data.tail(20)

train_y_d = train_data["t"].copy(deep=True)
train_x_d = train_data[["v1", "v2"]].copy(deep=True)

test_y_d = test_data["t"].copy(deep=True)
test_x_d = test_data[["v1", "v2"]].copy(deep=True)

print(test_data.shape)

print(train_data.tail(), end="\n\n")

print(test_data.head())

(20, 3)
            v1    v2      t
2023-03-17  76  76.1  76.01
2023-03-18  77  77.1  77.01
2023-03-19  78  78.1  78.01
2023-03-20  79  79.1  79.01
2023-03-21  80  80.1  80.01

            v1    v2      t
2023-03-22  81  81.1  81.01
2023-03-23  82  82.1  82.01
2023-03-24  83  83.1  83.01
2023-03-25  84  84.1  84.01
2023-03-26  85  85.1  85.01


In [None]:
LinearRegression().fit(train_x_d, train_y_d).predict(test_x_d[["v2", "v1"]])

In [5]:
class InputGuard(BaseEstimator, TransformerMixin):
    """
    Verify column names at predict time match the ones used when fitting

    Parameters
    ----------
    strict : bool, optional
        If True, it will raise an error if the input does not match
        exactly (same columns, same order), if False, it will ignore
        order and extra columns (will only show a warning), defaults
        to True

    Notes
    -----
    Must be used in a Pipeline object and must be the first step. fit
    and predict should be called with a pandas.DataFrame object
    """

    def __init__(self, strict=True):
        self.strict = strict

    def fit(self, X, y=None):
        X_out, y = check_X_y(X, y)
        X = X if isinstance(X, pd.DataFrame) and hasattr(X, "columns") else X_out

        # our estimator is designed to work on structures
        # that have a columns attribute (such as pandas Data Frame)
        if isinstance(X, pd.DataFrame) and hasattr(X, "columns"):
            self.expected_ = list(X.columns)
            self.n_features_ = X.shape[1]
            self.n_features_in_ = X.shape[1]

        # ...but we still need to support numpy.arrays to
        # pass check_estimator
        else:
            self.expected_ = None
            self.n_features_ = X.shape[1]
            self.n_features_in_ = X.shape[1]
            warnings.warn(
                "Input does not have a columns attribute, "
                "only number of columns will be validated"
            )
        self.is_fitted_ = True
        return self

    def get_feature_names_out(self):
        return self.expected_

    def transform(self, X):
        # these two are to pass check_estimator
        check_is_fitted(self)
        X_out = check_array(X)
        X = X if isinstance(X, pd.DataFrame) and hasattr(X, "columns") else X_out

        # if column names are available...
        if self.expected_:
            return self._transform(X)
        else:
            # this is raised to pass check_estimator
            if self.n_features_ != X.shape[1]:
                raise ValueError(
                    f"Number of columns from fit {self.n_features_} \
                                   is different from transform {X.shape[1]}"
                )

            return X

    def _transform(self, X):
        # this function implements our core logic and it
        # will only be called when fit received an X with a columns attribute

        if not hasattr(X, "columns"):
            raise ValueError(
                f"{type(self).__name__}.fit ran with a X object that had \
                             a columns attribute, but the current \
                             X does not have it"
            )

        columns_got = list(X.columns)

        missing = set(self.expected_) - set(columns_got)
        extra = set(columns_got) - set(self.expected_)

        if missing:
            raise ValueError(
                f"Columns during fit were: {self.expected_}, but got {columns_got} \
                               for predict. Missing: {missing}"
            )
        if extra:
            warnings.warn(f"Got extra columns: {extra}, ignoring")

        if self.expected_ != columns_got:
            warnings.warn(
                f"Columns during fit were: {self.expected_}, but got {columns_got} for predict"
            )
            return X[self.expected_]
        return X

In [6]:
o = InputGuard()
check_estimator(InputGuard())



In [7]:
class IndexGuard(BaseEstimator, TransformerMixin):
    """
    Verify column names at predict time match the ones used when fitting

    Parameters
    ----------
    strict : bool, optional
        If True, it will raise an error if the input does not match
        exactly (same columns, same order), if False, it will ignore
        order and extra columns (will only show a warning), defaults
        to True

    Notes
    -----
    Must be used in a Pipeline object and must be the first step. fit
    and predict should be called with a pandas.DataFrame object
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.expected_ = list(X.columns)
        # self.n_features_ = X.shape[1]
        # self.n_features_in_  = X.shape[1]
        return self

    def transform(self, X):
        columns_got = list(X.columns)

        missing = set(self.expected_) - set(columns_got)
        extra = set(columns_got) - set(self.expected_)

        if missing:
            raise ValueError(
                f"Columns during fit were: {self.expected_}, but got {columns_got} \
                               for predict. Missing: {missing}"
            )
        if extra:
            warnings.warn(f"Got extra columns: {extra}, ignoring")

        if self.expected_ != columns_got:
            warnings.warn(
                f"Columns during fit were: {self.expected_}, but got {columns_got} for predict"
            )
            return X[self.expected_]
        return X

In [8]:
o = IndexGuard()
check_estimator(IndexGuard())

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [9]:
from feature_engine.datetime import DatetimeFeatures


dtf = DatetimeFeatures(features_to_extract=["year", "month", "day_of_month"])

check_estimator(dtf)

ValueError: No datetime variables found in this dataframe.

In [10]:
from feature_engine.timeseries.forecasting import (
    ExpandingWindowFeatures,
    LagFeatures,
    WindowFeatures,
)

In [11]:
check_estimator(LagFeatures(missing_values="ignore"))

