# Import Libraries

In [3]:
from .synthetic_dataset_generation import generate_cross_sectional_dataset, generate_multivariate_time_series_dataset
from .stability_weighted_ensemble_feature_importance import SWEFI

ImportError: attempted relative import with no known parent package

# Synthetic Dataset Generation 

In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

def generate_ar_time_series(n, ar_params, ma_params, sigma=1.0):
    """
    Generate a sample path of an ARMA(p, q) process.

    Despite its name, the function simulates a full ARMA process; pass an
    empty list as ``ma_params`` to obtain a pure autoregressive series.

    Parameters:
    - n: Length of the time series
    - ar_params: AR parameters [phi1, phi2, ..., phi_p] (a scalar is accepted)
    - ma_params: MA parameters [theta1, theta2, ..., theta_q] (a scalar is accepted)
    - sigma: Standard deviation of the white noise

    Returns:
    - time_series: Generated ARMA time series of length n
    """
    # Coerce scalars and lists alike to 1-D float arrays.
    ar_coefficients = np.atleast_1d(np.asarray(ar_params, dtype=float))
    ma_coefficients = np.atleast_1d(np.asarray(ma_params, dtype=float))

    # ArmaProcess takes lag polynomials including the zero lag; the AR
    # coefficients enter with their sign flipped.
    ar = np.r_[1, -ar_coefficients]
    ma = np.r_[1, ma_coefficients]

    arma_process = sm.tsa.ArmaProcess(ar, ma)
    return arma_process.generate_sample(nsample=n, scale=sigma)


def create_lagged_data(time_series, k):
    """
    Build a 2-D array holding the time series and its first k lags.

    Column 0 is the series itself and column i (1 <= i <= k) is the series
    delayed by i steps. The delay is produced with ``np.roll``, which wraps
    values around instead of inserting NaN, so the first k rows (the ones
    contaminated by wrapped values) are dropped.

    Parameters:
    - time_series: The generated time series (1-D sequence)
    - k: Number of lags

    Returns:
    - NumPy array of shape (len(time_series) - k, k + 1)
    """
    series = np.asarray(time_series)
    columns = [series]
    columns.extend(np.roll(series, lag) for lag in range(1, k + 1))

    # Discard the leading rows whose lag columns contain wrapped-around values.
    return np.column_stack(columns)[k:]

def generate_cross_sectional_dataset(
    n_informative: int = 5,
    n_redundant: int = 25,
    n_noise: int = 5,
    n_samples: int = 10000,
    random_state: int = 41,
    sigma_std: float = 0.1,
    n_clusters_per_class: int = 2,
) -> tuple:
    """
    Generate a synthetic classification dataset with labelled feature roles.

    ``make_classification`` is asked for ``n_informative + n_noise`` columns
    (no redundant ones); these are named ``I*`` and ``N*``. Each redundant
    column ``R{k} (from I{j})`` is then a randomly chosen informative column
    plus Gaussian jitter scaled by ``sigma_std``.

    Returns:
    - (X, y): feature DataFrame and label Series
    """
    # Seed the global generator used for the redundant-feature draws below.
    np.random.seed(random_state)

    base_feature_count = n_informative + n_noise
    features, labels = Datasets.make_classification(
        n_samples=n_samples,
        n_features=base_feature_count,
        n_informative=n_informative,
        n_redundant=0,
        shuffle=False,
        random_state=random_state,
        n_clusters_per_class=n_clusters_per_class,
    )

    base_names = [f"I{idx}" for idx in range(n_informative)]
    base_names.extend(f"N{idx}" for idx in range(n_noise))
    X = pd.DataFrame(features, columns=base_names)
    y = pd.Series(labels)

    # Pick, with replacement, the informative column each redundant one copies.
    source_ids = np.random.choice(range(n_informative), size=n_redundant)
    for redundant_id, source_id in enumerate(source_ids):
        jitter = np.random.normal(size=X.shape[0]) * sigma_std
        X[f"R{redundant_id} (from I{source_id})"] = X[f"I{source_id}"] + jitter

    return X, y

def generate_multivariate_time_series_dataset(
    n_informative: int = 10,  # number of informative features
    n_redundant: int = 10,  # number of redundant features
    n_noise: int = 20,  # number of noise features
    n_samples: int = 10000,  # number of samples to generate
    n_time_steps: int = 2,  # current value plus (n_time_steps - 1) lags per feature
    random_state: int = 41,  # seeds only the informative -> redundant mapping
    sigma_std: float = 0.1,  # std of the noise added to redundant features
    time_series_params=None,  # [ar_params, ma_params, ...]; defaults to [[0.3], [0.3], 0.1]
):
    """
    Generate a synthetic lagged multivariate time-series classification dataset.

    Every feature is an ARMA series expanded into ``n_time_steps`` columns
    (the value and its lags). Redundant features are noisy copies of randomly
    chosen informative features; noise features are independent series built
    with fixed coefficients 0.5 / 0.5. The label is 1 when the sum of the
    standardised informative columns is positive, else 0.

    Parameters:
    - time_series_params: sequence whose first two entries are the AR and MA
      coefficients of the informative series; any further entries are not
      read here. ``None`` selects ``[[0.3], [0.3], 0.1]``.
    - random_state: seeds the choice of which informative feature each
      redundant feature copies. The global NumPy generator is re-seeded with
      ``np.random.seed()`` right afterwards, and the series themselves are
      not seeded by this function.

    Returns:
    - (X, y): DataFrame with columns ``"<name> lag<j>"`` and a 0/1 Series
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if time_series_params is None:
        time_series_params = [[0.3], [0.3], 0.1]

    n_lag = n_time_steps - 1

    informative_features = np.zeros(shape=(n_samples, n_informative, n_time_steps))
    for k in range(n_informative):
        # n_lag extra points are generated because create_lagged_data drops them.
        time_series = generate_ar_time_series(n_samples + n_lag, time_series_params[0], time_series_params[1], 1)
        informative_features[:, k, :] = create_lagged_data(time_series, n_lag)

    # Only the informative -> redundant assignment is made reproducible.
    np.random.seed(random_state)
    redundant_sources = np.random.choice(range(n_informative), size=n_redundant)
    np.random.seed()

    linear_redundant_features = np.zeros((n_samples, n_redundant, n_time_steps))
    linear_redundant_features_from_ = []

    for k, j in enumerate(redundant_sources):
        linear_redundant_features[:, k, :] = informative_features[:, j, :] + np.random.normal(size=(n_samples, n_time_steps)) * sigma_std
        linear_redundant_features_from_.append(j)

    # Generate noise features
    noise_features = np.zeros(shape=(n_samples, n_noise, n_time_steps))
    for k in range(n_noise):
        time_series = generate_ar_time_series(n_samples + n_lag, 0.5, 0.5, 1)
        noise_features[:, k, :] = create_lagged_data(time_series, n_lag)

    # Concatenate all features along the feature axis
    X = np.concatenate([
        informative_features,
        linear_redundant_features,
        noise_features],
        axis=1)

    # Column order matches the row-major flattening below: feature-major, lag-minor.
    columns = [f"I{i} lag{j}" for i in range(n_informative) for j in range(n_time_steps)]
    columns += [f"R{i} from I{k} lag{j}" for i, k in enumerate(linear_redundant_features_from_) for j in range(n_time_steps)]
    columns += [f"N{i} lag{j}" for i in range(n_noise) for j in range(n_time_steps)]

    X = X.reshape((n_samples, -1))
    X = pd.DataFrame(X, columns=columns)

    informative_block = X.iloc[:, :n_informative * n_time_steps]
    informative_block = (informative_block - informative_block.mean()) / informative_block.std()
    weights = np.ones(shape=len(informative_block.columns))

    # Equal-weight linear combination of the standardised informative columns
    linear_combination = informative_block @ weights
    # Deterministic binary target: sign of the linear combination
    y = (linear_combination > 0).astype(int)

    y = pd.Series(y)

    return X, y



# Feature Importance Methods Implementation

In [14]:
def feature_importance_linear_models(
    classifier,
    X, y,
) -> pd.Series:
    """
    Coefficient-based feature importance for a linear classifier.

    Fits ``classifier`` on (X, y), averages the absolute coefficients over
    the rows of ``coef_`` and min-max scales the result to [0, 1].

    Returns:
    - Series of scaled importances indexed by ``classifier.feature_names_in_``
    """
    classifier.fit(X, y)

    # atleast_2d keeps a 1-D coef_ from collapsing to a scalar in the mean.
    mean_abs_coefficients = np.abs(np.atleast_2d(classifier.coef_)).mean(axis=0)
    coefficients_importances = pd.Series(mean_abs_coefficients, classifier.feature_names_in_)

    importances_scaled = minmax_scale(
        coefficients_importances,
        feature_range=(0, 1),
        axis=0,
    )

    return pd.Series(importances_scaled, index=coefficients_importances.index)


def feature_importance_sklearn(
    classifier,
    X, y,
) -> pd.Series:
    """
    Feature importance from the estimator's ``feature_importances_`` attribute.

    Fits ``classifier`` on (X, y) and min-max scales its
    ``feature_importances_`` to [0, 1].

    The previous implementation passed the scalar ``importances.mean()`` to
    ``pd.concat``, which raises for non-pandas objects, so the function could
    never return. The per-feature importances are now scaled directly.

    Returns:
    - Series of scaled importances indexed by ``classifier.feature_names_in_``
    """
    classifier.fit(X, y)
    importances = pd.Series(classifier.feature_importances_, index=classifier.feature_names_in_)

    importances_scaled = minmax_scale(
        importances,
        feature_range=(0, 1),
        axis=0,
    )

    return pd.Series(importances_scaled, index=importances.index)


def feature_importance_RFE(
    classifier,
    X,
    y,
) -> pd.Series:
    """
    Feature importance derived from recursive feature elimination rankings.

    RFE is run down to a single selected feature. The rankings are then
    inverted so that the best-ranked feature gets the largest value, and
    min-max scaled to [0, 1].

    Returns:
    - Series of scaled importances indexed by ``rfe.feature_names_in_``
    """
    rfe = RFE(
        estimator=classifier,
        verbose=0,
        n_features_to_select=1,
    )
    rfe.fit(X, y)

    # Rank 1 (best) becomes the maximum; the constant offset vanishes in scaling.
    inverted_ranking = np.max(rfe.ranking_) - rfe.ranking_ + 1
    normalized_importance = minmax_scale(inverted_ranking)

    return pd.Series(normalized_importance, index=rfe.feature_names_in_)

def feature_importance_SFI(
    classifier,
    X,
    y,
    n_splits: int = 5,
    score_sample_weights: list = None,
    train_sample_weights: list = None,
) -> pd.Series:
    """
    Single-feature importance: cross-validated accuracy of each feature alone.

    For every column of X the classifier is trained on that column only,
    using stratified K-fold splits, and scored with (optionally weighted)
    accuracy. The mean score per feature is min-max scaled to [0, 1].

    Parameters:
    - n_splits: number of stratified folds
    - score_sample_weights: per-row weights for scoring (defaults to ones)
    - train_sample_weights: per-row weights for fitting (defaults to ones)

    Returns:
    - Series of scaled importances indexed by feature name
    """
    n_rows = X.shape[0]
    # asarray lets plain lists be indexed with the fold index arrays.
    train_weights = np.ones(n_rows) if train_sample_weights is None else np.asarray(train_sample_weights)
    score_weights = np.ones(n_rows) if score_sample_weights is None else np.asarray(score_sample_weights)

    cv_generator = StratifiedKFold(n_splits=n_splits)
    # The unshuffled splitter yields the same folds for every feature,
    # so they are computed once instead of once per column.
    folds = list(cv_generator.split(X, y))

    feature_names = X.columns
    mean_scores = []
    for feature_name in feature_names:
        single_feature = X[[feature_name]]
        scores = []

        for train, test in folds:
            feature_train, label_train = single_feature.iloc[train], y.iloc[train]
            feature_test, label_test = single_feature.iloc[test], y.iloc[test]

            try:
                classifier.fit(feature_train, label_train, sample_weight=train_weights[train])
            except (TypeError, ValueError):
                # Estimator does not support sample_weight in fit: fit unweighted.
                classifier.fit(feature_train, label_train)

            prediction = classifier.predict(feature_test)
            score = Metrics.accuracy_score(label_test, prediction, sample_weight=score_weights[test])
            scores.append(score)

        mean_scores.append(np.mean(scores))

    importances_scaled = minmax_scale(
        np.asarray(mean_scores, dtype=float),
        feature_range=(0, 1),
        axis=0,
    )

    return pd.Series(importances_scaled, index=pd.Index(feature_names, name="FeatureName"))


def feature_importance_MDI(classifier, X, y):
    """
    Min-max scaled ``feature_importances_`` of a classifier fitted on (X, y).

    Returns:
    - Series in [0, 1] indexed by ``classifier.feature_names_in_``
    """
    classifier.fit(X, y)

    raw_scores = classifier.feature_importances_
    names = classifier.feature_names_in_
    importances = pd.Series(raw_scores, index=names)

    scaled = minmax_scale(importances, feature_range=(0, 1), axis=0)
    return pd.Series(scaled, index=importances.index)

def feature_importance_MDA(
    classifier,
    X: pd.DataFrame,
    y: pd.Series,
    n_repeats: int = 5,
    random_state: int = 43,
) -> pd.Series:
    """
    Permutation-based feature importance.

    Fits ``classifier`` on (X, y), computes the mean permutation importance
    on the same data and min-max scales it to [0, 1].

    Parameters:
    - n_repeats: number of permutations per feature
    - random_state: seed forwarded to ``permutation_importance``
      (43 reproduces the previously hard-coded value)

    Returns:
    - Series of scaled importances indexed by ``X.columns``
    """
    classifier.fit(X, y)
    importances = permutation_importance(
        classifier, X, y, n_repeats=n_repeats, random_state=random_state
    ).importances_mean

    importances = minmax_scale(
        importances,
        feature_range=(0, 1),
        axis=0,
    )

    return pd.Series(importances, index=X.columns)

class UAMeasure(Enum):
    """Univariate-analysis measures; each value is the label used as a result column name."""

    # The values are matched against the strings in the `measurements` argument
    # of feature_importance_univariate_analysis, so they must be spelled
    # exactly as written here (note the single "l" in "Kendal-Tau").
    SPEARMAN = "Spearman"
    PEARSON = "Pearson"
    KENDAL_TAU = "Kendal-Tau"
    MUTUAL_INFORMATION = "Mutual Information"
    ANOVA_F = "ANOVA F-Stat"

def feature_importance_univariate_analysis(measurements, X, y, mi_n_neighbors: int = 51):
    """
    Score every feature against the target with univariate measures.

    Parameters:
    - measurements: collection of ``UAMeasure`` values (strings) to compute
    - X: feature DataFrame
    - y: target
    - mi_n_neighbors: ``n_neighbors`` passed to ``mutual_info_classif``

    Returns:
    - DataFrame indexed by ``X.columns`` with one min-max scaled column per
      requested measure. Columns are added in the fixed order Spearman,
      Pearson, Kendal-Tau, Mutual Information, ANOVA F-Stat, regardless of
      the order of ``measurements``.
    """
    # Local import: the Spearman branch previously used np.corrcoef, which is
    # the Pearson coefficient, so a genuine rank correlation is needed here.
    from scipy.stats import spearmanr

    ua = pd.DataFrame(
        data=None,
        index=X.columns
    )

    if UAMeasure.SPEARMAN.value in measurements:
        spearmans = minmax_scale(X.apply(lambda feature: np.abs(spearmanr(feature, y).statistic), axis=0))
        ua[UAMeasure.SPEARMAN.value] = spearmans

    if UAMeasure.PEARSON.value in measurements:
        pearsons = minmax_scale(X.apply(lambda feature: np.abs(pearsonr(feature, y).statistic), axis=0))
        ua[UAMeasure.PEARSON.value] = pearsons

    if UAMeasure.KENDAL_TAU.value in measurements:
        kendalltaus = minmax_scale(X.apply(lambda feature: np.abs(kendalltau(feature, y).statistic), axis=0))
        ua[UAMeasure.KENDAL_TAU.value] = kendalltaus

    if UAMeasure.MUTUAL_INFORMATION.value in measurements:
        mutual_informations = minmax_scale(mutual_info_classif(X, y, n_neighbors=mi_n_neighbors))
        ua[UAMeasure.MUTUAL_INFORMATION.value] = mutual_informations

    if UAMeasure.ANOVA_F.value in measurements:
        # f_classif returns (F statistics, p-values); only the statistics are used.
        f_values = minmax_scale(f_classif(X, y)[0])
        ua[UAMeasure.ANOVA_F.value] = f_values

    return ua

# SWEFI implementation

In [23]:
def full_name(klass):
    """Return the dotted path ``<module>.<class name>`` of the class *klass*."""
    module_path = klass.__module__
    class_name = klass.__name__
    return f"{module_path}.{class_name}"

class FIModel(Enum):
    """Families of feature-importance methods; each value is the label shown in result indexes."""

    COEFFICIENT_BASED = "Coefficient-Based + Shrinkage & Selection"
    TREE_BASED = "Tree-Based"
    PERMUTATION_BASED = "Permutation-Based"
    # Label previously misspelled as "Signle Feature-Based".
    SINGLE_FEATURE_BASED = "Single Feature-Based"
    RFE_BASED = "Recursive Feature Elimination-Based"

class SWEFI:
    """
    Stability-weighted ensemble feature importance.

    Workflow (each step returns ``self`` so calls can be chained):
    1. ``select_models`` - pick the learning models with PyCaret.
    2. ``select_univariate_analysis_measurements`` - pick univariate measures.
    3. ``fine_tune_selected_models`` - tune models, pair them with FI methods.
    4. ``compute_feature_importance_data`` - importances on bootstrap samples.
    5. ``compute_swefi_scores`` - stability scores and weighted aggregation.
    """

    def __init__(self, X, y, n_fold=10):
        """Store the data and set up a PyCaret classification experiment on it."""
        self.percentage = None
        self.clfx = ClassificationExperiment()
        self.clfx.setup(data=X, target=y, fold=n_fold, train_size=0.99, session_id=123, n_jobs=-1, normalize=True, normalize_method='zscore')
        self.X = X
        self.y = y

    def select_models(self, select_n_model=None):
        """
        Compare candidate models and keep the best ones.

        Parameters:
        - select_n_model: a float is read as a fraction of the model catalogue;
          any other value is passed to ``compare_models`` as ``n_select``.

        Sets ``self.model_to_methods`` (estimator class path -> list of FI
        method labels) and ``self.learning_models`` (list of estimators).
        """
        coefficient = FIModel.COEFFICIENT_BASED.value
        tree = FIModel.TREE_BASED.value
        permutation = FIModel.PERMUTATION_BASED.value
        single_feature = FIModel.SINGLE_FEATURE_BASED.value
        rfe = FIModel.RFE_BASED.value

        catalogue = self.clfx.models()

        # Assigned by index alignment on the PyCaret model IDs.
        catalogue['FI Methods'] = pd.Series({
            'lr': [coefficient, permutation, single_feature, rfe],
            'knn': [permutation],
            'nb': [single_feature],
            'dt': [single_feature],
            'svm': [coefficient, permutation],
            'rbfsvm': [permutation],
            'gpc': [permutation],
            'mlp': [permutation],
            'ridge': [coefficient, permutation, single_feature, rfe],
            'rf': [tree, rfe],
            'qda': [permutation],
            'ada': [permutation],
            'gbc': [permutation],
            'lda': [coefficient, permutation, single_feature, rfe],
            'et': [tree, rfe],
            'xgboost': [tree, rfe],
            'lightgbm': [permutation],
            'dummy': [permutation],
        })

        model_to_methods = catalogue[['Reference', 'FI Methods']].set_index('Reference').squeeze().to_dict()

        # The externally supplied linear SVC overrides the catalogue's SVC entry.
        model_to_methods.update({
            "sklearn.svm._classes.SVC": [coefficient, rfe],
        })

        self.model_to_methods = model_to_methods

        exclude = {'dummy', 'svm', 'knn', 'nb', 'lightgbm', 'ada', 'gpc', 'rbfsvm'}
        all_models = set(catalogue.index.tolist())
        external_models = [SVC(probability=True, kernel='linear')]

        # sorted() only makes the candidate order reproducible.
        include = sorted(all_models - exclude) + external_models

        if isinstance(select_n_model, float):
            n_select = round(len(catalogue) * select_n_model)
        else:
            n_select = select_n_model

        selected = self.clfx.compare_models(n_select=n_select, include=include)

        # compare_models may return a bare estimator instead of a list;
        # later steps iterate over the selection.
        if not isinstance(selected, list):
            selected = [selected]
        self.learning_models = selected

        return self

    def select_univariate_analysis_measurements(self, measurements=('Pearson', 'Spearman', 'Kendal-Tau')):
        """
        Choose the univariate measures, given as ``UAMeasure`` value strings.

        The default previously contained 'Kendal', which matches no
        ``UAMeasure`` value; it is now 'Kendal-Tau'.
        """
        self.measurements = list(measurements)

        return self

    def fine_tune_selected_models(self, hpo_n_fold=4, hpo_n_iter=25, hpo_metric='Accuracy', hpo_search_library='scikit-optimize', hpo_search_algorithm='bayesian'):
        """
        Tune every selected model and pair it with its FI methods.

        Per estimator class, the custom configuration decides the strategy:
        ``None`` (or a class not listed) -> search with the given hpo_*
        settings; empty dict -> no tuning; non-empty dict -> grid search over
        that grid.

        Sets ``self.finalized_tuned_learning_models`` and
        ``self.model_method_pairs`` (list of ``(model, method label)``).
        """
        learning_models = self.learning_models
        model_to_methods = self.model_to_methods

        model_to_custom_config = {
            "sklearn.ensemble._forest.ExtraTreesClassifier": None,
            "sklearn.neighbors._classification.KNeighborsClassifier": None,
            "sklearn.ensemble._forest.RandomForestClassifier": None,
            "xgboost.sklearn.XGBClassifier": None,
            "lightgbm.sklearn.LGBMClassifier": {},
            "sklearn.neural_network._multilayer_perceptron.MLPClassifier": None,
            "sklearn.ensemble._gb.GradientBoostingClassifier": {},
            "sklearn.naive_bayes.GaussianNB": {},
            "sklearn.ensemble._weight_boosting.AdaBoostClassifier": {},
            "sklearn.linear_model._ridge.RidgeClassifier": {
                "alpha": [1],
                "solver": ["lsqr"],
            },
            "sklearn.discriminant_analysis.LinearDiscriminantAnalysis": {
                "shrinkage": [0.0],
                "solver": ["lsqr"],
            },
            "sklearn.linear_model._logistic.LogisticRegression": {
                "C": [1],
                "solver": ['liblinear'],
            },
            "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis": {},
            "sklearn.tree._classes.DecisionTreeClassifier": None,
            "sklearn.svm._classes.SVC": {
                'probability': [True],
                'C': [1.0],
                'kernel': [
                    "linear",
                ],
            },
        }

        tuned_learning_models = []
        for model in learning_models:
            print("-" * 80)
            print(model.__class__.__name__)

            # .get(): classes without an entry fall back to the default search
            # instead of raising KeyError.
            custom_config = model_to_custom_config.get(full_name(model.__class__))

            if custom_config is None:
                tlm, _ = self.clfx.tune_model(
                    model,
                    return_tuner=True,
                    fold=hpo_n_fold,
                    verbose=True,
                    tuner_verbose=True,
                    optimize=hpo_metric,
                    search_library=hpo_search_library, search_algorithm=hpo_search_algorithm,
                    n_iter=hpo_n_iter,
                    choose_better=True
                )

            elif len(custom_config) == 0:
                print("No hyper-tuning ...")
                tlm = model

            else:
                print("Custom config ...")
                tlm, _ = self.clfx.tune_model(
                    model,
                    return_tuner=True,
                    fold=hpo_n_fold,
                    verbose=True,
                    tuner_verbose=True,
                    optimize=hpo_metric,
                    search_library='scikit-learn', search_algorithm='grid',
                    custom_grid=custom_config,
                    n_iter=hpo_n_iter,
                    choose_better=True
                )

            tuned_learning_models.append(tlm)

        self.finalized_tuned_learning_models = [
            self.clfx.finalize_model(estimator=tlm, model_only=True)
            for tlm in tuned_learning_models
        ]

        self.model_method_pairs = [
            (tlm, method)
            for tlm in tuned_learning_models
            for method in model_to_methods[full_name(tlm.__class__)]
        ]

        return self

    def _empty_importance_frame(self, tuples):
        """Build an empty (rows = tuples, columns = features) frame and its MultiIndex."""
        index = pd.MultiIndex.from_tuples(
            tuples,
            names=["Model Name", "Feature Importance Model", "Subset Index"],
        )
        data = pd.DataFrame({}, columns=self.X.columns, index=index)
        return data, index

    def _initialize_model_feature_importance_data(self, models, n_iteration=5):
        """Empty result frame with one row per (model class, FI method, iteration)."""
        tuples = [
            (type(model).__name__, proper_fi_model, iteration)
            for (model, proper_fi_model), iteration in itertools.product(models, range(n_iteration))
        ]
        return self._empty_importance_frame(tuples)

    def _initialize_univariate_analysis_feature_importance_data(self, measurements, n_iteration=5):
        """Empty result frame with one row per (univariate measure, iteration)."""
        tuples = [
            ("Univariate Analysis", ua_measurement, iteration)
            for ua_measurement, iteration in itertools.product(measurements, range(n_iteration))
        ]
        return self._empty_importance_frame(tuples)

    @staticmethod
    def stationary_bootstrap(X, y, n_iteration=5,):
        """Yield ``n_iteration`` (X, y) stationary-bootstrap resamples.

        The block size is the rounded mean, over the columns of X, of the
        "stationary" column returned by ``optimal_block_length``.
        """
        optimal_block_size = round(optimal_block_length(X).mean(axis=0)["stationary"])
        random_state = RandomState(1234)
        bootstraper = StationaryBootstrap(optimal_block_size, X=X, y=y, random_state=random_state)
        for bootstraped_data in bootstraper.bootstrap(n_iteration):
            # Element 1 holds the keyword data passed to the bootstrap.
            yield bootstraped_data[1]['X'], bootstraped_data[1]['y']

    @staticmethod
    def iid_bootstrap(X, y, n_iteration=5,):
        """Yield ``n_iteration`` (X, y) i.i.d.-bootstrap resamples."""
        random_state = RandomState(1234)
        bootstraper = IIDBootstrap(X=X, y=y, random_state=random_state)
        for bootstraped_data in bootstraper.bootstrap(n_iteration):
            yield bootstraped_data[1]['X'], bootstraped_data[1]['y']

    @staticmethod
    def _compute_model_importance(model, proper_fi_model, X_train, y_train, n_repeats):
        """Run the FI method labelled ``proper_fi_model`` and return raw values."""
        if proper_fi_model == FIModel.TREE_BASED.value:
            try:
                return feature_importance_MDI(model, X_train, y_train).values
            except Exception:
                # Best-effort fallback kept from the original implementation.
                return feature_importance_sklearn(model, X_train, y_train).values

        if proper_fi_model == FIModel.PERMUTATION_BASED.value:
            return feature_importance_MDA(model, X_train, y_train, n_repeats=n_repeats).values

        if proper_fi_model == FIModel.COEFFICIENT_BASED.value:
            return feature_importance_linear_models(model, X_train, y_train).values

        if proper_fi_model == FIModel.SINGLE_FEATURE_BASED.value:
            return feature_importance_SFI(model, X_train, y_train, n_splits=n_repeats).values

        if proper_fi_model == FIModel.RFE_BASED.value:
            return feature_importance_RFE(model, X_train, y_train).values

        raise ValueError(f"Unknown feature importance method: {proper_fi_model!r}")

    def compute_feature_importance_data(self, bootstrap_method, n_iteration=10, n_repeats=10):
        """
        Compute feature importances on bootstrap resamples of the data.

        Parameters:
        - bootstrap_method: callable ``(X, y, n_iteration)`` yielding (X, y) pairs
        - n_iteration: number of bootstrap resamples
        - n_repeats: permutation repeats (MDA) and CV folds (SFI)

        Sets ``models_fi`` / ``models_index`` and ``measurements_fi`` /
        ``measurements_index``.
        """
        y = self.y
        # Standardise the features column-wise before resampling.
        X = (self.X - self.X.mean()) / self.X.std()

        model_method_pairs = self.model_method_pairs
        models_fi, models_index = self._initialize_model_feature_importance_data(model_method_pairs, n_iteration=n_iteration)

        measurements = self.measurements
        measurements_fi, measurements_index = self._initialize_univariate_analysis_feature_importance_data(measurements, n_iteration=n_iteration)

        for iteration, (X_train, y_train) in tqdm(enumerate(
            bootstrap_method(X, y, n_iteration)
        ), total=n_iteration):

            for model, proper_fi_model in model_method_pairs:
                result = self._compute_model_importance(model, proper_fi_model, X_train, y_train, n_repeats)

                # Normalize; constant scores have zero range and are kept
                # unchanged instead of becoming NaN through 0 / 0.
                value_range = result.max() - result.min()
                if value_range > 0:
                    result = (result - result.min()) / value_range

                models_fi.loc[type(model).__name__, proper_fi_model, iteration] = result

            ua_result = feature_importance_univariate_analysis(measurements, X_train, y_train)
            unknown = [m for m in measurements if m not in ua_result.columns]
            if unknown:
                raise ValueError(f"Unknown univariate analysis measurements: {unknown}")

            # Assign by name: the result columns follow a fixed order that may
            # differ from the order of `measurements`.
            for measurement in measurements:
                measurements_fi.loc["Univariate Analysis", measurement, iteration] = ua_result[measurement].values

        self.models_fi = models_fi
        self.models_index = models_index

        self.measurements_fi = measurements_fi
        self.measurements_index = measurements_index

        # `n_iterationn` (sic) is kept for existing readers; `n_iteration` is an alias.
        self.n_iterationn = n_iteration
        self.n_iteration = n_iteration

        return self

    @staticmethod
    def _entropy_weights(stability_scores):
        """Return -s * log(s) per feature, with 0 * log(0) taken as 0."""
        safe_scores = stability_scores.where(stability_scores > 0, 1.0)
        return -(stability_scores * np.log(safe_scores))

    def compute_swefi_scores(self, percentage=0.5, weight='linear'):
        """
        Aggregate the importances into stability-weighted scores.

        A feature's stability score is the fraction of importance vectors in
        which it ranks among the top ``round(n_features * percentage)``
        features. The importances are then weighted by a function of the
        stability score and summarised per feature.

        Parameters:
        - percentage: fraction of features counted as "top" in each vector
        - weight: 'linear', 'logarithmic', 'exponential', 'harmonic',
          'power-2', 'power-0.5' or 'entropy'

        Raises:
        - NotImplementedError: for an unknown ``weight``

        Sets ``self.stability_scores`` and ``self.swefi`` (columns
        'mean(SWEFI)' and 'std(SWEFI)', sorted by the mean).
        """
        weight_functions = {
            'linear': lambda scores: scores,
            'logarithmic': lambda scores: np.log(scores + 1),
            'exponential': np.exp,
            'power-2': lambda scores: scores ** 2,
            'power-0.5': lambda scores: scores ** 0.5,
            'entropy': self._entropy_weights,
        }
        if weight != 'harmonic' and weight not in weight_functions:
            raise NotImplementedError(f"Weight {weight} not implemented.")

        features = self.X.columns

        # The result frames start out empty (object dtype); use floats throughout.
        fi = pd.concat([self.models_fi, self.measurements_fi]).astype(float)
        index = self.models_index.tolist() + self.measurements_index.tolist()
        k = round(len(features) * percentage)

        top_k_counts = Counter()
        for idx in index:
            ranked = fi.loc[idx].squeeze().sort_values(ascending=False)
            top_k_counts.update(ranked.iloc[:k].index.tolist())

        n_importance_vectors = self.n_iterationn * (len(self.model_method_pairs) + len(self.measurements))
        stability_scores = pd.Series(
            {feature: top_k_counts.get(feature, 0) for feature in features}
        ) / n_importance_vectors

        if weight == 'harmonic':
            denominator = fi + stability_scores
            # Harmonic mean of two zeros is taken as 0 rather than NaN.
            swefi = (2 * fi * stability_scores / denominator).where(denominator != 0, 0.0)
        else:
            weights = weight_functions[weight](stability_scores)
            weights = weights / weights.sum()
            swefi = fi * weights

        mean_scores = swefi.mean()
        total = mean_scores.sum()
        swefi = pd.concat(
            [mean_scores / total, swefi.std() * (swefi.shape[0]) ** -0.5 / total],
            axis=1,
        ).rename(columns={
            0: 'mean(SWEFI)',
            1: 'std(SWEFI)',
        }).sort_values(by='mean(SWEFI)')

        self.percentage = percentage
        self.stability_scores = stability_scores
        self.swefi = swefi
        return self

    def get_swefi_scores(self):
        """Return the frame computed by ``compute_swefi_scores``."""
        return self.swefi

    def get_inner_models_feature_importances(self):
        """Per (model, FI method) importances summed over iterations, each normalised to sum to 1.

        Returns a frame with features as rows and (model, method) as columns.
        """
        models_fi = self.models_fi.astype(float).groupby(level=[0, 1]).sum()
        models_fi = models_fi.div(models_fi.sum(axis=1).values, axis=0).transpose()
        return models_fi

# Run on Synthetic Dataset

In [16]:
# Alternative (disabled): cross-sectional synthetic dataset.
# n_informative=5
# n_redundant=20
# n_noise=5
# n_samples = 1000
# sigma_std = 0.1

# X, y = generate_cross_sectional_dataset(
#     n_informative=n_informative,
#     n_redundant=n_redundant,
#     n_noise=n_noise,
#     n_samples=n_samples,
#     sigma_std=sigma_std
# )

# Active configuration: lagged multivariate time-series dataset.
# (5 + 20 + 5) features x 2 time steps gives 60 feature columns.
n_informative=5
n_redundant=20
n_noise=5
n_samples = 1000
sigma_std = 0.1
n_time_steps = 2

# time_series_params: [AR coefficients, MA coefficients, ...]; the generator
# only reads the first two entries.
X, y = generate_multivariate_time_series_dataset(
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_noise=n_noise,
    n_samples=n_samples,
    n_time_steps=n_time_steps,
    sigma_std=sigma_std,
    time_series_params=[[0.5], [0.5], [1]]
)

In [17]:
# An int is passed to compare_models unchanged as n_select; a float would be
# read by SWEFI.select_models as a fraction of the model catalogue.
select_n_model = 100

# Resampling scheme used when computing feature importances.
bootstrap_method = SWEFI.stationary_bootstrap

# Hyper-parameter optimisation settings for SWEFI.fine_tune_selected_models.
hpo_n_fold=4
hpo_n_iter=25
hpo_metric = 'AUC'
hpo_search_library = 'scikit-optimize'
hpo_search_algorithm = 'bayesian'

# Number of bootstrap resamples - presumably passed to
# compute_feature_importance_data in a later cell; TODO confirm.
n_iteration = 5

# Fraction of features treated as "top" - presumably passed to
# compute_swefi_scores in a later cell; TODO confirm.
percentage = 0.5

In [18]:
# Build the SWEFI object; its constructor sets up the PyCaret experiment
# (10 folds, 99% of the rows in the training split, z-score normalisation).
swefi = SWEFI(X, y, n_fold=10)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,0
2,Target type,Binary
3,Original data shape,"(1000, 61)"
4,Transformed data shape,"(1000, 61)"
5,Transformed train set shape,"(990, 61)"
6,Transformed test set shape,"(10, 61)"
7,Numeric features,60
8,Preprocess,True
9,Imputation type,simple


In [19]:
# Compare the candidate models and keep up to select_n_model of them.
swefi.select_models(select_n_model=select_n_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
10,SVM - Radial Kernel,0.9828,0.9992,0.984,0.9823,0.983,0.9656,0.9659,0.011
9,Logistic Regression,0.9808,0.9993,0.984,0.9786,0.9811,0.9616,0.9621,0.006
4,MLP Classifier,0.9657,0.9982,0.9741,0.9597,0.9664,0.9313,0.9324,0.058
0,Ridge Classifier,0.9424,0.9926,0.9523,0.9367,0.9439,0.8848,0.8861,0.505
7,Linear Discriminant Analysis,0.9323,0.9882,0.9443,0.9254,0.934,0.8646,0.8663,0.005
1,Extra Trees Classifier,0.9061,0.9729,0.9105,0.9082,0.9081,0.812,0.8146,0.041
5,Extreme Gradient Boosting,0.899,0.9681,0.9006,0.903,0.9007,0.7979,0.8,0.05
2,Random Forest Classifier,0.8939,0.9678,0.9007,0.894,0.8962,0.7878,0.7901,0.053
8,Gradient Boosting Classifier,0.8869,0.9673,0.8967,0.8867,0.89,0.7736,0.7769,0.318
3,Quadratic Discriminant Analysis,0.7808,0.8703,0.7954,0.7804,0.7865,0.5615,0.5637,0.005


<__main__.SWEFI at 0x167ee2350>

In [20]:
# Tune the selected models; the trailing semicolon suppresses the notebook's
# echo of the returned SWEFI object.
swefi.fine_tune_selected_models(
    hpo_n_fold=hpo_n_fold,
    hpo_n_iter=hpo_n_iter,
    hpo_metric=hpo_metric,
    hpo_search_algorithm=hpo_search_algorithm,
    hpo_search_library=hpo_search_library
);

--------------------------------------------------------------------------------
SVC
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.9798,0.9993,0.9762,0.984,0.9801,0.9597,0.9597
2,0.9838,0.9995,1.0,0.9692,0.9844,0.9676,0.9681
3,0.9838,0.9988,0.984,0.984,0.984,0.9676,0.9676
Mean,0.9869,0.9994,0.99,0.9843,0.9871,0.9737,0.9738
Std,0.0078,0.0005,0.0103,0.0109,0.0076,0.0155,0.0155


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
LogisticRegression
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.996,0.9999,1.0,0.9921,0.996,0.9919,0.992
1,0.9798,0.9995,0.9683,0.9919,0.9799,0.9597,0.96
2,0.9838,0.9999,1.0,0.9692,0.9844,0.9676,0.9681
3,0.9717,0.9981,0.976,0.9683,0.9721,0.9433,0.9433
Mean,0.9828,0.9993,0.9861,0.9804,0.9831,0.9656,0.9658
Std,0.0088,0.0007,0.0142,0.0116,0.0087,0.0175,0.0175


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
MLPClassifier


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.996,1.0,0.9921,1.0,0.996,0.9919,0.992
1,0.9677,0.9984,0.9603,0.9758,0.968,0.9355,0.9356
2,0.9879,0.9999,0.9921,0.9843,0.9881,0.9757,0.9757
3,0.9838,0.9992,0.976,0.9919,0.9839,0.9676,0.9677
Mean,0.9838,0.9994,0.9801,0.988,0.984,0.9677,0.9678
Std,0.0103,0.0006,0.0132,0.009,0.0102,0.0205,0.0205


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9476,0.9914,0.9603,0.938,0.949,0.8951,0.8954
1,0.9113,0.9807,0.8968,0.9262,0.9113,0.8226,0.8231
2,0.9433,0.9927,0.9762,0.9179,0.9462,0.8864,0.8883
3,0.8988,0.9793,0.928,0.8788,0.9027,0.7974,0.7987
Mean,0.9252,0.986,0.9403,0.9152,0.9273,0.8504,0.8514
Std,0.0207,0.0061,0.0305,0.0222,0.0205,0.0415,0.0415


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
LinearDiscriminantAnalysis
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9516,0.9871,0.9683,0.9385,0.9531,0.9032,0.9036
1,0.9113,0.9733,0.9127,0.9127,0.9127,0.8225,0.8225
2,0.9393,0.9887,0.9683,0.9173,0.9421,0.8784,0.8798
3,0.8907,0.9689,0.896,0.8889,0.8924,0.7813,0.7813
Mean,0.9232,0.9795,0.9363,0.9143,0.9251,0.8463,0.8468
Std,0.0238,0.0086,0.0325,0.0176,0.024,0.0476,0.0479


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
ExtraTreesClassifier


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9113,0.9779,0.9048,0.9194,0.912,0.8226,0.8227
1,0.9032,0.9791,0.9286,0.8864,0.907,0.8062,0.8072
2,0.8826,0.9628,0.9127,0.8647,0.888,0.7648,0.7661
3,0.8745,0.9587,0.856,0.8917,0.8735,0.7491,0.7497
Mean,0.8929,0.9696,0.9005,0.8905,0.8951,0.7857,0.7864
Std,0.0149,0.009,0.0271,0.0195,0.0154,0.0298,0.0296


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9315,0.9903,0.9841,0.8921,0.9358,0.8626,0.8674
1,0.879,0.9841,0.9841,0.8158,0.8921,0.7572,0.7745
2,0.8543,0.9663,0.9841,0.7848,0.8732,0.7068,0.7321
3,0.8785,0.9723,0.992,0.8105,0.8921,0.7564,0.7767
Mean,0.8858,0.9782,0.9861,0.8258,0.8983,0.7708,0.7877
Std,0.0282,0.0095,0.0034,0.04,0.023,0.0568,0.0493


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9032,0.9743,0.8889,0.918,0.9032,0.8065,0.8069
1,0.8952,0.9722,0.9286,0.8731,0.9,0.79,0.7917
2,0.8462,0.9522,0.8651,0.8385,0.8516,0.692,0.6923
3,0.8259,0.9273,0.832,0.8254,0.8287,0.6517,0.6518
Mean,0.8676,0.9565,0.8786,0.8638,0.8709,0.7351,0.7357
Std,0.0325,0.019,0.0352,0.0359,0.0318,0.0651,0.0655


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7863,0.848,0.7698,0.8017,0.7854,0.5727,0.5732
1,0.7782,0.8297,0.8016,0.771,0.786,0.556,0.5565
2,0.7206,0.8028,0.7381,0.7209,0.7294,0.4408,0.4409
3,0.7449,0.8218,0.696,0.7768,0.7342,0.4905,0.4932
Mean,0.7575,0.8256,0.7514,0.7676,0.7588,0.515,0.516
Std,0.0263,0.0162,0.0391,0.0293,0.027,0.0527,0.0526


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fi

In [21]:
# Choose which univariate-analysis measures SWEFI uses. Only MUTUAL_INFORMATION
# and ANOVA_F are active; the other measures stay commented out so they can be
# switched back on without looking up the enum member names.
# The call is the last expression of the cell, so the notebook echoes its
# return value.
swefi.select_univariate_analysis_measurements(
    measurements=[
        # UAMeasure.SPEARMAN.value,
        # UAMeasure.PEARSON.value,
        # UAMeasure.KENDAL_TAU.value,
        UAMeasure.MUTUAL_INFORMATION.value,
        UAMeasure.ANOVA_F.value,
    ],
)

<__main__.SWEFI at 0x167ee2350>

In [22]:
# Run the feature-importance computation with the bootstrap settings defined
# earlier in the notebook and a fixed n_repeats of 10.
# The trailing semicolon keeps the notebook from echoing the return value.
swefi.compute_feature_importance_data(
    bootstrap_method=bootstrap_method,
    n_iteration=n_iteration,
    n_repeats=10,
);

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Compute the SWEFI scores for the configured percentage and fetch the result.
swefi.compute_swefi_scores(percentage=percentage)
swefi_scores = swefi.get_swefi_scores()

# Bar categories come from the result's index; bar heights and error-bar sizes
# come from its 'mean(SWEFI)' and 'std(SWEFI)' columns.
index = swefi_scores.index
value = swefi_scores['mean(SWEFI)']
error = swefi_scores['std(SWEFI)']

# Vertical bar chart: categories on the x-axis, values on the y-axis, with the
# standard deviation drawn as a y-direction error bar.
fig = go.Figure(
    go.Bar(
        x=index,
        y=value,
        error_y={'type': 'data', 'array': error},
    )
)

# Styling shared by both axes: thick black zero line and axis line, mirrored
# on the opposite side of the plot, with large black Arial tick labels.
_tick_font = {'family': 'Arial', 'size': 18, 'color': 'black'}
_axis_frame = {
    'zeroline': True,
    'showline': True,
    'zerolinecolor': 'black',
    'zerolinewidth': 3,
    'linecolor': 'black',
    'linewidth': 3,
    'mirror': True,
    'tickfont': _tick_font,
}

fig.update_layout(
    # x-axis: grid options (showgrid/gridcolor/gridwidth) are deliberately not
    # set here; tick labels are rotated by 45 degrees.
    xaxis={
        **_axis_frame,
        'title': 'Feature',
        'tickangle': 45,
    },
    # y-axis: same frame plus an explicit black grid.
    yaxis={
        **_axis_frame,
        'title': 'Importance',
        'showgrid': True,
        'gridcolor': 'black',
        'gridwidth': 1,
    },
    margin={'l': 10, 'r': 10, 'b': 10, 't': 10},
    paper_bgcolor='white',
    plot_bgcolor='lightgrey',
    # No legend options are set explicitly (showlegend/legend left untouched).
    width=2000,
    height=1500,
    bargap=0.1,
    bargroupgap=0.1,
)

# Save a static image of the figure, then render it in the notebook.
fig.write_image('swefi_success_time_series_vertical.png')
fig.show()
