In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from scipy.stats import wilcoxon
from joblib import Parallel, delayed

class ARS:
    """Relevance score of a feature set ``X`` for a target ``y`` against a shadow baseline.

    For each iteration the data is split 60/40, one model is fitted on the real
    training rows and a second ("shadow") model is fitted on the same rows with
    X permuted relative to y. The distribution of shadow test scores yields a
    benchmark; the median real test score is compared against it.

    Parameters
    ----------
    X : NumPy array of features (rows are indexed positionally when shuffling).
    y : NumPy array of targets.
    objective : 'regression' (scored with MAE) or 'classification' (micro F1).
    model_type : 'tree', 'knn' or 'linear' ('linear' is regression only).
    random_state : base seed; iteration ``i`` uses ``random_state + i``.
    """

    def __init__(self, X, y, objective='regression', model_type='linear', random_state=42):
        self.X = X  # NumPy array
        self.y = y  # NumPy array
        self.objective = objective
        self.model_type = model_type  # selects the estimator built by _get_model
        self.random_state = random_state

    def _relevance_score(self, original_median, benchmark, objective):
        """Return the improvement of ``original_median`` over ``benchmark``, normalised.

        Classification: ``(original - benchmark) / (1 - benchmark)``.
        Regression:     ``(benchmark - original) / benchmark``.

        Returns 0.0 when the normalising denominator is not strictly positive:
        a zero denominator would divide by zero, and a negative one would flip
        the sign and report a positive score for a model that is worse than
        the benchmark.

        Raises ValueError for an unknown objective.
        """
        if objective == 'classification':
            headroom = 1 - benchmark
            if headroom <= 0:
                return 0.0
            return (original_median - benchmark) / headroom
        elif objective == 'regression':
            if benchmark <= 0:
                return 0.0
            return (benchmark - original_median) / benchmark
        else:
            raise ValueError('Unspecified objective')

    def _acceptable_minimum_distribution(self, base, objective):
        """Derive the benchmark the real model has to beat from shadow scores ``base``.

        Classification: 60th percentile (rounded to 4 decimals) plus 0.01.
        Regression: median minus half the median absolute deviation, rounded
        to 4 decimals.

        Raises ValueError for an unknown objective.
        """
        base = np.asarray(base, dtype=float)  # also accept plain sequences
        if objective == 'classification':
            return np.round(np.percentile(base, 60), 4) + 0.01
        elif objective == 'regression':
            median = np.median(base)
            mad = np.median(np.abs(base - median))
            return np.round(median - 0.5 * mad, 4)
        else:
            raise ValueError('Unspecified objective')

    def _get_model(self, iteration):
        """Build a fresh, unfitted estimator for ``self.objective`` / ``self.model_type``.

        Tree models are seeded with ``random_state + iteration``; the KNN and
        linear models are constructed without a seed.

        Raises ValueError for an unsupported objective or model type.
        """
        # Pick the estimator according to the objective and the model type
        if self.objective == 'regression':
            if self.model_type == 'tree':
                return DecisionTreeRegressor(random_state=self.random_state + iteration, criterion='absolute_error')
            elif self.model_type == 'knn':
                return KNeighborsRegressor(n_neighbors=5)
            elif self.model_type == 'linear':
                return LinearRegression()
            else:
                raise ValueError("Invalid model_type for regression. Choose from 'tree', 'knn', or 'linear'.")
        elif self.objective == 'classification':
            if self.model_type == 'tree':
                return DecisionTreeClassifier(random_state=self.random_state + iteration, criterion='entropy')
            elif self.model_type == 'knn':
                return KNeighborsClassifier(n_neighbors=5)
            else:
                raise ValueError("Invalid model_type for classification. Choose from 'tree' or 'knn'.")
        else:
            raise ValueError("Invalid objective. Choose 'regression' or 'classification'.")

    def _run_iteration(self, iteration, stratify=None):
        """Run one split/fit/score round and return ``(original_score, shadow_score)``.

        ``stratify`` is forwarded to ``train_test_split``. Both scores are
        computed on the same untouched test split.
        """
        # Split the data (60% train), seeded per iteration
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            stratify=stratify,
            train_size=0.6,
            random_state=self.random_state + iteration,
        )

        if self.objective == 'regression':
            metric = mean_absolute_error
            metric_params = {}
        else:  # 'classification'
            metric = f1_score
            metric_params = {'average': 'micro'}

        # Original model: fitted on the real (X, y) pairing
        original_model = self._get_model(iteration)
        original_model.fit(X_train, y_train)
        original_pred = original_model.predict(X_test)
        original_score = metric(y_test, original_pred, **metric_params)

        # Shadow model: same estimator type, but the rows of X_train are
        # permuted while y_train keeps its order, breaking the X-y pairing.
        shuffled_indices = np.random.RandomState(self.random_state + iteration).permutation(X_train.shape[0])
        X_train_shuffled = X_train[shuffled_indices]
        shadow_model = self._get_model(iteration)
        shadow_model.fit(X_train_shuffled, y_train)
        shadow_pred = shadow_model.predict(X_test)
        shadow_score = metric(y_test, shadow_pred, **metric_params)

        return original_score, shadow_score

    def calculate_multivariate_relevance_score(self, max_iterations=1000):
        """Compute the relevance score over ``max_iterations`` random splits.

        Returns
        -------
        list
            ``[relevance_score, median_original, acceptable_minimum,
            objective, model_type]``. The list always has these five entries,
            including when ``y`` is constant (the three numbers are then 0.0).
            ``relevance_score`` is forced to 0.0 when the median original
            score is negative, the raw score is negative, or the one-sided
            Wilcoxon test is not significant at the 0.05 level.

        Raises
        ------
        ValueError
            For an unknown objective, or when ``max_iterations`` is below 1.
        """
        # Seeds NumPy's global RNG as before; the split and the permutation in
        # each iteration are seeded explicitly and do not depend on it.
        np.random.seed(self.random_state)

        # Convert y to the appropriate type. The converted copy is used for the
        # constant-target check and for stratification only; the iterations
        # themselves fit on self.y as stored.
        if self.objective == 'regression':
            y = self.y.astype(float)
            stratify = None
        elif self.objective == 'classification':
            y = self.y.astype(str)
            stratify = y
        else:
            raise ValueError('Specify objective')

        # A constant target cannot be explained by any feature. Return the same
        # five-element shape as the normal path so callers can always unpack it.
        if np.unique(y).size == 1:
            return [0.0, 0.0, 0.0, self.objective, self.model_type]

        if max_iterations < 1:
            raise ValueError('max_iterations must be at least 1')

        # Execute iterations in parallel
        results = Parallel(n_jobs=-1)(
            delayed(self._run_iteration)(i, stratify) for i in range(max_iterations)
        )

        # Separate results into original (A) and shadow (B) score arrays
        original_scores, shadow_scores = zip(*results)
        A = np.array(original_scores)
        B = np.array(shadow_scores)

        # Calculate metrics
        acceptable_minimum = self._acceptable_minimum_distribution(B, self.objective)
        median_original_value = np.round(np.median(A), 4)
        relevance_score = self._relevance_score(median_original_value, acceptable_minimum, self.objective)

        # One-sided Wilcoxon signed-rank test: higher is better for F1,
        # lower is better for MAE.
        alternative = 'greater' if self.objective == 'classification' else 'less'
        if np.array_equal(A, B):
            # No paired difference at all: the test is undefined here, and the
            # real model is indistinguishable from the shadow model.
            p_value = 1.0
        else:
            _, p_value = wilcoxon(A, B, alternative=alternative)

        # Check conditions for relevance score
        if median_original_value < 0 or relevance_score < 0 or p_value >= 0.05:
            relevance_score = 0.0

        return [relevance_score, median_original_value, acceptable_minimum, self.objective, self.model_type]


In [2]:
from sklearn.datasets import make_classification, make_regression
import time 
# Function to convert numbers to strings
def to_string(number_list):
    """Return a list holding ``str(item)`` for every item of ``number_list``."""
    return list(map(str, number_list))

# Synthetic regression problem: 1500 samples, 30 features, 3 of them informative
n_features_ = 30
X, y = make_regression(
    n_samples=1500,
    n_features=n_features_,
    n_informative=3,
    noise=0.0,
    shuffle=False,
    random_state=4,
)

# Cast both arrays to float
X, y = X.astype(float), y.astype(float)

# One string label per feature column, built from the column index
feature_names = to_string(range(n_features_))

# Function to process each feature and record its processing time
def process_feature(i, model_type='tree'):
    """Score feature column ``i`` of the global ``X`` against ``y`` and time the run.

    Parameters
    ----------
    i : column index into ``X`` and ``feature_names``.
    model_type : estimator family passed to ``ARS`` (defaults to the model
        this cell evaluates, so existing one-argument calls are unchanged).

    Returns
    -------
    list
        ``[[feature_name], score, median, acceptable_min, objective,
        model_type, elapsed_seconds]``.
    """
    # perf_counter is monotonic, so the interval is not distorted by
    # wall-clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    X_col = X[:, [i]]  # list index keeps the single column two-dimensional
    ars = ARS(X=X_col, y=y, objective='regression', model_type=model_type, random_state=42)
    score, median, acceptable_min, objective, used_model_type = ars.calculate_multivariate_relevance_score()
    elapsed_time = time.perf_counter() - start_time
    # The feature name stays wrapped in a one-element list, as callers receive it today.
    return [[feature_names[i]], score, median, acceptable_min, objective, used_model_type, elapsed_time]

# Measure the total elapsed time of the whole sweep with a monotonic clock
start_time = time.perf_counter()

# Score every feature in parallel; each call returns one result row
feature_rows = Parallel(n_jobs=-1)(
    delayed(process_feature)(i) for i in range(n_features_)
)

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

# Assemble the rows into a labelled table (labels set at construction time)
results = pd.DataFrame(
    feature_rows,
    columns=['feature_names', 'ARS', 'median_original', 'Threshold', 'objective', 'model_type', 'elapsed_time'],
)
results


Total Processing Time: 193.97 seconds


Unnamed: 0,feature_names,ARS,median_original,Threshold,objective,model_type,elapsed_time
0,[0],0.179227,118.0496,143.8273,regression,tree,70.451169
1,[1],0.118375,127.1305,144.2002,regression,tree,84.929547
2,[2],0.244725,108.6641,143.8736,regression,tree,108.235012
3,[3],0.008229,143.2538,144.4424,regression,tree,70.425377
4,[4],0.0,148.118,144.1374,regression,tree,90.112511
5,[5],0.0,145.6923,144.2062,regression,tree,96.199365
6,[6],0.021134,141.2559,144.3056,regression,tree,111.227439
7,[7],0.0,145.3847,144.1706,regression,tree,80.403079
8,[8],0.0,144.2389,144.0942,regression,tree,96.842746
9,[9],0.0,147.9556,144.2314,regression,tree,124.495765


In [3]:
from sklearn.datasets import make_classification, make_regression
import time 
# Function to convert numbers to strings
def to_string(number_list):
    """Convert each element of ``number_list`` to its ``str`` form, preserving order."""
    as_text = []
    for number in number_list:
        as_text.append(str(number))
    return as_text

# Synthetic regression problem: 1500 samples, 30 features, 3 of them informative
n_features_ = 30
X, y = make_regression(
    n_samples=1500,
    n_features=n_features_,
    n_informative=3,
    noise=0.0,
    shuffle=False,
    random_state=4,
)

# Cast both arrays to float
X, y = X.astype(float), y.astype(float)

# One string label per feature column, built from the column index
feature_names = to_string(range(n_features_))

# Function to process each feature and record its processing time
def process_feature(i, model_type='linear'):
    """Score feature column ``i`` of the global ``X`` against ``y`` and time the run.

    Parameters
    ----------
    i : column index into ``X`` and ``feature_names``.
    model_type : estimator family passed to ``ARS`` (defaults to the model
        this cell evaluates, so existing one-argument calls are unchanged).

    Returns
    -------
    list
        ``[[feature_name], score, median, acceptable_min, objective,
        model_type, elapsed_seconds]``.
    """
    # perf_counter is monotonic, so the interval is not distorted by
    # wall-clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    X_col = X[:, [i]]  # list index keeps the single column two-dimensional
    ars = ARS(X=X_col, y=y, objective='regression', model_type=model_type, random_state=42)
    score, median, acceptable_min, objective, used_model_type = ars.calculate_multivariate_relevance_score()
    elapsed_time = time.perf_counter() - start_time
    # The feature name stays wrapped in a one-element list, as callers receive it today.
    return [[feature_names[i]], score, median, acceptable_min, objective, used_model_type, elapsed_time]

# Measure the total elapsed time of the whole sweep with a monotonic clock
start_time = time.perf_counter()

# Score every feature in parallel; each call returns one result row
feature_rows = Parallel(n_jobs=-1)(
    delayed(process_feature)(i) for i in range(n_features_)
)

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

# Assemble the rows into a labelled table (labels set at construction time)
results = pd.DataFrame(
    feature_rows,
    columns=['feature_names', 'ARS', 'median_original', 'Threshold', 'objective', 'model_type', 'elapsed_time'],
)
results


Total Processing Time: 9.70 seconds


Unnamed: 0,feature_names,ARS,median_original,Threshold,objective,model_type,elapsed_time
0,[0],0.172457,84.1005,101.6268,regression,linear,4.886539
1,[1],0.103421,91.1334,101.6457,regression,linear,5.336902
2,[2],0.259854,75.0548,101.4054,regression,linear,4.238376
3,[3],0.0,102.8119,101.9842,regression,linear,4.559596
4,[4],0.0,102.7781,101.9784,regression,linear,5.274292
5,[5],0.0,102.8422,102.0255,regression,linear,5.366618
6,[6],0.0,102.8571,102.0139,regression,linear,5.205786
7,[7],0.0,102.8261,101.9817,regression,linear,5.292018
8,[8],0.0,102.898,101.9697,regression,linear,5.116782
9,[9],0.0,102.8219,101.9724,regression,linear,5.123405


In [4]:
from sklearn.datasets import make_classification, make_regression
import time 
# Function to convert numbers to strings
def to_string(number_list):
    """Map ``str`` over ``number_list`` and collect the results in a new list."""
    return [*map(str, number_list)]

# Synthetic regression problem: 1500 samples, 30 features, 3 of them informative
n_features_ = 30
X, y = make_regression(
    n_samples=1500,
    n_features=n_features_,
    n_informative=3,
    noise=0.0,
    shuffle=False,
    random_state=4,
)

# Cast both arrays to float
X, y = X.astype(float), y.astype(float)

# One string label per feature column, built from the column index
feature_names = to_string(range(n_features_))

# Function to process each feature and record its processing time
def process_feature(i, model_type='knn'):
    """Score feature column ``i`` of the global ``X`` against ``y`` and time the run.

    Parameters
    ----------
    i : column index into ``X`` and ``feature_names``.
    model_type : estimator family passed to ``ARS`` (defaults to the model
        this cell evaluates, so existing one-argument calls are unchanged).

    Returns
    -------
    list
        ``[[feature_name], score, median, acceptable_min, objective,
        model_type, elapsed_seconds]``.
    """
    # perf_counter is monotonic, so the interval is not distorted by
    # wall-clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    X_col = X[:, [i]]  # list index keeps the single column two-dimensional
    ars = ARS(X=X_col, y=y, objective='regression', model_type=model_type, random_state=42)
    score, median, acceptable_min, objective, used_model_type = ars.calculate_multivariate_relevance_score()
    elapsed_time = time.perf_counter() - start_time
    # The feature name stays wrapped in a one-element list, as callers receive it today.
    return [[feature_names[i]], score, median, acceptable_min, objective, used_model_type, elapsed_time]

# Measure the total elapsed time of the whole sweep with a monotonic clock
start_time = time.perf_counter()

# Score every feature in parallel; each call returns one result row
feature_rows = Parallel(n_jobs=-1)(
    delayed(process_feature)(i) for i in range(n_features_)
)

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

# Assemble the rows into a labelled table (labels set at construction time)
results = pd.DataFrame(
    feature_rows,
    columns=['feature_names', 'ARS', 'median_original', 'Threshold', 'objective', 'model_type', 'elapsed_time'],
)
results


Total Processing Time: 15.72 seconds


Unnamed: 0,feature_names,ARS,median_original,Threshold,objective,model_type,elapsed_time
0,[0],0.165857,93.1121,111.6261,regression,knn,9.096797
1,[1],0.111334,99.0402,111.4482,regression,knn,8.636446
2,[2],0.258533,82.509,111.278,regression,knn,7.641924
3,[3],0.0,112.515,111.7595,regression,knn,9.108637
4,[4],0.0,113.5384,111.688,regression,knn,9.069638
5,[5],0.0,111.7933,111.4789,regression,knn,8.577278
6,[6],0.0,112.6206,111.6288,regression,knn,8.254133
7,[7],0.0,112.0357,111.6489,regression,knn,8.720911
8,[8],0.0,112.8818,111.7974,regression,knn,8.804026
9,[9],0.0,113.3135,111.7277,regression,knn,8.152266
