In [54]:
import numpy as np
from sklearn.utils import check_X_y
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import check_random_state
import logging


class FixedBorutaPy(BaseEstimator, TransformerMixin):
    """Simplified, Boruta-inspired feature selector based on random forests.

    On every iteration a ``RandomForestClassifier`` is fitted on the real data
    and several more are fitted on row-shuffled ("shadow") copies of the data.
    A feature scores a *hit* when its real importance exceeds the median of
    the averaged shadow importances. The loop ends once every feature has at
    least one hit or ``max_iter`` iterations have run.

    The final selection keeps the features whose importance in the **last**
    iteration is strictly above the median importance of that iteration.

    Parameters
    ----------
    estimator : object
        Stored for API compatibility with ``BorutaPy``. It is not used by
        ``fit``: the forests are always ``RandomForestClassifier`` instances
        created internally.
    n_estimators : int or 'auto', default='auto'
        Number of trees per forest. Any non-integer value (such as ``'auto'``)
        falls back to 100 trees.
    perc, alpha, two_step :
        Stored for API compatibility only; not used by ``fit``.
    max_iter : int, default=100
        Maximum number of iterations.
    random_state : int, RandomState instance or None, default=None
        Seed or generator controlling both the forests and the shadow
        shuffling. It is never modified by ``fit``.
    verbose : int, default=0
        When greater than 0, the iteration number is printed each round.

    Attributes
    ----------
    support_ : ndarray of bool, shape (n_features,)
        Mask of the selected features.
    ranking_ : ndarray of int, shape (n_features,)
        ``1`` for selected features and ``-1`` for the others (all zeros when
        no iteration was run). This is a decision code, not an ordinal rank.
    """

    # Number of shadow forests whose importances are averaged per iteration.
    _N_SHADOW_ROUNDS = 5
    # Forest size used when ``n_estimators`` is not an integer (e.g. 'auto').
    _DEFAULT_N_ESTIMATORS = 100

    def __init__(self, estimator, n_estimators='auto', perc=100, alpha=0.05, two_step=True,
                 max_iter=100, random_state=None, verbose=0):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.perc = perc
        self.alpha = alpha
        self.two_step = two_step
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        """Run the selection procedure on ``X``/``y`` and return ``self``."""
        return self._fit(X, y)

    def _fit(self, X, y):
        """Validate the inputs, iterate the forest comparison, set the fitted attributes."""
        X, y = check_X_y(X, y)
        # Keep the generator local: overwriting ``self.random_state`` would
        # mutate a constructor parameter (breaking get_params/clone) and make
        # a second ``fit`` call continue from an already-advanced stream.
        rng = check_random_state(self.random_state)
        n_feat = X.shape[1]
        # Loop-invariant, so resolve the forest size once.
        if isinstance(self.n_estimators, int):
            n_trees = self.n_estimators
        else:
            n_trees = self._DEFAULT_N_ESTIMATORS

        dec_reg = np.zeros(n_feat, dtype=int)
        hit_reg = np.zeros(n_feat, dtype=int)

        iter_num = 0
        while iter_num < self.max_iter:
            iter_num += 1
            if self.verbose > 0:
                print(f"Iteration: {iter_num}")

            forest = RandomForestClassifier(
                n_estimators=n_trees, n_jobs=-1, random_state=rng)
            forest.fit(X, y)

            imp = forest.feature_importances_
            median_imp = np.median(imp)

            imp_shadow = np.zeros(n_feat)
            for _ in range(self._N_SHADOW_ROUNDS):
                # Shuffle the rows of X with the seeded generator (not the
                # global NumPy RNG) so results are reproducible for a given
                # ``random_state``. y keeps its order, which decouples the
                # shuffled features from the target.
                X_shadow = rng.permutation(X)
                forest_shadow = RandomForestClassifier(
                    n_estimators=n_trees, n_jobs=-1, random_state=rng)
                forest_shadow.fit(X_shadow, y)
                imp_shadow += forest_shadow.feature_importances_

            imp_shadow /= self._N_SHADOW_ROUNDS
            median_imp_shadow = np.median(imp_shadow)

            # A hit is recorded for every feature beating the shadow median.
            hit_reg += imp > median_imp_shadow
            # The decision is recomputed from scratch each iteration, so only
            # the last iteration determines the final selection.
            dec_reg = np.where(imp > median_imp, 1, -1)

            if np.all(hit_reg > 0):
                break

        self.support_ = dec_reg == 1
        self.ranking_ = dec_reg
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``.

        ``X`` must support NumPy-style ``X[:, mask]`` indexing. Raises the
        scikit-learn not-fitted error when ``fit`` has not been called.
        """
        check_is_fitted(self, 'support_')
        return X[:, self.support_]

# FixedBorutaPy is meant to be used wherever BorutaPy was used before.
# The selector is deliberately NOT instantiated in this cell: `rfc` is only
# created in a later cell, so building it here raises NameError on a fresh
# kernel. The instance is constructed right after `rfc` is defined.

In [55]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score


In [59]:
# Read the raw CSV export into a DataFrame (path is relative to the notebook).
data = pd.read_csv(filepath_or_buffer='dataset - 2020-09-24.csv')

In [60]:
# Print the column names, non-null counts and dtypes of the loaded frame
# to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    571 non-null    object 
 1   Jersey Number           563 non-null    float64
 2   Club                    571 non-null    object 
 3   Position                571 non-null    object 
 4   Nationality             570 non-null    object 
 5   Age                     570 non-null    float64
 6   Appearances             571 non-null    int64  
 7   Wins                    571 non-null    int64  
 8   Losses                  571 non-null    int64  
 9   Goals                   571 non-null    int64  
 10  Goals per match         309 non-null    float64
 11  Headed goals            502 non-null    float64
 12  Goals with right foot   502 non-null    float64
 13  Goals with left foot    502 non-null    float64
 14  Penalties scored        309 non-null    fl

In [61]:
# Split the training columns by dtype so each group gets its own preprocessing.
# NOTE(review): X_train is not created in any cell shown here -- confirm the
# train/test split cell runs before this one.
numerical_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include='object').columns


In [62]:
# Numeric columns: fill missing values with the column median.
# Object columns: one-hot encode, ignoring categories unseen during fit.
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [63]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so the test data does not influence the
# learned medians or one-hot categories.
# NOTE(review): X_train / X_test are not created in any cell shown here.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [64]:
# Convert to dense arrays when the preprocessor output exposes `toarray`
# (i.e. it is a sparse container); leave it untouched otherwise.
X_train_processed = (
    X_train_processed.toarray()
    if hasattr(X_train_processed, "toarray")
    else X_train_processed
)
X_test_processed = (
    X_test_processed.toarray()
    if hasattr(X_test_processed, "toarray")
    else X_test_processed
)


In [65]:
# Shallow, seeded forest used as the final classifier on the selected features.
rfc = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)

In [66]:
# Build the feature selector. verbose=2 makes fit() print every iteration.
# Note: FixedBorutaPy as defined in this notebook trains its own internal
# RandomForestClassifier instances; the estimator passed here is only stored.
boruta_selector = FixedBorutaPy(
    estimator=rfc,
    n_estimators='auto',
    random_state=1,
    verbose=2,
)

In [67]:
# Fit the selector; on failure, print a short context message and re-raise
# so the full traceback is still shown.
try:
    boruta_selector.fit(X_train_processed, y_train)
except Exception as fit_error:
    print("An error occurred during Boruta fitting:", fit_error)
    raise


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

In [82]:
# Same fit call as the earlier fitting cell, executed again later in the
# session; failures are reported with a short message and then re-raised.
try:
    boruta_selector.fit(X_train_processed, y_train)
except Exception as exc:
    print("An error occurred during Boruta fitting:", exc)
    raise


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

In [83]:
# Keep only the columns chosen by the fitted selector, for both splits.
# Any failure is reported with a short message and then re-raised.
try:
    X_train_boruta = boruta_selector.transform(X_train_processed)
    X_test_boruta = boruta_selector.transform(X_test_processed)
except Exception as transform_error:
    print("An error occurred during Boruta transformation:", transform_error)
    raise


In [84]:
# Train the final classifier on the selected features and score it on the
# held-out split (fit() returns the estimator itself, so the calls chain).
y_pred = rfc.fit(X_train_boruta, y_train).predict(X_test_boruta)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9478260869565217
