In [1]:
!Pip install boruta

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
data = pd.read_csv("bank-additional-full.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                        Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                        --------------  ----- 
 0   age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"  41188 non-null  object
dtypes: object(1)
memory usage: 321.9+ KB


In [4]:
# Select only numerical columns
numerical_data = data.select_dtypes(include=[np.number])

# Check for columns with low variance
low_variance = numerical_data.var()[numerical_data.var() == 0]
print("Columns with low variance:", low_variance.index.tolist())



Columns with low variance: []


In [5]:
# Check for highly correlated columns among numerical data
correlation_matrix = numerical_data.corr().abs()
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
high_corr_columns = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
print("Highly correlated columns:", high_corr_columns)


Highly correlated columns: []


In [6]:
# Checking unique values in categorical columns
for column in data.select_dtypes(include=['object']).columns:
    unique_vals = data[column].nunique()
    print(f"Column '{column}' has {unique_vals} unique values.")


Column 'age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"' has 41176 unique values.


In [7]:
# Checking unique values in categorical columns
for column in data.select_dtypes(include=['object']).columns:
    unique_vals = data[column].nunique()
    print(f"Column '{column}' has {unique_vals} unique values.")


Column 'age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"' has 41176 unique values.


In [8]:
# Display the first few rows and column names to check if they are read correctly
print(data.head())
print(data.columns)


  age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"
0  56;"housemaid";"married";"basic.4y";"no";"no";...                                                                                                                                                                          
1  57;"services";"married";"high.school";"unknown...                                                                                                                                                                          
2  37;"services";"married";"high.school";"no";"ye...                                                                                                                                                                          
3  40;"admin.";"married";"basic.6y";"no";"no";"no...                                                        

In [9]:
# Checking unique values in categorical columns
for column in data.select_dtypes(include=['object']).columns:
    unique_vals = data[column].nunique()
    print(f"Column '{column}' has {unique_vals} unique values.")


Column 'age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"' has 41176 unique values.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv("bank-additional-full.csv", delimiter=';')

# Define features and target
X = data.drop(columns=['y'])  # Drop the target column
y = data['y']  # Target column

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Identify categorical features
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # Handle missing values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessing of training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [12]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("bank-additional-full.csv", delimiter=';')

# Define features and target
X = data.drop(columns=['y'])
y = data['y']

# Identify categorical and numeric features
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # Handle missing values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=1)

# Initialize and train the model
rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
boruta_selector = BorutaPy(rf_all_features, n_estimators='auto', verbose=2, random_state=1)
boruta_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	63
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	36
Iteration: 	9 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	36
Iteration: 	10 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	36
Iteration: 	11 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	36
Iteration: 	12 / 100
Confirmed: 	19
Tentative: 	8
Rejected: 	36
Iteration: 	13 / 100
Confirmed: 	19
Tentative: 	7
Rejected: 	37
Iteration: 	14 / 100
Confirmed: 	19
Tentative: 	7
Rejected: 	37
Iteration: 	15 / 100
Confirmed: 	19
Tentative: 	7
Rejected: 	37
Iteration: 	16 / 100
Confirmed: 	19
Tentative: 	7
Reject

In [14]:
print("ranking:", boruta_selector.ranking_)
print("No. of significant features: ", boruta_selector.n_features_)

ranking: [ 1  1  4  1  1  1  1  1  1  1 28  7 35 41 34  6 36 20  6 31 41 38 33 15
 12 42 24 31 16 21 43 26 13 23  2  3 44 27 39 26 31 38 29  1  1  1 10  8
 11  2  1  1 15  1  1 22 10 18 17 19  1  1  1]
No. of significant features:  19


In [15]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Sample data: Replace this with your actual data
# X_train should be a pandas DataFrame
X_train = pd.DataFrame({
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'feature3': np.random.rand(100)
})
y_train = np.random.randint(0, 2, size=100)  # Binary target variable

# Store original feature names
feature_names = X_train.columns.tolist()

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Boruta with the Random Forest classifier
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)

# Fit Boruta
boruta_selector.fit(X_train.values, y_train)

# Create a DataFrame with features and their rankings
selected_rf_features = pd.DataFrame({
    'Features': feature_names,
    'Ranking': boruta_selector.ranking_
})

# Sort the DataFrame by ranking
sorted_rf_features = selected_rf_features.sort_values(by='Ranking')

# Display the sorted DataFrame
print(sorted_rf_features)

   Features  Ranking
2  feature3        2
1  feature2        3
0  feature1        4


In [16]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Sample data: Replace this with your actual data
X = pd.DataFrame({
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'feature3': np.random.rand(100),
    'feature4': np.random.rand(100)
})
y = np.random.randint(0, 2, size=100)  # Binary target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Store original feature names
feature_names = X_train.columns.tolist()

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Boruta with the Random Forest classifier
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)

# Fit Boruta
boruta_selector.fit(X_train.values, y_train)

# Transform the training set to keep only important features
X_imp_train = boruta_selector.transform(X_train.values)

# Get the boolean mask of selected features
selected_mask = boruta_selector.support_

# Transform the test set using the same selected features
X_imp_test = X_test.values[:, selected_mask]

# Convert the transformed arrays back to DataFrames for easier handling
X_imp_train_df = pd.DataFrame(X_imp_train, columns=np.array(feature_names)[selected_mask])
X_imp_test_df = pd.DataFrame(X_imp_test, columns=np.array(feature_names)[selected_mask])

# Display the important features in the training set
print("Important Features in Training Set:")
print(X_imp_train_df.head())

# Display the important features in the test set
print("Important Features in Test Set:")
print(X_imp_test_df.head())

Important Features in Training Set:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]
Important Features in Test Set:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [17]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Sample data: Replace this with your actual data
X = pd.DataFrame({
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'feature3': np.random.rand(100),
    'feature4': np.random.rand(100)
})
y = np.random.randint(0, 2, size=100)  # Binary target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Store original feature names
feature_names = X_train.columns.tolist()

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Boruta with the Random Forest classifier
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)

# Fit Boruta
boruta_selector.fit(X_train.values, y_train)

# Check the selected features
selected_mask = boruta_selector.support_
print("Selected Features Mask:", selected_mask)

# Transform the training set to keep only important features
X_imp_train = boruta_selector.transform(X_train.values)

# Check the shape of the transformed training set
print("Shape of X_imp_train:", X_imp_train.shape)

# Transform the test set using the same selected features
X_imp_test = X_test.values[:, selected_mask]

# Check the shape of the transformed test set
print("Shape of X_imp_test:", X_imp_test.shape)

# Convert the transformed arrays back to DataFrames for easier handling
X_imp_train_df = pd.DataFrame(X_imp_train, columns=np.array(feature_names)[selected_mask])
X_imp_test_df = pd.DataFrame(X_imp_test, columns=np.array(feature_names)[selected_mask])

# Check if we have any features left
if X_imp_train_df.shape[1] == 0:
    print("No features selected for training. Please check Boruta results.")
else:
    # Initialize and fit the Random Forest classifier using the important features
    rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
    rf_boruta.fit(X_imp_train_df, y_train)

    # Optionally, evaluate the model on the test set
    y_pred = rf_boruta.predict(X_imp_test_df)

    # Display predictions
    print("Predictions on Test Set:")
    print(y_pred)

Selected Features Mask: [False False False False]
Shape of X_imp_train: (80, 0)
Shape of X_imp_test: (20, 0)
No features selected for training. Please check Boruta results.


In [19]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample data: Replace this with your actual data
X = pd.DataFrame({
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'feature3': np.random.rand(100),
    'feature4': np.random.rand(100)
})
y = np.random.randint(0, 2, size=100)  # Binary target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Store original feature names
feature_names = X_train.columns.tolist()

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Boruta with the Random Forest classifier
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', random_state=42)

# Fit Boruta
boruta_selector.fit(X_train.values, y_train)

# Get the boolean mask of selected features
selected_mask = boruta_selector.support_

# Check if any features were selected
if not any(selected_mask):
    print("No features were selected by Boruta. Proceeding with all features.")
    X_imp_train_df = X_train
    X_imp_test_df = X_test
else:
    # Transform the training set to keep only important features
    X_imp_train = boruta_selector.transform(X_train.values)

    # Transform the test set using the same selected features
    X_imp_test = X_test.values[:, selected_mask]

    # Convert the transformed arrays back to DataFrames for easier handling
    X_imp_train_df = pd.DataFrame(X_imp_train, columns=np.array(feature_names)[selected_mask])
    X_imp_test_df = pd.DataFrame(X_imp_test, columns=np.array(feature_names)[selected_mask])

# Check the shapes of the transformed datasets
print("Shape of X_imp_train_df:", X_imp_train_df.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_imp_test_df:", X_imp_test_df.shape)
print("Shape of y_test:", y_test.shape)

# Initialize and fit the Random Forest classifier using the important features
rf_boruta = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)

# Fit the model
rf_boruta.fit(X_imp_train_df, y_train)

# Make predictions on the test set
y_pred = rf_boruta.predict(X_imp_test_df)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)


No features were selected by Boruta. Proceeding with all features.
Shape of X_imp_train_df: (80, 4)
Shape of y_train: (80,)
Shape of X_imp_test_df: (20, 4)
Shape of y_test: (20,)
Accuracy on Test Set: 0.65
