<a href="https://colab.research.google.com/github/Saraalkhalifa/SZ_ML-classifier/blob/main/data_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Importing the libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import copy

# **Importing the dataset**


In [None]:
# Load the CSV stored on the mounted Google Drive into a pandas DataFrame.
# NOTE(review): the path is hard-coded to one Drive layout; adjust it when
# running the notebook from a different account or environment.
data = pd.read_csv('/content/drive/MyDrive/al/data/rs22.csv')

# **Incremental removal**

In [None]:
# Name of the label column. The leading space is part of the column name as
# pandas reads it (other headers printed further down, e.g. ' gender' and
# ' age', carry the same leading space) -- presumably it comes from the CSV
# header itself, so do not strip it here.
target_column = ' group'

In [None]:
# Drop rows with NaN values in the target column: a row without a label can
# neither be used for training nor for scoring. `data` is rebound to the
# filtered frame, so every later cell sees only labelled rows.
data = data.dropna(subset=[target_column])

In [None]:
# Separate the label vector from the feature matrix (every non-target column)
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Function to calculate Entropy for a column
def calculate_entropy(column_values):
    """Return the Shannon entropy (in bits) of the values in a column.

    Parameters
    ----------
    column_values : pandas.Series
        Column whose empirical value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct observed values, where ``p``
        is the share of each value among the non-missing entries. Missing
        values are ignored; a column without any observed value yields 0.0.
    """
    counts = column_values.value_counts()

    # Categorical columns report unused categories with a count of 0, which
    # would turn the sum into NaN through 0 * log2(0).
    counts = counts[counts > 0]

    # value_counts() skips NaN, so normalise by the number of observed values
    # rather than len(column_values); otherwise the proportions do not sum to 1
    # and columns with missing data get an artificially distorted score.
    total_observed = counts.sum()
    if total_observed == 0:
        return 0.0

    proportions = counts.to_numpy(dtype=float) / total_observed
    # "+ 0.0" normalises the -0.0 produced for single-valued columns.
    return float(-np.sum(proportions * np.log2(proportions))) + 0.0

In [None]:
# Initialize dictionaries to store accuracy information
column_accuracy = {}     # removed column -> test accuracy of the model trained without it
impact_on_accuracy = {}  # not populated by this cell; kept so the name stays defined

# A column's entropy depends only on its own values and the rows never change
# inside the loop, so score every column once instead of on every iteration.
entropy_scores = {column: calculate_entropy(X[column]) for column in X.columns}

while len(X.columns) > 1:  # Continue until only one column is left
    # Identify the remaining column with the lowest Entropy
    # (ties resolve to the first such column in X's column order)
    min_entropy_column = min(X.columns, key=entropy_scores.get)

    # Remove column with the lowest Entropy; X is rebound rather than mutated
    # in place, and ends the loop holding the single surviving column
    X = X.drop(columns=[min_entropy_column])

    # Split before imputing so the imputer only ever sees training rows.
    # y needs no NaN handling here: rows with a missing target were dropped
    # before X and y were built, and dropping labels after the split would
    # leave y_train shorter than X_train.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handle missing values: learn the column means on the training split and
    # reuse them for the test split (fitting on all rows leaks test data)
    imputer = SimpleImputer(strategy='mean')  # You can use other strategies as needed
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Train a model and evaluate accuracy. The tree is seeded so that repeated
    # runs give the same numbers and differences between iterations come from
    # the removed column rather than from random tie-breaking.
    clf = DecisionTreeClassifier(random_state=42)  # You can use any model of your choice
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Display information
    print(f"Removed column: {min_entropy_column} | Accuracy: {accuracy}")

    # Store the accuracy impact after removing the column
    column_accuracy[min_entropy_column] = accuracy

Removed column: rejected | Accuracy: 0.9962484368486869
Removed column:  gender | Accuracy: 0.995831596498541
Removed column: condition | Accuracy: 0.9868695289704044
Removed column:  education | Accuracy: 0.976240100041684
Removed column:  age | Accuracy: 0.921634014172572
Removed column: ITI | Accuracy: 0.9301792413505627
Removed column: time_ms | Accuracy: 0.8672363484785327
Removed column: FC3_B0 | Accuracy: 0.8666110879533139
Removed column: C3_B0 | Accuracy: 0.8649437265527303
Removed column: CP3_B0 | Accuracy: 0.8674447686536056
Removed column: C4_B0 | Accuracy: 0.8680700291788245
Removed column: CP4_B0 | Accuracy: 0.8726552730304293
Removed column: FC4_B0 | Accuracy: 0.8718215923301376
Removed column: Fz_B0 | Accuracy: 0.8666110879533139
Removed column: FCz_B0 | Accuracy: 0.8697373905794081
Removed column: Cz_B0 | Accuracy: 0.8770320967069613
Removed column: CP3_N100 | Accuracy: 0.8751563151313048
Removed column: Fz_N100 | Accuracy: 0.8724468528553564
Removed column: FC4_N100 |

In [None]:
# Show the one feature column still left in X once the incremental-removal
# loop above has finished (the loop stops when a single column remains)
print("Final column remaining:", X.columns)

Final column remaining: Index(['Cz'], dtype='object')


# **Leave-one-feature-out experiment**

In [None]:
# Discard rows whose last column is missing; `data` is rebound to the
# filtered frame instead of being modified in place
data = data.dropna(subset=[data.columns[-1]])

In [None]:
# Select the target by name (the same `target_column` used for the incremental
# removal section) instead of assuming it is the last column of the frame;
# every other column is a feature
X = data.drop(columns=[target_column])  # Features
y = data[target_column]                 # Target variable

In [None]:
# Function to calculate accuracy after removing each column
def calculate_accuracy_with_removal(X, y):
    """Score a decision tree with each feature of ``X`` left out in turn.

    For every column, a copy of ``X`` without that column is split 80/20
    (fixed seed), missing values are mean-imputed, a seeded
    ``DecisionTreeClassifier`` is trained and its test accuracy recorded.
    ``X`` itself is never modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; mean imputation is applied, so columns are expected
        to be numeric.
    y : array-like
        Target values, aligned row-for-row with ``X``.

    Returns
    -------
    accuracies : dict
        Maps each left-out column to the test accuracy obtained without it.
    best_column : label or None
        Column whose removal gave the highest accuracy (the first one on
        ties); ``None`` if no removal scored above 0.
    best_accuracy : float
        Accuracy reached when ``best_column`` is removed (0 if none).
    """
    best_accuracy = 0
    best_column = None
    accuracies = {}

    for col in X.columns:
        X_temp = X.drop(col, axis=1)

        # Split first so that the imputer is fitted on training rows only;
        # fitting it on the full frame would leak test-set statistics.
        X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

        # Impute missing values with the training-split column means. The
        # arrays are used as-is: re-wrapping them with the original column
        # labels would fail whenever the imputer drops an all-NaN column.
        imputer = SimpleImputer(strategy='mean')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        accuracies[col] = acc

        # Strict comparison keeps the first column on ties
        if acc > best_accuracy:
            best_accuracy = acc
            best_column = col

    return accuracies, best_column, best_accuracy

# Calculate accuracy by removing one column at a time.
# best_col / max_accuracy hold the removal with the highest accuracy and stay
# available as notebook-level names after this cell.
accuracies, best_col, max_accuracy = calculate_accuracy_with_removal(X, y)

# One line per left-out column, in the order the columns appear in X
print("Accuracy with each column removed:")
for col, acc in accuracies.items():
    print(f"Removed column '{col}': Accuracy = {acc}")


Accuracy with each column removed:
Removed column 'condition': Accuracy = 0.9908295122967903
Removed column 'Fz': Accuracy = 0.9970821175489787
Removed column 'FCz': Accuracy = 0.9952063359733222
Removed column 'Cz': Accuracy = 0.9949979157982493
Removed column 'FC3': Accuracy = 0.9952063359733222
Removed column 'FC4': Accuracy = 0.9987494789495623
Removed column 'C3': Accuracy = 0.9952063359733222
Removed column 'C4': Accuracy = 0.996040016673614
Removed column 'CP3': Accuracy = 0.9954147561483951
Removed column 'CP4': Accuracy = 0.9954147561483951
Removed column 'time_ms': Accuracy = 0.9966652771988328
Removed column 'ITI': Accuracy = 0.9945810754481034
Removed column 'rejected': Accuracy = 0.9954147561483951
Removed column 'Fz_N100': Accuracy = 0.9947894956231763
Removed column 'FCz_N100': Accuracy = 0.9949979157982493
Removed column 'Cz_N100': Accuracy = 0.9949979157982493
Removed column 'FC3_N100': Accuracy = 0.9949979157982493
Removed column 'FC4_N100': Accuracy = 0.9949979157982

In [None]:
# Report the single removal that produced the highest test accuracy
print(f"\nMaximum Accuracy after removing one column at a time: {max_accuracy} by removing '{best_col}'")



Maximum Accuracy after removing one column at a time: 0.9987494789495623 by removing 'FC4'


In [None]:
# List the columns of X. The leave-one-out function works on a copy
# (X.drop(...) without inplace), so X still holds every feature column here.
# NOTE(review): the printed label "Final column remaining:" is carried over
# from the incremental-removal section and does not describe this output.
print("Final column remaining:", X.columns)

Final column remaining: Index(['condition', 'Fz', 'FCz', 'Cz', 'FC3', 'FC4', 'C3', 'C4', 'CP3', 'CP4',
       'time_ms', 'ITI', 'rejected', 'Fz_N100', 'FCz_N100', 'Cz_N100',
       'FC3_N100', 'FC4_N100', 'C3_N100', 'C4_N100', 'CP3_N100', 'CP4_N100',
       'Fz_P200', 'FCz_P200', 'Cz_P200', 'FC3_P200', 'FC4_P200', 'C3_P200',
       'C4_P200', 'CP3_P200', 'CP4_P200', 'Fz_B0', 'FCz_B0', 'Cz_B0', 'FC3_B0',
       'FC4_B0', 'C3_B0', 'C4_B0', 'CP3_B0', 'CP4_B0', 'Fz_B1', 'FCz_B1',
       'Cz_B1', 'FC3_B1', 'FC4_B1', 'C3_B1', 'C4_B1', 'CP3_B1', 'CP4_B1',
       ' gender', ' age', ' education'],
      dtype='object')
