<a href="https://colab.research.google.com/github/Victhagas/EEG-Analysis/blob/main/ALSP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading libraries

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

# Loading data

In [21]:
dataset = pd.read_csv("READYCPTfourthrun.csv")

# Separating matrix of features from the dependent variable

In [22]:
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [23]:
x

array([[ 1.00000000e+00, -3.21006938e-01,  1.45900342e-01,
         2.83669934e-01,  3.75000391e-01, -1.36040051e+00,
        -1.39985980e+00, -1.46684067e+00, -1.61392729e+00,
         6.98692337e-01,  1.01106286e+00,  1.48115942e+00,
         2.30816212e+00,  1.45503210e+00,  9.10777348e-01,
         9.04378906e-01,  1.37907547e+00],
       [ 1.00000000e+00,  8.38200127e-01,  9.00155901e-01,
        -2.86074610e-02,  3.66858873e-01,  1.52620208e+00,
         1.57247875e-01,  2.99182413e-01,  8.44643214e-01,
        -3.45342155e-01,  4.24193242e-01, -4.04199605e-01,
        -4.27386895e-01, -1.41536476e+00, -8.05983109e-01,
        -2.71176662e-01, -8.21237865e-01],
       [ 1.00000000e+00,  6.31221291e-01,  1.41228936e-01,
         2.56051863e-01,  8.28995288e-02,  1.05531749e+00,
        -8.75903532e-01, -5.60909250e-01, -4.42961671e-01,
        -2.79916621e-01,  4.72181114e-01,  4.53073312e-01,
         4.25984795e-01, -1.05628926e+00,  5.49874613e-01,
         1.29334506e-01,  2.2

In [24]:
def bootstrap_mean_diff(x, y, column_index, n_bootstrap=1000):
    np.random.seed(42)
    mean_diffs = []

    for _ in range(n_bootstrap):
        bootstrap_indices = np.random.randint(low=0, high=len(x), size=len(x))

        bootstrap_x = x[bootstrap_indices]
        bootstrap_y = y[bootstrap_indices]

        # Separating the data based on genotype
        data_1 = bootstrap_x[bootstrap_y == 0, column_index]
        data_2 = bootstrap_x[bootstrap_y == 1, column_index]

        mean_1 = np.mean(data_1)
        mean_2 = np.mean(data_2)

        mean_diff = mean_2 - mean_1
        mean_diffs.append(mean_diff)

    confidence_interval = np.percentile(mean_diffs, [2.5, 97.5])  # 95% of CI

    return np.mean(mean_diffs), confidence_interval

In [28]:
mean_diff, ci = bootstrap_mean_diff(x, y, 11)  # index 1 for HR2 (we can change it)
print(f"Mean difference in HR1 between genotypes: {mean_diff}")
print(f"95% CI for mean difference: {ci}")

Mean difference in HR1 between genotypes: 0.029257820310830428
95% CI for mean difference: [-0.48917023  0.52229863]


In [27]:
dataset

Unnamed: 0,Sex,HR2,HR1,HR0.5,HR0.2,FAR2,FAR1,FAR0.5,FAR0.2,DP2,DP1,DP0.5,DP0.2,CB2,CB1,CB0.5,CB0.2,Genotype
0,1,-0.321007,0.1459,0.28367,0.375,-1.360401,-1.39986,-1.466841,-1.613927,0.698692,1.011063,1.481159,2.308162,1.455032,0.910777,0.904379,1.379075,1
1,1,0.8382,0.900156,-0.028607,0.366859,1.526202,0.157248,0.299182,0.844643,-0.345342,0.424193,-0.4042,-0.427387,-1.415365,-0.805983,-0.271177,-0.821238,1
2,1,0.631221,0.141229,0.256052,0.0829,1.055317,-0.875904,-0.560909,-0.442962,-0.279917,0.472181,0.453073,0.425985,-1.056289,0.549875,0.129335,0.226182,0
3,1,-1.821413,-0.630809,-0.877705,0.249481,-0.721763,-0.901431,-0.296571,0.156652,-0.848695,0.21212,-0.529536,0.089508,1.553309,0.930469,0.768281,-0.248347,0
4,0,-1.124727,-0.937018,-1.183746,-0.604176,0.14971,0.285099,0.086082,-1.21835,-1.380622,-1.178709,-1.183913,0.611473,0.726212,0.390042,0.59324,1.259802,0
5,0,0.328463,-0.894025,-0.544964,0.316214,1.125935,0.520966,1.955051,2.811528,-0.750635,-1.2656,-1.741347,-1.476481,-0.726235,0.245055,-0.694972,-1.616835,1
6,0,-1.626723,-1.600362,-0.276503,-0.505995,-0.499644,-0.658537,-1.105003,-1.107796,-0.856082,-0.950056,0.733189,0.741363,1.338104,1.336528,1.101274,1.20853,0
7,1,-2.956079,-2.539205,-2.524276,-1.627817,0.073657,0.123426,-0.330639,0.534861,-2.21819,-2.057806,-1.729127,-1.851838,1.613086,1.319275,1.783171,0.609894,0
8,1,-0.540032,0.824311,0.321577,-0.254521,0.877994,1.742419,0.377017,0.344913,-1.300947,-0.446498,-0.2235,-0.597919,0.060506,-1.440239,-0.490613,-0.119471,0
9,1,-1.586274,-1.405576,-2.101618,-0.784668,1.005251,1.060724,0.800544,0.809122,-2.15092,-1.891937,-2.266385,-1.307644,0.525949,0.272157,0.786734,-0.029954,1


# Splitting data into training and test sets

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Building Logistic Regression instance

In [30]:
bagged_logistic = BaggingClassifier(
    estimator=LogisticRegression(max_iter=10000),
    n_estimators=10,
    random_state=42,
)

bagged_logistic.fit(x_train, y_train)

accuracy = accuracy_score(y_test, bagged_logistic.predict(x_test))
print(f"Accuracy with Bagging: {accuracy:.4f}")

Accuracy with Bagging: 0.6667


In [31]:
classifier = LogisticRegression()

# Perform Grid Search to find the best hyperparameters (tuning)


In [32]:
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "newton-cg", "saga", "sag", "lbfgs"],
    "max_iter": [100, 500, 1000, 2000, 10000]
}

# Get the best parameters and estimator from the Grid Search




In [33]:
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring="accuracy")

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

best_model = grid_search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 0.001, 'max_iter': 100, 'solver': 'newton-cg'}
Best Score: 0.5222222222222223


# Implementation of custom function to select accuracy-weighed features

In [34]:
selected_features = list(range(x_train.shape[1]))
initial_accuracy = accuracy_score(y_test, best_model.predict(x_test))

for i in selected_features:
    features_to_use = [feature for feature in selected_features if feature != i]

    if len(features_to_use) > 0:
        x_subset = x_train[:, features_to_use]
        classifier.fit(x_subset, y_train)
        y_pred_subset = classifier.predict(x_test[:, features_to_use])
        accuracy_subset = accuracy_score(y_test, y_pred_subset)

        if accuracy_subset > initial_accuracy:
            print(f"Removing feature in position {i} - Accuracy improved to {accuracy_subset:.4f}")
            initial_accuracy = accuracy_subset
            selected_features = features_to_use
        else:
            print(f"Keeping feature in position {i} - Accuracy: {accuracy_subset:.4f}")
    else:
        print(f"All features removed - Terminating Process")
        break

print("Selected Features:")
for feature_index in selected_features:
    if feature_index < len(dataset.columns) - 1:
        print(dataset.columns[feature_index])
    else:
        print("Invalid Index")

Removing feature in position 0 - Accuracy improved to 0.5833
Keeping feature in position 1 - Accuracy: 0.5833
Keeping feature in position 2 - Accuracy: 0.5833
Keeping feature in position 3 - Accuracy: 0.5833
Keeping feature in position 4 - Accuracy: 0.5833
Keeping feature in position 5 - Accuracy: 0.5000
Keeping feature in position 6 - Accuracy: 0.5833
Keeping feature in position 7 - Accuracy: 0.5833
Keeping feature in position 8 - Accuracy: 0.4167
Keeping feature in position 9 - Accuracy: 0.5000
Keeping feature in position 10 - Accuracy: 0.5833
Keeping feature in position 11 - Accuracy: 0.5000
Removing feature in position 12 - Accuracy improved to 0.6667
Keeping feature in position 13 - Accuracy: 0.6667
Keeping feature in position 14 - Accuracy: 0.6667
Keeping feature in position 15 - Accuracy: 0.5833
Keeping feature in position 16 - Accuracy: 0.6667
Selected Features:
HR2
HR1
HR0.5
HR0.2
FAR2
FAR1
FAR0.5
FAR0.2
DP2
DP1
DP0.5
CB2
CB1
CB0.5
CB0.2


# Fit the best regressor on the training data



In [35]:
best_model.fit(x_train[:, selected_features], y_train)

# Make predictions on the test set using selected features



In [36]:
y_pred = best_model.predict(x_test[:, selected_features])

# Evaluate performance on test set

In [37]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[0 6]
 [0 6]]
Accuracy: 0.5


# Building Random Forest classifier instance

In [38]:
rf_classifier = RandomForestClassifier()

# Perform Grid Search to find the best hyperparameters (tuning)


In [39]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Get the best parameters and estimator from the Grid Search


In [40]:
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

best_rf_model = grid_search.best_estimator_

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
Best Score: 0.475


# Implementation of custom function to select accuracy-weighed features

In [None]:
selected_features = list(range(x_train.shape[1]))
initial_accuracy = accuracy_score(y_test, best_rf_model.predict(x_test))

for i in selected_features:
    features_to_use = [feature for feature in selected_features if feature != i]

    if len(features_to_use) > 0:
        x_subset = x_train[:, features_to_use]
        rf_classifier.fit(x_subset, y_train)
        y_pred_subset = rf_classifier.predict(x_test[:, features_to_use])
        accuracy_subset = accuracy_score(y_test, y_pred_subset)

        if accuracy_subset > initial_accuracy:
            print(f"Removing feature in position {i} - Accuracy improved to {accuracy_subset:.4f}")
            initial_accuracy = accuracy_subset
            selected_features = features_to_use
        else:
            print(f"Keeping feature in position {i} - Accuracy: {accuracy_subset:.4f}")
    else:
        print(f"All features removed - Terminating Process")
        break

print("Selected Features:")
for feature_index in selected_features:
    # Assuming 'dataset' has column names
    if feature_index < len(dataset.columns) - 1:
        print(dataset.columns[feature_index])
    else:
        print("Invalid Index")

Removing feature in position 0 - Accuracy improved to 0.5833
Removing feature in position 1 - Accuracy improved to 0.7500
Keeping feature in position 2 - Accuracy: 0.6667
Keeping feature in position 3 - Accuracy: 0.4167
Keeping feature in position 4 - Accuracy: 0.5000
Keeping feature in position 5 - Accuracy: 0.6667
Keeping feature in position 6 - Accuracy: 0.6667
Keeping feature in position 7 - Accuracy: 0.6667
Removing feature in position 8 - Accuracy improved to 0.8333
Keeping feature in position 9 - Accuracy: 0.6667
Keeping feature in position 10 - Accuracy: 0.7500
Keeping feature in position 11 - Accuracy: 0.6667
Keeping feature in position 12 - Accuracy: 0.5833
Keeping feature in position 13 - Accuracy: 0.6667
Keeping feature in position 14 - Accuracy: 0.6667
Keeping feature in position 15 - Accuracy: 0.6667
Keeping feature in position 16 - Accuracy: 0.6667
Selected Features:
HR1
HR0.5
HR0.2
FAR2
FAR1
FAR0.5
DP2
DP1
DP0.5
DP0.2
CB2
CB1
CB0.5
CB0.2


# Fit the best regressor on the training data

In [None]:
best_rf_model.fit(x_train[:, selected_features], y_train)

# Make predictions on the test set using selected features

In [None]:
y_pred = best_rf_model.predict(x_test[:, selected_features])

# Evaluate performance on test set

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[3 1]
 [3 5]]
Accuracy: 0.6666666666666666
