In [None]:
import os
import pandas as pd

source_directory = 'sourcePath'
output_directory = 'destinationPath'

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

def clean_csv(file_path):
    data = pd.read_csv(file_path, delimiter=';')
    data.dropna(axis=1, how='all', inplace=True)
    output_file_path = os.path.join(output_directory, os.path.basename(file_path))
    data.to_csv(output_file_path, index=False, sep=';')


for file_name in os.listdir(source_directory):
    if file_name.endswith('.csv'):
        file_path = os.path.join(source_directory, file_name)
        clean_csv(file_path)


All CSV files have been cleaned and saved to the output directory.


In [None]:
import numpy as np
datadir = "data"

X_data = []
Y_data = []
X_val = []
Y_val = []


files = os.listdir(os.path.join(datadir, "ctrl"))
for file in files:
    path = os.path.join(datadir, "ctrl", file)
    df = pd.read_csv(path)
    X_data.append(np.reshape(df.to_numpy(), (-1,)))
    Y_data.append(np.array([1.0, 0.0]))

files = os.listdir(os.path.join(datadir, "pt"))
for file in files:
    path = os.path.join(datadir, "pt", file)
    df = pd.read_csv(path).apply(lambda x: x.replace(';', ''))
    X_data.append(np.reshape(df.to_numpy(), (-1,)))
    Y_data.append(np.array([0.0, 1.0]))

files = os.listdir(os.path.join(datadir, "v"))
files.sort()
for file in files:
    path = os.path.join(datadir, "v", file)
    df = pd.read_csv(path)
    X_val.append(np.reshape(df.to_numpy(), (-1,)))
    if 'd' in file:
        Y_val.append(np.array([0.0, 1.0]))
    else:
        Y_val.append(np.array([1.0, 0.0]))


In [None]:
X_data


[array([-0.00161 , -0.00189 , -0.00198 , ..., -0.000881, -0.00106 ,
        -0.0011  ]),
 array([-0.000578, -0.000889, -0.00102 , ...,  0.00012 ,  0.000307,
         0.000405]),
 array([-0.00168, -0.00146, -0.00134, ...,  0.00133,  0.00174,  0.00197]),
 array([-0.00207, -0.00224, -0.00222, ..., -0.00159, -0.00212, -0.00226]),
 array([-0.00169, -0.00144, -0.00131, ...,  0.00152,  0.00206,  0.00233]),
 array([-0.0011  , -0.000955, -0.000834, ...,  0.000673,  0.000955,
         0.000893]),
 array([ 9.59e-04, -5.21e-06, -9.57e-04, ...,  4.80e-04,  2.43e-04,
         4.28e-04]),
 array([-0.00164, -0.00188, -0.00197, ..., -0.00091, -0.00114, -0.00121]),
 array([-0.000728, -0.000637, -0.000505, ..., -0.0011  , -0.00122 ,
        -0.00128 ]),
 array([ 1.30e-03,  6.09e-04, -1.12e-04, ...,  1.89e-04,  9.57e-05,
         3.25e-04]),
 array([ 0.000451, -0.000373, -0.00126 , ..., -0.000488, -0.00035 ,
         0.000122]),
 array([-1.76e-04, -8.60e-04, -1.52e-03, ..., -4.47e-04, -4.29e-04,
         

In [None]:
def extract_features(arrays):
    features = []
    for array in arrays:
        mean = np.mean(array)
        std = np.std(array)
        min_val = np.min(array)
        max_val = np.max(array)
        median = np.median(array)
        range_val = max_val - min_val
        q1 = np.percentile(array, 25)
        q3 = np.percentile(array, 75)

        features.append({
            "mean": mean,
            "std": std,
            "min": min_val,
            "max": max_val,
            "median": median,
            "range": range_val,
            "q1": q1,
            "q3": q3
        })

    features = pd.DataFrame(features)
    return features


In [None]:
feature_set = extract_features(X_data)
feature_set_val = extract_features(X_val)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(feature_set, Y_data)
y_pred = clf.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, y_pred))
print("Classification Report:\n", classification_report(Y_val, y_pred))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.60      1.00      0.75         3

   micro avg       0.67      0.67      0.67         6
   macro avg       0.80      0.67      0.62         6
weighted avg       0.80      0.67      0.62         6
 samples avg       0.67      0.67      0.67         6



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, verbose=1)
grid_search_rf.fit(feature_set, Y_data)
best_rf = grid_search_rf.best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(feature_set, Y_data)
knn_pred = knn_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, knn_pred))
print("Classification Report:\n", classification_report(Y_val, knn_pred))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.60      1.00      0.75         3

   micro avg       0.67      0.67      0.67         6
   macro avg       0.80      0.67      0.62         6
weighted avg       0.80      0.67      0.62         6
 samples avg       0.67      0.67      0.67         6



In [None]:
Y_data = np.argmax(Y_data, axis=1)
Y_val = np.argmax(Y_val, axis=1)


In [None]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', C=1.0)  # Linear kernel
svm_model.fit(feature_set, Y_data)
svm_pred = svm_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, svm_pred))
print("Classification Report:\n", classification_report(Y_val, svm_pred))


Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.50      1.00      0.67         3

    accuracy                           0.50         6
   macro avg       0.25      0.50      0.33         6
weighted avg       0.25      0.50      0.33         6



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_dist_svm = {
    'C': uniform(0.1, 10),
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm = RandomizedSearchCV(SVC(), param_distributions=param_dist_svm, n_iter=100, cv=5, verbose=1, random_state=42)
random_search_svm.fit(feature_set, Y_data)
best_svm = random_search_svm.best_estimator_


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(feature_set, Y_data)
nb_pred = nb_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, nb_pred))
print("Classification Report:\n", classification_report(Y_val, nb_pred))


Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.50      0.33      0.40         3

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.50      0.50      0.49         6

