In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from imblearn.ensemble import BalancedRandomForestClassifier




In [4]:
# Read the merged feature table from disk.
df = pd.read_csv('merged_dataset.csv')

# Canonical column layout: mfcc_1 .. mfcc_300 followed by the target column.
# NOTE(review): read_csv runs with its default header handling, so the first
# row of the file is taken as a header and then overwritten below. Confirm the
# CSV really starts with a header row; otherwise one sample is silently lost.
columns = [f'mfcc_{idx}' for idx in range(1, 301)]
columns.append('label')

# Replace whatever header was read with the canonical names.
df.columns = columns

# Plain-text preview of the first rows.
print(df.head())

        mfcc_1      mfcc_2      mfcc_3     mfcc_4     mfcc_5     mfcc_6  \
0 -1056.156930  100.706891   84.862294  61.978023  36.441811  12.850290   
1 -1053.566761  104.572517   89.251661  66.992090  41.908143  18.372516   
2 -1054.493806  103.244414   87.882304  65.582452  40.487623  16.988013   
3 -1044.344650  116.555050   98.293259  71.862921  42.277698  14.831793   
4 -1037.935844  124.927492  104.753948  75.632052  43.180481  13.306042   

     mfcc_7     mfcc_8     mfcc_9    mfcc_10  ...  mfcc_292   mfcc_293  \
0 -5.030501 -15.043214 -16.971785 -12.373160  ...  1.097410   8.823732   
1  0.060017 -10.797882 -13.775119 -10.161991  ...  1.919385   9.382641   
2 -1.245174 -12.006527 -14.909664 -11.281917  ...  3.386919  10.842312   
3 -6.079423 -17.866068 -20.164598 -14.694518  ...  3.274818  10.696498   
4 -9.129471 -21.325330 -22.999589 -16.203308  ...  3.561111  10.834448   

    mfcc_294   mfcc_295   mfcc_296  mfcc_297  mfcc_298  mfcc_299  mfcc_300  \
0  12.765254  12.431488   

In [5]:
# Print the frame's structural summary (index range, column count, dtypes, memory usage).
df.info()
# Per-column summary statistics; being the cell's last expression, this is the
# value the notebook renders as the cell output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424782 entries, 0 to 424781
Columns: 301 entries, mfcc_1 to label
dtypes: float64(301)
memory usage: 975.5 MB


Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,mfcc_292,mfcc_293,mfcc_294,mfcc_295,mfcc_296,mfcc_297,mfcc_298,mfcc_299,mfcc_300,label
count,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,...,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0,424782.0
mean,-957.25179,73.429367,63.587957,49.229162,32.865257,17.208224,4.588648,-3.502608,-6.652877,-5.502739,...,2.052021,5.984551,8.34431,8.635752,7.017027,4.160127,0.99521,-1.579928,-2.944798,2.964808
std,93.061677,28.0037,23.140265,16.322959,9.424632,5.998543,7.846342,9.642355,9.411135,7.427699,...,2.479999,3.319391,4.151845,3.867859,2.713942,1.767335,2.360405,3.208475,3.376831,0.28438
min,-1131.37085,0.0,0.0,0.0,-8.052197,-10.1068,-31.239292,-47.488093,-46.642887,-34.282181,...,-7.140624,-3.907965,-3.207446,-3.123274,-3.264276,-8.39623,-15.581895,-18.701742,-18.76126,0.0
25%,-1047.499379,52.860956,46.600853,37.090646,25.264859,12.809885,-0.119585,-9.119909,-12.133904,-10.09826,...,0.420948,3.484893,4.846934,5.357046,4.922851,3.010286,-0.536171,-3.863631,-5.477259,3.0
50%,-978.547489,76.241309,66.240453,51.35278,34.086228,17.131187,5.559055,-2.831372,-6.571349,-5.756559,...,1.957027,5.740892,7.847505,8.38323,6.992698,4.082627,1.124616,-1.468043,-3.064989,3.0
75%,-885.080499,91.779168,79.169843,60.863535,40.169603,21.809376,10.334772,3.077508,-0.563506,-0.857388,...,3.498399,8.364641,11.669916,11.833003,9.1145,5.21689,2.633316,0.889726,-0.359829,3.0
max,-405.694693,216.038255,173.814956,116.89762,67.363842,41.974007,27.413303,21.702301,18.714559,17.11709,...,22.024933,26.631968,27.885411,26.992258,21.979536,15.633528,13.494096,11.531253,9.691933,3.0


In [6]:
# Configure TensorFlow to use GPU
# Uses the TF1 compatibility API: a ConfigProto with allow_growth enabled is
# handed to a v1 Session, which is then registered as the Keras backend session.
# NOTE(review): presumably intended to stop TensorFlow from reserving all GPU
# memory up front — confirm this still has the desired effect on the installed
# TensorFlow 2.x version.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

# Print GPU devices
# The return value of gpu_device_name() is discarded; only the list of
# physical devices below is actually reported.
tf.test.gpu_device_name()
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
for gpu in gpus:
    print(gpu)

# Print session data
# Shows the repr of the Session object created above.
print(sess)


Num GPUs Available:  1
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
<tensorflow.python.client.session.Session object at 0x000001E8157910A0>


In [5]:
# Baseline: Balanced Random Forest on standardized MFCC features.
X = df.drop('label', axis=1).values  # Features
y = df['label'].values  # Labels

# Split the data into training and testing sets.
# The split arguments match the other cells of this notebook so that every
# model is evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fit on the training rows only so
# that no statistics of the test rows leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create an instance of Balanced Random Forest classifier.
# sampling_strategy / replacement / bootstrap are pinned to the library's
# historical defaults: leaving them unset makes recent imbalanced-learn
# releases emit one FutureWarning per parameter (presumably the three `warn(`
# lines this cell used to print) and would silently change the results once
# the defaults are switched.
brf = BalancedRandomForestClassifier(
    n_estimators=100,  # Adjust the number of estimators as needed
    sampling_strategy='auto',
    replacement=False,
    bootstrap=True,
    random_state=42,
)

# Fit the classifier to the training data
brf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = brf.predict(X_test)

# Evaluate the model.
# Accuracy is derived from the predictions already computed instead of calling
# brf.score(), which would run a second full prediction pass over the test set.
# NOTE: with one class making up ~98% of the rows, accuracy alone is a weak
# indicator — read the per-class precision/recall in the report below.
accuracy = float(np.mean(y_pred == y_test))

print(f"Balanced Random Forest Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))
print()

  warn(
  warn(
  warn(


Balanced Random Forest Accuracy: 0.9236201843285426
              precision    recall  f1-score   support

         0.0       0.11      0.90      0.20       512
         1.0       0.37      0.89      0.53       540
         2.0       0.18      0.90      0.30       506
         3.0       1.00      0.92      0.96     83399

    accuracy                           0.92     84957
   macro avg       0.42      0.90      0.50     84957
weighted avg       0.99      0.92      0.95     84957




In [8]:
import pickle
# Export the trained Balanced Random Forest classifier (`brf`) to a pickle file.
# NOTE(review): only the classifier is written. `brf` was fit on
# StandardScaler-transformed features in the training cell, and that scaler is
# not saved here — confirm that consumers of this file apply the same scaling
# before calling predict.
with open('epilepsy_prediction_model.pkl', 'wb') as file:
    pickle.dump(brf, file)

In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from colorama import Fore, Style

def print_decorative_log(message, color=Fore.BLUE, style=Style.RESET_ALL):
    """Print ``message`` inside a three-line frame of ``#`` characters.

    Args:
        message: Text to show on the middle line of the frame.
        color: String emitted immediately before the top border
            (defaults to ``Fore.BLUE``).
        style: String emitted immediately after the bottom border
            (defaults to ``Style.RESET_ALL``).
    """
    # The border spans the message plus "# " on the left and " #" on the right.
    border = "".ljust(len(message) + 4, "#")
    print(color + border)
    print("# {} #".format(message))
    print(border + style)

# Load the dataset
X = df.drop('label', axis=1).values  # Features
y = df['label'].values  # Labels

# Split the data into training and testing sets (same arguments as the other
# cells, so every model in the notebook is judged on the same held-out rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print_decorative_log("Dataset Splitted", Fore.GREEN)

# Define a standard scaler
print_decorative_log("Normalization & Standardization", Fore.YELLOW)
scaler = StandardScaler()

# Define SMOTE oversampling
print_decorative_log("SMOTE Oversampling", Fore.YELLOW)
smote = SMOTE(random_state=42)

# Define the models and their respective parameter grids.
# Every pipeline is scaler -> SMOTE -> classifier. The scaler/SMOTE objects are
# shared between the pipeline definitions; GridSearchCV clones the pipeline it
# is given before fitting, so no fitted state is shared between models.
models = [
    {
        'name': 'SVM',
        'pipeline': ImbPipeline([
            ('scaler', scaler),
            ('smote', smote),
            ('svm', SVC(random_state=42))
        ]),
        'param_grid': {'svm__C': [0.1, 1, 10], 'svm__kernel': ['linear', 'rbf']}
    },
    {
        'name': 'Random Forest',
        'pipeline': ImbPipeline([
            ('scaler', scaler),
            ('smote', smote),
            ('rf', RandomForestClassifier(random_state=42))
        ]),
        'param_grid': {'rf__n_estimators': [100, 200, 500], 'rf__max_depth': [None, 5, 10]}
    },
    {
        'name': 'AdaBoost',
        'pipeline': ImbPipeline([
            ('scaler', scaler),
            ('smote', smote),
            ('adaboost', AdaBoostClassifier(random_state=42))
        ]),
        'param_grid': {'adaboost__n_estimators': [50, 100, 200], 'adaboost__learning_rate': [0.1, 0.5, 1]}
    },
    {
        'name': 'XGBoost',
        'pipeline': ImbPipeline([
            ('scaler', scaler),
            ('smote', smote),
            ('xgboost', XGBClassifier(random_state=42))
        ]),
        'param_grid': {'xgboost__n_estimators': [100, 200, 500], 'xgboost__learning_rate': [0.1, 0.5, 1]}
    },
    {
        'name': 'Balanced Random Forest',
        'pipeline': ImbPipeline([
            ('scaler', scaler),
            ('smote', smote),
            ('brf', BalancedRandomForestClassifier(random_state=42))
        ]),
        'param_grid': {'brf__n_estimators': [100, 200, 500], 'brf__max_depth': [None, 5, 10]}
    }
]

# Variables to store the best model and its performance.
# `best_accuracy` holds the best mean cross-validated *balanced* accuracy
# (see the scoring note inside the loop); the name is kept for compatibility.
best_model = None
best_accuracy = 0.0

# Iterate through each model
for model in models:
    print(f"Model: {model['name']}")
    print("Parameter Grid:", model['param_grid'])
    print()

    print_decorative_log("Grid Search", Fore.YELLOW)
    # Perform Grid Search for the current model.
    # Candidates are ranked by balanced accuracy (mean per-class recall). The
    # validation folds are not resampled by SMOTE, so they keep the original
    # class ratio (~98% of rows in one class); plain accuracy would therefore
    # rank a model that ignores the minority classes near the top.
    grid_search = GridSearchCV(model['pipeline'], model['param_grid'], scoring='balanced_accuracy', cv=5)
    print_decorative_log("Training", Fore.YELLOW)
    grid_search.fit(X_train, y_train)

    # Make predictions on the test set
    print_decorative_log("Predicting", Fore.YELLOW)
    y_pred = grid_search.best_estimator_.predict(X_test)

    print_decorative_log("Evaluation", Fore.YELLOW)
    # best_score_ is a cross-validation mean computed on the training split,
    # not a test-set metric, so it is labelled as such and the held-out
    # accuracy is reported separately next to the classification report.
    print("Best Parameters:", grid_search.best_params_)
    print("Best CV balanced accuracy:", grid_search.best_score_)
    print("Test accuracy:", float(np.mean(y_pred == y_test)))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print()

    # Check if this model has the best cross-validated score so far
    if grid_search.best_score_ > best_accuracy:
        best_accuracy = grid_search.best_score_
        best_model = model['name']

# Print the best model
print(f"The best model is: {best_model} with CV balanced accuracy: {best_accuracy}")

[32m####################
# Dataset Splitted #
####################[0m
[33m###################################
# Normalization & Standardization #
###################################[0m
[33m######################
# SMOTE Oversampling #
######################[0m
Model: SVM
Parameter Grid: {'svm__C': [0.1, 1, 10], 'svm__kernel': ['linear', 'rbf']}

[33m###############
# Grid Search #
###############[0m
[33m############
# Training #
############[0m


In [None]:

# Feature matrix and label vector, both cast to 32-bit floats.
X = df.drop(columns='label').values.astype(np.float32)  # Features
y = df['label'].values.astype(np.float32)  # Labels

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stand-alone scaler instance (the pipelines below each carry their own).
scaler = StandardScaler()

# SVM: standardize, then a default-configured support vector classifier.
svm_model = SVC()
svm_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('svm', svm_model)])

# Random Forest: standardize, then a default-configured forest.
rf_model = RandomForestClassifier()
rf_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('rf', rf_model)])

# RNN Model
def create_rnn_model():
    """Build and compile the two-layer LSTM classifier.

    The input length is read from the notebook-level ``X_train``
    (``X_train.shape[1]``), so that variable must exist when this is called.
    The network is created under a ``OneDeviceStrategy`` bound to ``'GPU:0'``.

    NOTE(review): the declared input shape is ``(steps, 1)`` while the
    notebook passes 2-D feature arrays — confirm the trailing axis is added
    somewhere before the data reaches the first layer.

    Returns:
        The compiled ``Sequential`` model with a 4-unit softmax output.
    """
    n_steps = X_train.shape[1]
    with tf.distribute.OneDeviceStrategy('GPU:0').scope():  # Use the first GPU
        network = Sequential([
            LSTM(64, input_shape=(n_steps, 1), return_sequences=True),
            LSTM(32, return_sequences=False),
            Dense(4, activation='softmax'),
        ])
        network.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
    return network

# CNN Model
def create_cnn_model():
    """Build and compile the 1-D convolutional classifier.

    The input length is read from the notebook-level ``X_train``
    (``X_train.shape[1]``), so that variable must exist when this is called.
    The network is created under a ``OneDeviceStrategy`` bound to ``'GPU:0'``.

    NOTE(review): the declared input shape is ``(steps, 1)`` while the
    notebook passes 2-D feature arrays — confirm the trailing axis is added
    somewhere before the data reaches the first layer.

    Returns:
        The compiled ``Sequential`` model with a 4-unit softmax output.
    """
    n_steps = X_train.shape[1]
    with tf.distribute.OneDeviceStrategy('GPU:0').scope():  # Use the first GPU
        network = Sequential([
            Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_steps, 1)),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(4, activation='softmax'),
        ])
        network.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
    return network

# Wrap Keras models for use in scikit-learn.
# SciKeras receives the builder through `model`; `build_fn` is only kept by
# the library as a deprecated alias.
rnn_model = KerasClassifier(model=create_rnn_model, epochs=10, batch_size=10, verbose=1)
cnn_model = KerasClassifier(model=create_cnn_model, epochs=10, batch_size=10, verbose=1)

# Give the networks the same standardization step the SVM / Random Forest
# pipelines of this cell already have. Without it they were the only models
# trained on the raw feature scale.
rnn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rnn', rnn_model)
])
cnn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('cnn', cnn_model)
])

# Fit and evaluate models
models = {#'SVM': svm_pipeline,
          #'Random Forest': rf_pipeline,
          'RNN': rnn_pipeline,
          'CNN': cnn_pipeline}

# Entries that must be trained on one-hot targets (their builders compile with
# categorical cross-entropy and end in a 4-unit softmax layer).
keras_model_names = ('RNN', 'CNN')

accuracy_values = []
reports = []

for model_name, model in models.items():
    if model_name in keras_model_names:
        # num_classes is fixed to the 4 output units of the networks so the
        # target width never depends on which labels happen to be present.
        model.fit(X_train, to_categorical(y_train, num_classes=4))
    else:
        model.fit(X_train, y_train)

    # Reduce the predictions to 1-D class ids whether the estimator returns
    # class labels directly or a one-hot / probability matrix.
    y_pred = np.asarray(model.predict(X_test))
    if y_pred.ndim > 1:
        y_pred = np.argmax(y_pred, axis=1)

    # Accuracy is computed from the predictions above. Calling
    # model.score(X_test, y_test) would hand 1-D labels to an estimator that
    # was fit on one-hot targets (which SciKeras rejects) and would also run a
    # second prediction pass over the whole test set.
    accuracy = float(np.mean(y_pred == y_test))
    accuracy_values.append(accuracy)

    report = classification_report(y_test, y_pred)
    reports.append(report)

    print(f"{model_name} Accuracy: {accuracy}")
    print(report)
    print()

model_names = list(models.keys())
plt.bar(model_names, accuracy_values)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.show()

for model_name, report in zip(model_names, reports):
    print(f"Classification Report for {model_name}:")
    print(report)
    print()

# Pick the winner from the accuracies already measured (first entry wins
# ties) instead of re-scoring every model on the test set.
best_model_name = model_names[int(np.argmax(accuracy_values))]
best_model = models[best_model_name]
print(f"Best Model: {best_model_name}")

In [None]:


# TensorFlow session setup through the TF1 compatibility API, with
# allow_growth switched on in the session's GPU options.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

# Read the feature table.
# NOTE(review): '/path/to/dataset.csv' looks like a placeholder — point it at
# the real file before running this cell.
df = pd.read_csv('/path/to/dataset.csv')

# Canonical column layout: mfcc_1 .. mfcc_300 followed by the target column.
columns = [f'mfcc_{idx}' for idx in range(1, 301)]
columns.append('label')

# Replace whatever header was read with the canonical names.
df.columns = columns

# Plain-text preview of the first rows.
print(df.head())

X = df.drop(columns='label').values  # Features
y = df['label'].values  # Labels

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stand-alone scaler instance (the pipelines below each carry their own).
scaler = StandardScaler()

# SVM Pipeline: standardize, then a default-configured SVC.
svm_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC())])

# Random Forest Pipeline: standardize, then a default-configured forest.
rf_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestClassifier())])

# RNN Model
def create_rnn_model():
    """Build and compile the two-layer LSTM classifier.

    The input length is read from the notebook-level ``X_train``
    (``X_train.shape[1]``), so that variable must exist when this is called.
    The network is created under a ``OneDeviceStrategy`` bound to ``'GPU:0'``.

    NOTE(review): the declared input shape is ``(steps, 1)`` while the
    notebook passes 2-D feature arrays — confirm the trailing axis is added
    somewhere before the data reaches the first layer.

    Returns:
        The compiled ``Sequential`` model with a 4-unit softmax output.
    """
    n_steps = X_train.shape[1]
    with tf.distribute.OneDeviceStrategy('GPU:0').scope():  # Use the first GPU
        network = Sequential([
            LSTM(64, input_shape=(n_steps, 1), return_sequences=True),
            LSTM(32, return_sequences=False),
            Dense(4, activation='softmax'),
        ])
        network.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
    return network

# CNN Model
def create_cnn_model():
    """Build and compile the 1-D convolutional classifier.

    The input length is read from the notebook-level ``X_train``
    (``X_train.shape[1]``), so that variable must exist when this is called.
    The network is created under a ``OneDeviceStrategy`` bound to ``'GPU:0'``.

    NOTE(review): the declared input shape is ``(steps, 1)`` while the
    notebook passes 2-D feature arrays — confirm the trailing axis is added
    somewhere before the data reaches the first layer.

    Returns:
        The compiled ``Sequential`` model with a 4-unit softmax output.
    """
    n_steps = X_train.shape[1]
    with tf.distribute.OneDeviceStrategy('GPU:0').scope():  # Use the first GPU
        network = Sequential([
            Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_steps, 1)),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(4, activation='softmax'),
        ])
        network.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
    return network

# Wrap Keras models for use in scikit-learn.
# SciKeras receives the builder through `model`; `build_fn` is only kept by
# the library as a deprecated alias.
rnn_model = KerasClassifier(model=create_rnn_model, epochs=10, batch_size=10, verbose=1)
cnn_model = KerasClassifier(model=create_cnn_model, epochs=10, batch_size=10, verbose=1)

# Give the networks the same standardization step as the SVM / Random Forest
# pipelines, so all four candidates are searched on identically scaled inputs.
rnn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rnn', rnn_model)
])
cnn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('cnn', cnn_model)
])

# Define parameter grid for GridSearchCV.
# Keys are prefixed with the pipeline step name they are routed to.
svm_param_grid = {'svm__C': [1, 10], 'svm__gamma': [0.001, 0.01]}
rf_param_grid = {'rf__n_estimators': [50, 100], 'rf__max_depth': [10, 20]}
rnn_param_grid = {'rnn__batch_size': [10, 20], 'rnn__epochs': [10, 20]}
cnn_param_grid = {'cnn__batch_size': [10, 20], 'cnn__epochs': [10, 20]}

# Define cross-validation folds
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# Entries that are Keras networks: they need one-hot targets and are searched
# without process-level parallelism.
keras_model_names = ('RNN', 'CNN')

# Perform GridSearchCV
grids = {}
for model_name, pipeline, param_grid in [('SVM', svm_pipeline, svm_param_grid),
                                         ('RandomForest', rf_pipeline, rf_param_grid),
                                         ('RNN', rnn_pipeline, rnn_param_grid),
                                         ('CNN', cnn_pipeline, cnn_param_grid)]:
    is_keras = model_name in keras_model_names
    # The networks are pinned to 'GPU:0' by their builders; running their
    # folds in parallel worker processes would start one TensorFlow runtime
    # per worker, all competing for that single device, so they run serially.
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                               n_jobs=1 if is_keras else -1, cv=cv, verbose=2)
    # num_classes matches the 4 output units of the networks.
    grid_search.fit(X_train, to_categorical(y_train, num_classes=4) if is_keras else y_train)
    grids[model_name] = grid_search

# Evaluate models
for model_name, grid in grids.items():
    # Reduce the predictions to 1-D class ids whether the estimator returns
    # class labels directly or a one-hot / probability matrix.
    y_pred = np.asarray(grid.predict(X_test))
    if y_pred.ndim > 1:
        y_pred = np.argmax(y_pred, axis=1)
    print(f"{model_name} Best Params: {grid.best_params_}")
    # best_score_ is the mean cross-validated score on the training split; the
    # held-out accuracy is reported separately so the two are not confused.
    print(f"{model_name} Mean CV Accuracy: {grid.best_score_}")
    print(f"{model_name} Test Accuracy: {float(np.mean(y_pred == y_test))}")
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

# Select the best model (highest mean cross-validated score)
best_model_name = max(grids, key=lambda name: grids[name].best_score_)
best_model = grids[best_model_name].best_estimator_
print(f"Best Model: {best_model_name}")

In [None]:
# X = df.drop('label', axis=1).values  # Features
# y = df['label'].values  # Labels
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define a standard scaler
# scaler = StandardScaler()

# # SVM Pipeline
# svm_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('svm', SVC())
# ])

# # Random Forest Pipeline
# rf_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('rf', RandomForestClassifier())
# ])

# # RNN Model
# def create_rnn_model():
#     model = Sequential()
#     model.add(LSTM(64, input_shape=(X_train.shape[1], 1), return_sequences=True))
#     model.add(LSTM(32, return_sequences=False))
#     model.add(Dense(4, activation='softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

# # CNN Model
# def create_cnn_model():
#     model = Sequential()
#     model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Flatten())
#     model.add(Dense(50, activation='relu'))
#     model.add(Dense(4, activation='softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

# # Wrap Keras models for use in scikit-learn
# rnn_model = KerasClassifier(build_fn=create_rnn_model, epochs=10, batch_size=10, verbose=0)
# cnn_model = KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=10, verbose=0)

# # Define parameter grid for GridSearchCV
# svm_param_grid = {'svm__C': [1, 10], 'svm__gamma': [0.001, 0.01]}
# rf_param_grid = {'rf__n_estimators': [50, 100], 'rf__max_depth': [10, 20]}
# rnn_param_grid = {'batch_size': [10, 20], 'epochs': [10, 20]}
# cnn_param_grid = {'batch_size': [10, 20], 'epochs': [10, 20]}

# # Define cross-validation folds
# cv = KFold(n_splits=5, random_state=42, shuffle=True)

# # Perform GridSearchCV
# grids = {}
# for model_name, pipeline, param_grid in [('SVM', svm_pipeline, svm_param_grid),
#                                          ('RandomForest', rf_pipeline, rf_param_grid),
#                                          ('RNN', rnn_model, rnn_param_grid),
#                                          ('CNN', cnn_model, cnn_param_grid)]:
#     grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=cv)
#     grid_search.fit(X_train, to_categorical(y_train) if model_name in ['RNN', 'CNN'] else y_train)
#     grids[model_name] = grid_search

# # Evaluate models
# for model_name, grid in grids.items():
#     if model_name in ['RNN', 'CNN']:
#         y_pred = grid.predict(X_test)
#         y_pred = np.argmax(y_pred, axis=1)
#     else:
#         y_pred = grid.predict(X_test)
#     print(f"{model_name} Best Params: {grid.best_params_}")
#     print(f"{model_name} Accuracy: {grid.best_score_}")
#     print(classification_report(y_test, y_pred))

# # Select the best model
# best_model_name = max(grids, key=lambda name: grids[name].best_score_)
# best_model = grids[best_model_name].best_estimator_
# print(f"Best Model: {best_model_name}")

# # Save the model if needed
# # best_model.model.save('best_model.h5')  # Uncomment this line to save the Keras model
