In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import time
import xgboost as xgb
import lightgbm as lgb

# Load data from CSV
data = pd.read_csv('/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardize features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)


# Initializing models with improved parameters
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C= 0.001),
    'Naive Bayes': GaussianNB(var_smoothing=1e-11),
    'Decision Tree': DecisionTreeClassifier(max_depth= 7, min_samples_split=5),
    'Random Forest': RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'XGBoost': xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'LightGBM': lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8),
   # Modified architectures for neural network models
  'Feedforward Neural Network': Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
   ]),
# Convolutional Neural Network
  'Convolutional Neural Network': Sequential([
    Conv1D(128, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
  ]),
# Deep Belief Network
  'Deep Belief Network': Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
   ]),
    'Voting Classifier': VotingClassifier(estimators=[
        ('Logistic Regression', LogisticRegression(max_iter=1000, C= 0.001)),
        ('Naive Bayes', GaussianNB(var_smoothing=1e-11)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=7, min_samples_split=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5)),
        ('KNN', KNeighborsClassifier(n_neighbors=15)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('XGBoost', xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('LightGBM', lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
    ]),
'MLP Classifier': MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),  # Increased hidden layer sizes
    max_iter=1000,  # Increased maximum number of iterations
    activation='relu',  # ReLU activation function
    solver='adam',  # Adam optimizer
    random_state=42
)
}

# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


2024-04-23 14:15:02.839572: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 14:15:02.839707: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 14:15:03.025475: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://sciki

Logistic Regression: Accuracy = 0.7875, Precision = 0.7901, Recall = 0.7875, F1 Score = 0.7852, Training Time = 10.7022 s, Prediction Time = 0.0434 s
Naive Bayes: Accuracy = 0.6679, Precision = 0.7409, Recall = 0.6679, F1 Score = 0.6758, Training Time = 0.1233 s, Prediction Time = 0.0537 s
Decision Tree: Accuracy = 0.8589, Precision = 0.8699, Recall = 0.8589, F1 Score = 0.8577, Training Time = 2.1491 s, Prediction Time = 0.0243 s
Random Forest: Accuracy = 0.9554, Precision = 0.9568, Recall = 0.9554, F1 Score = 0.9554, Training Time = 52.9638 s, Prediction Time = 0.1749 s
KNN: Accuracy = 0.4232, Precision = 0.5381, Recall = 0.4232, F1 Score = 0.4158, Training Time = 0.0372 s, Prediction Time = 0.2146 s
Gradient Boosting: Accuracy = 0.9679, Precision = 0.9682, Recall = 0.9679, F1 Score = 0.9679, Training Time = 8341.0641 s, Prediction Time = 0.1760 s
XGBoost: Accuracy = 0.9768, Precision = 0.9773, Recall = 0.9768, F1 Score = 0.9769, Training Time = 553.5942 s, Prediction Time = 0.1992 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324900
[LightGBM] [Info] Number of data points in the train set: 2240, number of used features: 1304
[LightGBM] [Info] Start training from score -1.964838
[LightGBM] [Info] Start training from score -1.927334
[LightGBM] [Info] Start training from score -1.974438
[LightGBM] [Info] Start training from score -1.955329
[LightGBM] [Info] Start training from score -1.912113
[LightGBM] [Info] Start training from score -1.974438
[LightGBM] [Info] Start training from score -1.915138
Voting Classifier: Accuracy = 0.9821, Precision = 0.9823, Recall = 0.9821, F1 Score = 0.9821, Training Time = 9412.4224 s, Prediction Time = 1.2157 s
MLP Classifier: Accuracy = 0.7554, Precision = 0.7596, Recall = 0.7554, F1 Score = 0.7522, Training Time = 58.8225 s, Prediction Time = 0.0538 s


HYPERPARAMETERS

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb
import time

# Load data from CSV
data = pd.read_csv('/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define hyperparameters for each model
param_grid = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Decision Tree': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    'KNN': {'n_neighbors': [3, 5, 7]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.01, 0.001]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.01, 0.001]},
    'LightGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.01, 0.001]}
}

# Initialize models with default parameters
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LightGBM': lgb.LGBMClassifier()
}

# Initialize GridSearchCV objects for hyperparameter tuning
grid_searches = {}
for name, model in models.items():
    param_grid_model = param_grid.get(name, {})  # Get hyperparameters for the current model
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_model, scoring='accuracy', cv=5, n_jobs=-1)
    grid_searches[name] = grid_search

# Training and evaluating each model with hyperparameter tuning
for name, grid_search in grid_searches.items():
    start_time = time.time()
    
    grid_search.fit(X_train_scaled, y_train_encoded)
    
    train_time = time.time() - start_time
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Best Params = {grid_search.best_params_}, Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s')


Logistic Regression: Best Params = {'C': 10}, Accuracy = 0.9554, Precision = 0.9562, Recall = 0.9554, F1 Score = 0.9555, Training Time = 59.6209 s
Decision Tree: Best Params = {'max_depth': 10, 'min_samples_split': 2}, Accuracy = 0.8893, Precision = 0.8906, Recall = 0.8893, F1 Score = 0.8892, Training Time = 39.4901 s
Random Forest: Best Params = {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy = 0.9554, Precision = 0.9569, Recall = 0.9554, F1 Score = 0.9555, Training Time = 450.8667 s
KNN: Best Params = {'n_neighbors': 3}, Accuracy = 0.7482, Precision = 0.7707, Recall = 0.7482, F1 Score = 0.7451, Training Time = 1.2483 s


Models of LR

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import xgboost as xgb
import lightgbm as lgb
import time

# Load data from CSV
data = pd.read_csv('/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

#solver{‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}
# Initializing models
models = {
    'Logistic Regression0': LogisticRegression(max_iter=1000, penalty="l2"),
    'Logistic Regression1': LogisticRegression(max_iter=1000, penalty="none"),
    'Logistic Regression2': LogisticRegression(max_iter=1000, solver="liblinear", penalty="l1"),
    'Logistic Regression3': LogisticRegression(max_iter=1000, solver="liblinear", penalty="l2"),
    'Logistic Regression4': LogisticRegression(max_iter=1000, solver="newton-cg", penalty="l2"),
    'Logistic Regression5': LogisticRegression(max_iter=1000, solver="newton-cg", penalty="none"),
    'Logistic Regression6': LogisticRegression(max_iter=1000, solver="newton-cholesky", penalty="l2"),
    'Logistic Regression7': LogisticRegression(max_iter=1000, solver="newton-cholesky", penalty="none"),
    'Logistic Regression8': LogisticRegression(max_iter=1000, solver="sag", penalty="l2"),
    'Logistic Regression9': LogisticRegression(max_iter=1000, solver="sag", penalty="none"),
    'Logistic Regression10': LogisticRegression(max_iter=1000, solver="saga", penalty="l2"),
    'Logistic Regression11': LogisticRegression(max_iter=1000, solver="saga", penalty="elasticnet", l1_ratio=0.5),
    'Logistic Regression12': LogisticRegression(max_iter=1000, solver="saga", penalty="l1"),
    'Logistic Regression13': LogisticRegression(max_iter=1000, solver="saga", penalty="none"),
    'Logistic Regression14': LogisticRegression(max_iter=1000, class_weight="balanced"),
    'Logistic Regression15': LogisticRegression(max_iter=1000, C=10),
    'Logistic Regression16': LogisticRegression(max_iter=1000, C=0.001),
    'Logistic Regression17': LogisticRegression(max_iter=1000, C=0.1),
}



# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_cnn, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    else:
        model.fit(X_train, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


2024-04-25 18:30:35.936264: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 18:30:35.936359: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 18:30:36.062757: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimiz

Logistic Regression0: Accuracy = 0.8071, Precision = 0.8114, Recall = 0.8071, F1 Score = 0.8065, Training Time = 10.1072 s, Prediction Time = 0.0421 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression1: Accuracy = 0.8107, Precision = 0.8131, Recall = 0.8107, F1 Score = 0.8089, Training Time = 8.9106 s, Prediction Time = 0.0423 s
Logistic Regression2: Accuracy = 0.8804, Precision = 0.8823, Recall = 0.8804, F1 Score = 0.8785, Training Time = 138.8424 s, Prediction Time = 0.0252 s
Logistic Regression3: Accuracy = 0.8571, Precision = 0.8619, Recall = 0.8571, F1 Score = 0.8547, Training Time = 116.4706 s, Prediction Time = 0.0343 s




Logistic Regression4: Accuracy = 0.8375, Precision = 0.8433, Recall = 0.8375, F1 Score = 0.8355, Training Time = 799.3787 s, Prediction Time = 0.0421 s
Logistic Regression5: Accuracy = 0.8107, Precision = 0.8175, Recall = 0.8107, F1 Score = 0.8080, Training Time = 39.4196 s, Prediction Time = 0.0418 s
Logistic Regression6: Accuracy = 0.8482, Precision = 0.8514, Recall = 0.8482, F1 Score = 0.8452, Training Time = 28.3227 s, Prediction Time = 0.0707 s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL 

Logistic Regression7: Accuracy = 0.8071, Precision = 0.8153, Recall = 0.8071, F1 Score = 0.8075, Training Time = 11.0187 s, Prediction Time = 0.0432 s




Logistic Regression8: Accuracy = 0.7411, Precision = 0.7456, Recall = 0.7411, F1 Score = 0.7385, Training Time = 123.2374 s, Prediction Time = 0.0248 s




Logistic Regression9: Accuracy = 0.7411, Precision = 0.7456, Recall = 0.7411, F1 Score = 0.7385, Training Time = 123.2694 s, Prediction Time = 0.0312 s




Logistic Regression10: Accuracy = 0.7018, Precision = 0.7052, Recall = 0.7018, F1 Score = 0.6974, Training Time = 146.6237 s, Prediction Time = 0.0255 s




Logistic Regression11: Accuracy = 0.7018, Precision = 0.7052, Recall = 0.7018, F1 Score = 0.6974, Training Time = 357.1472 s, Prediction Time = 0.0249 s




Logistic Regression12: Accuracy = 0.7018, Precision = 0.7048, Recall = 0.7018, F1 Score = 0.6975, Training Time = 359.8141 s, Prediction Time = 0.0296 s




Logistic Regression13: Accuracy = 0.7018, Precision = 0.7052, Recall = 0.7018, F1 Score = 0.6974, Training Time = 146.1248 s, Prediction Time = 0.0244 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression14: Accuracy = 0.7911, Precision = 0.7953, Recall = 0.7911, F1 Score = 0.7889, Training Time = 8.9063 s, Prediction Time = 0.0419 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression15: Accuracy = 0.7946, Precision = 0.7975, Recall = 0.7946, F1 Score = 0.7917, Training Time = 8.9724 s, Prediction Time = 0.0430 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression16: Accuracy = 0.7875, Precision = 0.7901, Recall = 0.7875, F1 Score = 0.7852, Training Time = 9.8156 s, Prediction Time = 0.0413 s
Logistic Regression17: Accuracy = 0.7893, Precision = 0.7942, Recall = 0.7893, F1 Score = 0.7870, Training Time = 8.9973 s, Prediction Time = 0.0417 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


wav2vec feature extractiom

In [7]:
import os
import torch
import torch.nn as nn
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torchaudio
from torchaudio.transforms import Resample
import numpy as np
import time

class EmotionModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs.last_hidden_state.squeeze().mean(axis=0)
        return hidden_states

def process_func(audio_dir):
    device = 'cpu'
    model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2Model.from_pretrained(model_name)
    model.to(device)

    features_list = []
    paths_list = []

    resampler = Resample(orig_freq=24414, new_freq=16000)

    start_time = time.time()

    for root, dirs, files in os.walk(audio_dir):
        for file in files:
            audio_path = os.path.join(root, file)
            array, fs = torchaudio.load(audio_path)
            array_resampled = resampler(array)
            input_values = processor(array_resampled.squeeze(), sampling_rate=16000, return_tensors="pt")
            input_values = input_values.input_values.to(device)
            with torch.no_grad():
                hidden_states = model(input_values)
            features_list.append(hidden_states.last_hidden_state.squeeze().mean(axis=0).cpu().numpy().tolist())
            paths_list.append(audio_path)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Time taken to extract features:", elapsed_time, "seconds")

    return features_list, paths_list

# Example usage:
audio_dir = '/kaggle/input/toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data'
features_list, paths_list = process_func(audio_dir)

# Convert lists to DataFrame
data = {'Path': paths_list}
for i in range(len(features_list[0])):
    data[f'Feature_{i}'] = [feature[i] for feature in features_list]

df = pd.DataFrame(data)

# Save DataFrame to CSV
csv_file = '/kaggle/working/featuresforTessWav2vec.csv'
df.to_csv(csv_file, index=False)
print("Features saved to:", csv_file)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time taken to extract features: 2517.401907682419 seconds
Features saved to: /kaggle/working/featuresforTessWav2vec.csv


In [24]:
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')
data.head()
#31 mis

Unnamed: 0,Path,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_1015,Feature_1016,Feature_1017,Feature_1018,Feature_1019,Feature_1020,Feature_1021,Feature_1022,Feature_1023,Emotions
0,/kaggle/input/toronto-emotional-speech-set-tes...,-0.007627,0.006393,-0.009016,-0.006457,-0.005413,-0.00774,-0.079955,0.005843,0.079756,...,0.010743,0.009884,0.008419,0.0052,-0.031274,0.333547,0.006994,0.009229,0.003448,fear
1,/kaggle/input/toronto-emotional-speech-set-tes...,-0.007651,0.006757,-0.00865,-0.004695,-0.005046,-0.008044,-0.078741,0.006259,0.028489,...,0.010798,0.045597,0.008512,0.005134,-0.082161,0.31479,0.007343,0.009305,0.004567,fear
2,/kaggle/input/toronto-emotional-speech-set-tes...,-0.007654,0.007805,-0.008533,-0.005969,-0.005547,-0.007536,0.011286,0.006206,0.10066,...,0.010882,0.014696,0.008473,0.043847,-0.053118,0.300673,0.007296,0.009384,0.00446,fear
3,/kaggle/input/toronto-emotional-speech-set-tes...,-0.007416,0.007406,-0.007871,-0.006976,-0.005285,-0.008491,-0.08046,0.005944,0.120987,...,0.011316,-0.004393,0.008385,0.00521,0.001423,0.202869,0.007186,0.009218,0.005208,fear
4,/kaggle/input/toronto-emotional-speech-set-tes...,-0.007399,0.006563,-0.007964,-0.005912,-0.00546,-0.008129,-0.074036,0.006423,0.064441,...,0.010993,-0.006457,0.008377,-0.001801,-0.09061,0.247391,0.007254,0.009252,0.005467,fear


In [9]:
import pandas as pd

# Read the CSV file into a DataFrame
csv_file = '/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv
df = pd.read_csv(csv_file)

# Get the number of rows and columns
num_rows, num_cols = df.shape

# Print the number of rows and columns
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)


Number of rows: 2800
Number of columns: 1025


In [22]:
import pandas as pd

# Read the first CSV file
csv1 = pd.read_csv("/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv")

# Read the second CSV file
csv2 = pd.read_csv("/kaggle/input/wav2vecextractedtess/featuresforTessWav2vec.csv")

# Copy the 'Emotions' column from csv1 to csv2
csv2['Emotions'] = csv1['Emotions']

# Save the modified csv2 to a new CSV file
csv2.to_csv("/kaggle/working/ExtractedFeaturesForTessDatasetWav2Vec.csv", index=False)

print("Emotions column copied from csv1 to csv2 and saved to output.csv")


Emotions column copied from csv1 to csv2 and saved to output.csv


In [23]:
import pandas as pd

# Read the CSV file into a DataFrame
csv_file_path = "/kaggle/working/ExtractedFeaturesForTessDatasetWav2Vec.csv"
data = pd.read_csv(csv_file_path)

# Print a particular column (e.g., "emotion")
print(data["Emotions"])


0           fear
1           fear
2           fear
3           fear
4           fear
          ...   
2795    surprise
2796    surprise
2797    surprise
2798    surprise
2799    surprise
Name: Emotions, Length: 2800, dtype: object


In [4]:
import pandas as pd

# Read the CSV file
csv_file_path = "/kaggle/working/features.csv"
data = pd.read_csv(csv_file_path)

# Split the values in the "Features" column based on the delimiter " "
data['Features'] = data['Features'].str.split(" ")

# Expand the list of values into separate columns
data_expanded = data['Features'].apply(pd.Series)

# Rename the columns if needed
# data_expanded = data_expanded.rename(columns=lambda x: f"feature_{x}")

# Save the modified DataFrame to a new CSV file
new_csv_file_path = "Wav2vectessfeatures.csv"
data_expanded.to_csv(new_csv_file_path, index=False)

# Print the first few rows of the modified DataFrame
print(data_expanded.head())


             0  1           2            3            4    5           6   \
0  [-0.00762739     0.00639291  -0.00901586          ...       0.00699424   
1  [-0.00765135     0.00675729  -0.00864979          ...        0.0073433   
2  [-0.00765371     0.00780493  -0.00853262          ...       0.00729563   
3  [-0.00741571     0.00740618  -0.00787141          ...       0.00718567   
4   [-0.0073986                  0.00656308  -0.00796391  ...               

           7             8             9            10           11   12   13  \
0               0.0092293\n                0.00344799]          NaN  NaN  NaN   
1                            0.00930463\n               0.00456725]  NaN  NaN   
2              0.00938383\n                0.00445961]          NaN  NaN  NaN   
3              0.00921761\n                0.00520833]          NaN  NaN  NaN   
4  0.00725434                0.00925199\n                 0.0054665    ]  NaN   

    14   15  
0  NaN  NaN  
1  NaN  NaN  
2  NaN  

In [5]:
data = pd.read_csv('/kaggle/working/Wav2vectessfeatures.csv')
data.head()
#31 mis

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,[-0.00762739,,0.006393,-0.009016,...,,0.00699424,,0.0092293\n,,0.00344799],,,,,
1,[-0.00765135,,0.006757,-0.00865,...,,0.0073433,,,0.009305,,0.00456725],,,,
2,[-0.00765371,,0.007805,-0.008533,...,,0.00729563,,0.00938383\n,,0.00445961],,,,,
3,[-0.00741571,,0.007406,-0.007871,...,,0.00718567,,0.00921761\n,,0.00520833],,,,,
4,[-0.0073986,,,0.006563,-0.00796391,...,,0.00725434,,0.009252,,0.0054665,],,,


SVM C

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, LSTM, Embedding, Bidirectional, Dropout
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

# Load data from CSV
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions','Path'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initializing models
models = {
    'Support Vector Machine': SVC(),
    'Recurrent Neural Network': Sequential([
        Embedding(input_dim=X_train_scaled.shape[1], output_dim=64),
        LSTM(64),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Gated Recurrent Unit': Sequential([
        Embedding(input_dim=X_train_scaled.shape[1], output_dim=64),
        GRU(64),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Long Short-Term Memory': Sequential([
        Embedding(input_dim=X_train_scaled.shape[1], output_dim=64),
        Bidirectional(LSTM(64)),
        Dense(len(label_encoder.classes_), activation='softmax')
    ])
}

# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Support Vector Machine':
        model.fit(X_train_scaled, y_train_encoded)
    else:
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name == 'Support Vector Machine':
        y_pred = model.predict(X_test_scaled)
    else:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


Support Vector Machine: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.9212 s, Prediction Time = 0.4704 s
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 125ms/step
Recurrent Neural Network: Accuracy = 0.3179, Precision = 0.2945, Recall = 0.3179, F1 Score = 0.2771, Training Time = 381.5648 s, Prediction Time = 2.4898 s


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import xgboost as xgb
import lightgbm as lgb
import time

# Load data from CSV
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions','Path'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

#solver{‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}
# Initializing models
models = {
    'Logistic Regression0': LogisticRegression(max_iter=1000, penalty="l2"),
    'Logistic Regression1': LogisticRegression(max_iter=1000, penalty="none"),
    'Logistic Regression2': LogisticRegression(max_iter=1000, solver="liblinear", penalty="l1"),
    'Logistic Regression3': LogisticRegression(max_iter=1000, solver="liblinear", penalty="l2"),
    'Logistic Regression4': LogisticRegression(max_iter=1000, solver="newton-cg", penalty="l2"),
    'Logistic Regression5': LogisticRegression(max_iter=1000, solver="newton-cg", penalty="none"),
    'Logistic Regression6': LogisticRegression(max_iter=1000, solver="newton-cholesky", penalty="l2"),
    'Logistic Regression7': LogisticRegression(max_iter=1000, solver="newton-cholesky", penalty="none"),
    'Logistic Regression8': LogisticRegression(max_iter=1000, solver="sag", penalty="l2"),
    'Logistic Regression9': LogisticRegression(max_iter=1000, solver="sag", penalty="none"),
    'Logistic Regression10': LogisticRegression(max_iter=1000, solver="saga", penalty="l2"),
    'Logistic Regression11': LogisticRegression(max_iter=1000, solver="saga", penalty="elasticnet", l1_ratio=0.5),
    'Logistic Regression12': LogisticRegression(max_iter=1000, solver="saga", penalty="l1"),
    'Logistic Regression13': LogisticRegression(max_iter=1000, solver="saga", penalty="none"),
    'Logistic Regression14': LogisticRegression(max_iter=1000, class_weight="balanced"),
    'Logistic Regression15': LogisticRegression(max_iter=1000, C=10),
    'Logistic Regression16': LogisticRegression(max_iter=1000, C=0.001),
    'Logistic Regression17': LogisticRegression(max_iter=1000, C=0.1),
}



# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_cnn, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=10, batch_size=32, verbose=0)
    else:
        model.fit(X_train, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


Logistic Regression0: Accuracy = 0.9982, Precision = 0.9982, Recall = 0.9982, F1 Score = 0.9982, Training Time = 1.6470 s, Prediction Time = 0.0657 s




Logistic Regression1: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.5571 s, Prediction Time = 0.0304 s
Logistic Regression2: Accuracy = 0.9964, Precision = 0.9965, Recall = 0.9964, F1 Score = 0.9964, Training Time = 1.9662 s, Prediction Time = 0.0186 s
Logistic Regression3: Accuracy = 0.9964, Precision = 0.9965, Recall = 0.9964, F1 Score = 0.9964, Training Time = 4.1868 s, Prediction Time = 0.0191 s
Logistic Regression4: Accuracy = 0.9982, Precision = 0.9982, Recall = 0.9982, F1 Score = 0.9982, Training Time = 0.8492 s, Prediction Time = 0.0319 s




Logistic Regression5: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 1.0405 s, Prediction Time = 0.0327 s
Logistic Regression6: Accuracy = 0.9982, Precision = 0.9982, Recall = 0.9982, F1 Score = 0.9982, Training Time = 9.3366 s, Prediction Time = 0.0304 s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.28353e-21): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.28353e-21): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.28353e-21): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

Logistic Regression7: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 2.2471 s, Prediction Time = 0.0322 s
Logistic Regression8: Accuracy = 0.9982, Precision = 0.9982, Recall = 0.9982, F1 Score = 0.9982, Training Time = 2.9399 s, Prediction Time = 0.0211 s




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import time
import xgboost as xgb
import lightgbm as lgb

# Load data from CSV
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions','Path'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardize features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)


# Initializing models with improved parameters
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C= 0.001),
    'Naive Bayes': GaussianNB(var_smoothing=1e-11),
    'Decision Tree': DecisionTreeClassifier(max_depth= 7, min_samples_split=5),
    'Random Forest': RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'XGBoost': xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'LightGBM': lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8),
   # Modified architectures for neural network models
  'Feedforward Neural Network': Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
   ]),
# Convolutional Neural Network
  'Convolutional Neural Network': Sequential([
    Conv1D(128, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
  ]),
# Deep Belief Network
  'Deep Belief Network': Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
   ]),
    'Voting Classifier': VotingClassifier(estimators=[
        ('Logistic Regression', LogisticRegression(max_iter=1000, C= 0.001)),
        ('Naive Bayes', GaussianNB(var_smoothing=1e-11)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=7, min_samples_split=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5)),
        ('KNN', KNeighborsClassifier(n_neighbors=15)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('XGBoost', xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('LightGBM', lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
    ]),
'MLP Classifier': MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),  # Increased hidden layer sizes
    max_iter=1000,  # Increased maximum number of iterations
    activation='relu',  # ReLU activation function
    solver='adam',  # Adam optimizer
    random_state=42
)
}

# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_scaled, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


2024-05-05 07:37:23.577801: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 07:37:23.577927: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 07:37:23.732092: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression: Accuracy = 0.7768, Precision = 0.7377, Recall = 0.7768, F1 Score = 0.7356, Training Time = 0.1730 s, Prediction Time = 0.0327 s
Naive Bayes: Accuracy = 0.9911, Precision = 0.9913, Recall = 0.9911, F1 Score = 0.9911, Training Time = 0.0976 s, Prediction Time = 0.0449 s
Decision Tree: Accuracy = 0.9214, Precision = 0.9231, Recall = 0.9214, F1 Score = 0.9220, Training Time = 1.7056 s, Prediction Time = 0.0187 s
Random Forest: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 47.6762 s, Prediction Time = 0.1599 s
KNN: Accuracy = 0.9929, Precision = 0.9929, Recall = 0.9929, F1 Score = 0.9929, Training Time = 0.0303 s, Prediction Time = 0.1770 s


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier0.9911
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import time

# Load data from CSV
data = pd.read_csv('/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize LDA
lda = LDA()

# Fit LDA on the training data and transform both training and test data
X_train_lda = lda.fit_transform(X_train_scaled, y_train_encoded)
X_test_lda = lda.transform(X_test_scaled)

# Display the number of components retained after LDA
print(f'Number of components retained after LDA: {lda.explained_variance_ratio_.shape[0]}')

# Reshape data for CNN input after LDA
X_train_lda_cnn = X_train_lda.reshape(X_train_lda.shape[0], X_train_lda.shape[1], 1)
X_test_lda_cnn = X_test_lda.reshape(X_test_lda.shape[0], X_test_lda.shape[1], 1)

# Initializing models
models = {
    'Logistic Regression': LogisticRegression(C=1.0, penalty='l2'),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
    'Feedforward Neural Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_lda.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Convolutional Neural Network': Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_lda_cnn.shape[1], X_train_lda_cnn.shape[2])),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Deep Belief Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_lda.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'SVM': SVC(kernel='linear')
}

# Initialize base models
base_models = [
    ('Logistic Regression', LogisticRegression(C=1.0, penalty='l2')),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))
]

# Initialize ensemble methods
ensemble_methods = {
    'Voting Classifier': VotingClassifier(estimators=base_models),
    'Stacking Classifier': StackingClassifier(estimators=base_models, final_estimator=LogisticRegression()),
    'Bagging Classifier': BaggingClassifier(base_estimator=DecisionTreeClassifier())
}

# Include ensemble methods in the models dictionary
models.update(ensemble_methods)

# Training and evaluating each model with LDA-transformed data
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train_lda, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_lda)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test_lda)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name} with LDA: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


2024-05-05 10:03:45.914711: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 10:03:45.914839: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 10:03:46.074393: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Number of components retained after LDA: 6


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(


Logistic Regression with LDA: Accuracy = 0.8054, Precision = 0.8210, Recall = 0.8054, F1 Score = 0.8053, Training Time = 0.0717 s, Prediction Time = 0.0003 s
Naive Bayes with LDA: Accuracy = 0.8000, Precision = 0.8063, Recall = 0.8000, F1 Score = 0.7943, Training Time = 0.0026 s, Prediction Time = 0.0008 s
Decision Tree with LDA: Accuracy = 0.7536, Precision = 0.7669, Recall = 0.7536, F1 Score = 0.7488, Training Time = 0.0115 s, Prediction Time = 0.0005 s
Random Forest with LDA: Accuracy = 0.7875, Precision = 0.7869, Recall = 0.7875, F1 Score = 0.7821, Training Time = 0.4224 s, Prediction Time = 0.0131 s
KNN with LDA: Accuracy = 0.8161, Precision = 0.8156, Recall = 0.8161, F1 Score = 0.8113, Training Time = 0.0028 s, Prediction Time = 0.0401 s
Gradient Boosting with LDA: Accuracy = 0.7625, Precision = 0.7641, Recall = 0.7625, F1 Score = 0.7579, Training Time = 4.6005 s, Prediction Time = 0.0096 s
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Feedforward Neura

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking Classifier with LDA: Accuracy = 0.8107, Precision = 0.8118, Recall = 0.8107, F1 Score = 0.8071, Training Time = 26.5531 s, Prediction Time = 0.0612 s
Bagging Classifier with LDA: Accuracy = 0.7679, Precision = 0.7749, Recall = 0.7679, F1 Score = 0.7639, Training Time = 0.0957 s, Prediction Time = 0.0026 s




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
import time
import xgboost as xgb
import lightgbm as lgb

# Load data from CSV
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions','Path'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN input
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Initializing models with improved parameters
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C= 0.001),
    'Naive Bayes': GaussianNB(var_smoothing=1e-11),
    'Decision Tree': DecisionTreeClassifier(max_depth= 7, min_samples_split=5),
    'Random Forest': RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'XGBoost': xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth= 10, subsample=0.8),
    'LightGBM': lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8),
    'Feedforward Neural Network': Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Convolutional Neural Network': Sequential([
        Conv1D(128, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Deep Belief Network': Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Voting Classifier': VotingClassifier(estimators=[
        ('Logistic Regression', LogisticRegression(max_iter=1000, C= 0.001)),
        ('Naive Bayes', GaussianNB(var_smoothing=1e-11)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=7, min_samples_split=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split=5)),
        ('KNN', KNeighborsClassifier(n_neighbors=15)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('XGBoost', xgb.XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
        ('LightGBM', lgb.LGBMClassifier(n_estimators=300, learning_rate=0.01, max_depth=10, subsample=0.8)),
    ]),
    'MLP Classifier': MLPClassifier(
        hidden_layer_sizes=(256, 128, 64),
        max_iter=1000,
        activation='relu',
        solver='adam',
        random_state=42
    )
}

# Training and evaluating each model
for name, model in models.items():
    start_time = time.time()
    
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        if name == 'Convolutional Neural Network':
            model.fit(X_train_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
        else:
            model.fit(X_train_scaled, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_scaled)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name}: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


2024-05-31 11:04:06.152071: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 11:04:06.152314: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 11:04:06.305916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression: Accuracy = 0.7768, Precision = 0.7377, Recall = 0.7768, F1 Score = 0.7356, Training Time = 0.2579 s, Prediction Time = 0.0545 s
Naive Bayes: Accuracy = 0.9911, Precision = 0.9913, Recall = 0.9911, F1 Score = 0.9911, Training Time = 0.1388 s, Prediction Time = 0.0538 s
Decision Tree: Accuracy = 0.9196, Precision = 0.9241, Recall = 0.9196, F1 Score = 0.9206, Training Time = 1.7385 s, Prediction Time = 0.0254 s
Random Forest: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 49.9124 s, Prediction Time = 0.2082 s
KNN: Accuracy = 0.9929, Precision = 0.9929, Recall = 0.9929, F1 Score = 0.9929, Training Time = 0.0366 s, Prediction Time = 0.2128 s


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.decomposition import PCA
import time

# Load data from CS
data = pd.read_csv('/kaggle/input/librosaextractedtess/ExtractedFeaturesForTessDatasetLibrosaCleanedNa.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize PCA
pca = PCA(n_components=0.95)  # Retain 95% of the variance

# Fit PCA on the training data and transform both training and test data
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Display the number of components retained after PCA
print(f'Number of components retained after PCA: {pca.n_components_}')

# Reshape data for CNN input after PCA
X_train_pca_cnn = X_train_pca.reshape(X_train_pca.shape[0], X_train_pca.shape[1], 1)
X_test_pca_cnn = X_test_pca.reshape(X_test_pca.shape[0], X_test_pca.shape[1], 1)

# Initializing models
models = {
    'Logistic Regression': LogisticRegression(C=1.0, penalty='l2'),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
    'Feedforward Neural Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_pca.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Convolutional Neural Network': Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_pca_cnn.shape[1], X_train_pca_cnn.shape[2])),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Deep Belief Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_pca.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'SVM': SVC(kernel='linear')
}

# Initialize base models
base_models = [
    ('Logistic Regression', LogisticRegression(C=1.0, penalty='l2')),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))
]

# Initialize ensemble methods
ensemble_methods = {
    'Voting Classifier': VotingClassifier(estimators=base_models),
    'Stacking Classifier': StackingClassifier(estimators=base_models, final_estimator=LogisticRegression()),
    'Bagging Classifier': BaggingClassifier(base_estimator=DecisionTreeClassifier())
}

# Include ensemble methods in the models dictionary
models.update(ensemble_methods)

# Training and evaluating each model with PCA-transformed data
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_pca, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_pca_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_pca, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train_pca, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_pca)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test_pca)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name} with PCA: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


Number of components retained after PCA: 64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression with PCA: Accuracy = 0.4518, Precision = 0.4874, Recall = 0.4518, F1 Score = 0.4498, Training Time = 0.2070 s, Prediction Time = 0.0007 s
Naive Bayes with PCA: Accuracy = 0.4250, Precision = 0.5252, Recall = 0.4250, F1 Score = 0.4351, Training Time = 0.0072 s, Prediction Time = 0.0032 s
Decision Tree with PCA: Accuracy = 0.4946, Precision = 0.5297, Recall = 0.4946, F1 Score = 0.4885, Training Time = 0.1285 s, Prediction Time = 0.0004 s
Random Forest with PCA: Accuracy = 0.5643, Precision = 0.5645, Recall = 0.5643, F1 Score = 0.5396, Training Time = 1.0363 s, Prediction Time = 0.0141 s
KNN with PCA: Accuracy = 0.4518, Precision = 0.4588, Recall = 0.4518, F1 Score = 0.4486, Training Time = 0.0014 s, Prediction Time = 0.1166 s
Gradient Boosting with PCA: Accuracy = 0.6464, Precision = 0.6432, Recall = 0.6464, F1 Score = 0.6402, Training Time = 36.2285 s, Prediction Time = 0.0098 s
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Feedforward Neur

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import time

# Load data from CSV
data = pd.read_csv('/kaggle/input/wav2vectessfeatures/ExtractedFeaturesForTessDatasetWav2Vec.csv')

# Separate features (X) and labels (y)
X = data.drop(columns=['Emotions','Path'])
y = data['Emotions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into train and test sets
X_train, X_test, y_train_encoded, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize LDA
lda = LDA()

# Fit LDA on the training data and transform both training and test data
X_train_lda = lda.fit_transform(X_train_scaled, y_train_encoded)
X_test_lda = lda.transform(X_test_scaled)

# Display the number of components retained after LDA
print(f'Number of components retained after LDA: {lda.explained_variance_ratio_.shape[0]}')

# Reshape data for CNN input after LDA
X_train_lda_cnn = X_train_lda.reshape(X_train_lda.shape[0], X_train_lda.shape[1], 1)
X_test_lda_cnn = X_test_lda.reshape(X_test_lda.shape[0], X_test_lda.shape[1], 1)

# Initializing models
models = {
    'Logistic Regression': LogisticRegression(C=1.0, penalty='l2'),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
    'Feedforward Neural Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_lda.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Convolutional Neural Network': Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_lda_cnn.shape[1], X_train_lda_cnn.shape[2])),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'Deep Belief Network': Sequential([
        Dense(128, activation='relu', input_shape=(X_train_lda.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ]),
    'SVM': SVC(kernel='linear')
}

# Initialize base models
base_models = [
    ('Logistic Regression', LogisticRegression(C=1.0, penalty='l2')),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=5)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1))
]

# Initialize ensemble methods
ensemble_methods = {
    'Voting Classifier': VotingClassifier(estimators=base_models),
    'Stacking Classifier': StackingClassifier(estimators=base_models, final_estimator=LogisticRegression()),
    'Bagging Classifier': BaggingClassifier(base_estimator=DecisionTreeClassifier())
}

# Include ensemble methods in the models dictionary
models.update(ensemble_methods)

# Training and evaluating each model with LDA-transformed data
for name, model in models.items():
    start_time = time.time()
    
    if name == 'Feedforward Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Convolutional Neural Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda_cnn, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    elif name == 'Deep Belief Network':
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train_lda, y_train_encoded, epochs=20, batch_size=64, verbose=0)
    else:
        model.fit(X_train_lda, y_train_encoded)
    
    train_time = time.time() - start_time
    
    start_time = time.time()
    if name in ['Feedforward Neural Network', 'Convolutional Neural Network', 'Deep Belief Network']:
        y_pred_proba = model.predict(X_test_lda)
        y_pred = y_pred_proba.argmax(axis=-1)
    else:
        y_pred = model.predict(X_test_lda)
    
    predict_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'{name} with LDA: Accuracy = {accuracy:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}, Training Time = {train_time:.4f} s, Prediction Time = {predict_time:.4f} s')


Number of components retained after LDA: 6


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(


Logistic Regression with LDA: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.0452 s, Prediction Time = 0.0003 s
Naive Bayes with LDA: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.0023 s, Prediction Time = 0.0008 s
Decision Tree with LDA: Accuracy = 0.8268, Precision = 0.8850, Recall = 0.8268, F1 Score = 0.7913, Training Time = 0.0076 s, Prediction Time = 0.0003 s
Random Forest with LDA: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.4118 s, Prediction Time = 0.0133 s
KNN with LDA: Accuracy = 1.0000, Precision = 1.0000, Recall = 1.0000, F1 Score = 1.0000, Training Time = 0.0024 s, Prediction Time = 0.0380 s
Gradient Boosting with LDA: Accuracy = 0.9714, Precision = 0.9733, Recall = 0.9714, F1 Score = 0.9718, Training Time = 4.1912 s, Prediction Time = 0.0065 s
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Feedforward Neura

