<a href="https://colab.research.google.com/github/Ravikiran-Bhonagiri/Feature-Engineering/blob/main/Audio_Modeling_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install mlflow



In [2]:
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Display names of the classifiers to benchmark.
# Each name is passed to get_classifier() to obtain the estimator.
algorithm_names = [
    'Random Forest',
    'SVM',
    'KNN',
    'Logistic Regression',
    'XGBoost',
    'Decision Tree',
    'Naive Bayes',
    'Gradient Boosting',
    'AdaBoost',
    'LightGBM',
    'Extra Trees',
    'Bagging',
    'LDA',
    'QDA',
    'Ridge Classifier',
]

# Function to get the classifier based on the name
def get_classifier(algorithm_name):
    """Return a new, unfitted classifier for the given display name.

    Only the requested estimator is constructed: the registry maps each name
    to a zero-argument factory instead of to an already-built instance, so a
    lookup no longer instantiates every model in the table.

    Parameters
    ----------
    algorithm_name : str
        One of the display names registered below (e.g. ``'Random Forest'``).

    Returns
    -------
    estimator or None
        A fresh estimator configured with the hyperparameters listed below,
        or ``None`` when ``algorithm_name`` is not registered.
    """
    factories = {
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': lambda: SVC(kernel='linear', probability=True, random_state=42),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
        'Logistic Regression': lambda: LogisticRegression(solver='lbfgs', max_iter=500, random_state=42),
        'XGBoost': lambda: XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', random_state=42),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Naive Bayes': lambda: GaussianNB(),
        'Gradient Boosting': lambda: GradientBoostingClassifier(random_state=42),
        'AdaBoost': lambda: AdaBoostClassifier(random_state=42),
        'LightGBM': lambda: LGBMClassifier(random_state=42),
        'Extra Trees': lambda: ExtraTreesClassifier(random_state=42),
        'Bagging': lambda: BaggingClassifier(random_state=42),
        'LDA': lambda: LinearDiscriminantAnalysis(),
        'QDA': lambda: QuadraticDiscriminantAnalysis(),
        'Ridge Classifier': lambda: RidgeClassifier(random_state=42),
    }
    factory = factories.get(algorithm_name)
    # Unknown names keep returning None, exactly as the dict lookup did before.
    return factory() if factory is not None else None


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
# Load the audio feature table.
df = pd.read_csv('audio_features.csv')

# Replace missing mean_pitch values with the mean pitch of the rows that
# share the same audio_label.
# NOTE(review): these group means are computed on the full table, before the
# train/test split performed later, and they depend on the label itself -
# confirm that this is acceptable for the evaluation.
pitch_by_label = df.groupby('audio_label')['mean_pitch']
df['mean_pitch'] = pitch_by_label.transform(lambda pitch: pitch.fillna(pitch.mean()))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features: every column except 'audio_file' and 'audio_label'
X = df.drop(['audio_file', 'audio_label'], axis=1)
# Labels: the raw 'audio_label' column
y = df['audio_label']

# Encode the labels as integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 30% of the rows for testing; the split is stratified on the
# encoded label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.3,
    random_state=42,
)

In [5]:
import joblib
import os

# Directory to save models
model_dir = '/content/saved_models'
os.makedirs(model_dir, exist_ok=True)

# One dict of evaluation metrics per algorithm (turned into a DataFrame later)
results = []

# Iterate over each algorithm
for algo_name in algorithm_names:
    clf = get_classifier(algo_name)
    if clf is None:
        # get_classifier returns None for names it does not know; report and
        # skip them instead of crashing on clf.fit with an AttributeError.
        print(f"Skipping '{algo_name}': no classifier is registered under this name.")
        continue

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Confusion Matrix and Metrics
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same score as the default for classes that
    # are never predicted, but without emitting an UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # cm[i, j] counts samples of true class i predicted as class j, so reading
    # only cm[0:2, 0:2] is meaningless for more than two classes. Derive
    # one-vs-rest counts for every class and report their sums instead.
    tp_per_class = cm.diagonal()
    fp_per_class = cm.sum(axis=0) - tp_per_class  # predicted as the class, but wrong
    fn_per_class = cm.sum(axis=1) - tp_per_class  # belongs to the class, but missed
    tn_per_class = cm.sum() - tp_per_class - fp_per_class - fn_per_class

    # Save model
    model_path = os.path.join(model_dir, f'{algo_name.replace(" ", "_").lower()}.joblib')
    joblib.dump(clf, model_path)

    # Get model size in KB
    model_size_kb = os.path.getsize(model_path) / 1024

    # Store results (TP/TN/FP/FN are one-vs-rest counts summed over all classes)
    results.append({
        'Algorithm': algo_name,
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'True Positive (TP)': int(tp_per_class.sum()),
        'True Negative (TN)': int(tn_per_class.sum()),
        'False Positive (FP)': int(fp_per_class.sum()),
        'False Negative (FN)': int(fn_per_class.sum()),
        'Model Size (KB)': model_size_kb
    })


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 543
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 34
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Turn the per-algorithm records into a table and show the headline metrics
results_df = pd.DataFrame(results)
summary_columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Model Size (KB)']
results_df[summary_columns]

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Model Size (KB)
0,Random Forest,1.0,1.0,1.0,1.0,170.993164
1,SVM,0.9,0.94,0.9,0.8875,8.393555
2,KNN,0.95,0.9625,0.95,0.948571,14.06543
3,Logistic Regression,0.9,0.94,0.9,0.8875,3.459961
4,XGBoost,1.0,1.0,1.0,1.0,419.129883
5,Decision Tree,1.0,1.0,1.0,1.0,3.461914
6,Naive Bayes,0.95,0.9625,0.95,0.948571,4.991211
7,Gradient Boosting,1.0,1.0,1.0,1.0,588.106445
8,AdaBoost,0.35,0.228125,0.35,0.247368,39.878906
9,LightGBM,1.0,1.0,1.0,1.0,212.660156


In [7]:
import joblib
import os

# Directory to save models
model_dir = '/content/saved_models'
os.makedirs(model_dir, exist_ok=True)

# One dict of evaluation metrics per algorithm
results = []

# Iterate over each algorithm
for algo_name in algorithm_names:
    clf = get_classifier(algo_name)
    if clf is None:
        # get_classifier returns None for names it does not know; report and
        # skip them instead of crashing on clf.fit with an AttributeError.
        print(f"Skipping '{algo_name}': no classifier is registered under this name.")
        continue

    clf.fit(X_train, y_train)

    # Save model
    model_path = os.path.join(model_dir, f'{algo_name.replace(" ", "_").lower()}.joblib')
    joblib.dump(clf, model_path)

    # Load saved model; every metric below is computed from the reloaded
    # estimator, so the numbers describe the artifact that was written to disk.
    loaded_clf = joblib.load(model_path)
    y_pred_loaded = loaded_clf.predict(X_test)

    # Confusion Matrix and Metrics for loaded model
    cm = confusion_matrix(y_test, y_pred_loaded)
    acc = accuracy_score(y_test, y_pred_loaded)
    # zero_division=0 yields the same score as the default for classes that
    # are never predicted, but without emitting an UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred_loaded, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred_loaded, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred_loaded, average='weighted', zero_division=0)

    # cm[i, j] counts samples of true class i predicted as class j, so reading
    # only cm[0:2, 0:2] is meaningless for more than two classes. Derive
    # one-vs-rest counts for every class and report their sums instead.
    true_pos = cm.diagonal()
    false_pos = cm.sum(axis=0) - true_pos  # predicted as the class, but wrong
    false_neg = cm.sum(axis=1) - true_pos  # belongs to the class, but missed
    true_neg = cm.sum() - true_pos - false_pos - false_neg

    # Get model size in KB
    model_size_kb = os.path.getsize(model_path) / 1024

    # Store results (TP/TN/FP/FN are one-vs-rest counts summed over all classes)
    results.append({
        'Algorithm': algo_name,
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'True Positive (TP)': int(true_pos.sum()),
        'True Negative (TN)': int(true_neg.sum()),
        'False Positive (FP)': int(false_pos.sum()),
        'False Negative (FN)': int(false_neg.sum()),
        'Model Size (KB)': model_size_kb
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df[['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Model Size (KB)']]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 543
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 34
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Model Size (KB)
0,Random Forest,1.0,1.0,1.0,1.0,170.993164
1,SVM,0.9,0.94,0.9,0.8875,8.393555
2,KNN,0.95,0.9625,0.95,0.948571,14.06543
3,Logistic Regression,0.9,0.94,0.9,0.8875,3.459961
4,XGBoost,1.0,1.0,1.0,1.0,419.129883
5,Decision Tree,1.0,1.0,1.0,1.0,3.461914
6,Naive Bayes,0.95,0.9625,0.95,0.948571,4.991211
7,Gradient Boosting,1.0,1.0,1.0,1.0,588.106445
8,AdaBoost,0.35,0.228125,0.35,0.247368,39.878906
9,LightGBM,1.0,1.0,1.0,1.0,212.660156


## Tracking Experiments

In [8]:
import mlflow
import mlflow.sklearn
import os
import shutil
import joblib


# Start from an empty MLflow tracking directory every time this cell runs
tracking_dir = '/content/mlruns-new'
if os.path.exists(tracking_dir):
    shutil.rmtree(tracking_dir)
os.makedirs(tracking_dir, exist_ok=True)

# Point MLflow at that directory
mlflow.set_tracking_uri(tracking_dir)

# Select the experiment under which the runs below are recorded
mlflow.set_experiment("Multi-Class Classification Experiment")

# Directory to save models
os.makedirs('/content/saved_models', exist_ok=True)

# List of algorithms to evaluate.
# Each name is resolved through get_classifier(); the loop below skips any
# name for which it returns None.
# NOTE(review): 'CatBoost' has no entry in get_classifier and no CatBoost
# import is visible in this notebook, so it is currently never evaluated.
algorithm_names = [
    'Random Forest',
    'SVM',
    'KNN',
    'Logistic Regression',
    'XGBoost',
    'Decision Tree',
    'Naive Bayes',
    'Gradient Boosting',
    'AdaBoost',
    'LightGBM',
    'CatBoost',
    'Extra Trees',
    'Bagging',
    'LDA',
    'QDA',
    'Ridge Classifier',
    'Perceptron',
]

# Function to get the classifier based on the name
def get_classifier(algorithm_name):
    """Return a new, unfitted classifier for the given display name.

    The registry maps each name to a zero-argument factory, so only the
    requested estimator is constructed on each call.

    'Perceptron' is registered here because it appears in ``algorithm_names``
    and the class is imported at the top of the notebook; without an entry it
    would be silently left out of the experiment.

    Parameters
    ----------
    algorithm_name : str
        One of the display names registered below (e.g. ``'Random Forest'``).

    Returns
    -------
    estimator or None
        A fresh estimator configured with the hyperparameters listed below,
        or ``None`` when ``algorithm_name`` is not registered (for example
        'CatBoost', which has no entry).
    """
    factories = {
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': lambda: SVC(kernel='linear', probability=True, random_state=42),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
        'Logistic Regression': lambda: LogisticRegression(solver='lbfgs', max_iter=500, random_state=42),
        'XGBoost': lambda: XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', random_state=42),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Naive Bayes': lambda: GaussianNB(),
        'Gradient Boosting': lambda: GradientBoostingClassifier(random_state=42),
        'AdaBoost': lambda: AdaBoostClassifier(random_state=42),
        'LightGBM': lambda: LGBMClassifier(random_state=42),
        'Extra Trees': lambda: ExtraTreesClassifier(random_state=42),
        'Bagging': lambda: BaggingClassifier(random_state=42),
        'LDA': lambda: LinearDiscriminantAnalysis(),
        'QDA': lambda: QuadraticDiscriminantAnalysis(),
        'Ridge Classifier': lambda: RidgeClassifier(random_state=42),
        'Perceptron': lambda: Perceptron(random_state=42),
    }
    factory = factories.get(algorithm_name)
    # Unknown names keep returning None so callers can decide to skip them.
    return factory() if factory is not None else None

# One dict of evaluation metrics per evaluated algorithm
results = []

# Iterate over each algorithm
for algo_name in algorithm_names:
    clf = get_classifier(algo_name)

    if clf is None:
        # Make the omission visible instead of silently dropping the name.
        print(f"Skipping '{algo_name}': get_classifier has no entry for it.")
        continue

    # One MLflow run per algorithm; the run is closed when the block exits.
    with mlflow.start_run(run_name=algo_name):
        # Fit model
        clf.fit(X_train, y_train)

        # Log the algorithm name as the run's parameter
        mlflow.log_param("algorithm", algo_name)

        # Save model and attach the file to the run
        model_path = f'/content/saved_models/{algo_name.replace(" ", "_").lower()}.joblib'
        joblib.dump(clf, model_path)
        mlflow.log_artifact(model_path, artifact_path="models")

        # Load saved model; every metric below is computed from the reloaded
        # estimator, so the logged numbers describe the stored artifact.
        loaded_clf = joblib.load(model_path)
        y_pred_loaded = loaded_clf.predict(X_test)

        # Confusion Matrix and Metrics for loaded model
        cm = confusion_matrix(y_test, y_pred_loaded)
        acc = accuracy_score(y_test, y_pred_loaded)
        # zero_division=0 yields the same score as the default for classes
        # that are never predicted, without an UndefinedMetricWarning.
        precision = precision_score(y_test, y_pred_loaded, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred_loaded, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred_loaded, average='weighted', zero_division=0)

        # cm[i, j] counts samples of true class i predicted as class j, so
        # reading only cm[0:2, 0:2] is meaningless for more than two classes.
        # Derive one-vs-rest counts per class and report their sums instead.
        tp_by_class = cm.diagonal()
        fp_by_class = cm.sum(axis=0) - tp_by_class  # predicted as the class, but wrong
        fn_by_class = cm.sum(axis=1) - tp_by_class  # belongs to the class, but missed
        tn_by_class = cm.sum() - tp_by_class - fp_by_class - fn_by_class

        # Get model size in KB
        model_size_kb = os.path.getsize(model_path) / 1024

        # Log metrics
        metrics_dict = {
            "accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "model_size": model_size_kb,
        }

        # Logging the metrics as a dictionary
        mlflow.log_metrics(metrics_dict)

        # Store results (TP/TN/FP/FN are one-vs-rest counts summed over all classes)
        results.append({
            'Algorithm': algo_name,
            'Accuracy': acc,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'True Positive (TP)': int(tp_by_class.sum()),
            'True Negative (TN)': int(tn_by_class.sum()),
            'False Positive (FP)': int(fp_by_class.sum()),
            'False Negative (FN)': int(fn_by_class.sum()),
            'Model Size (KB)': model_size_kb
        })

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df[['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Model Size (KB)']]

2025/01/15 22:19:40 INFO mlflow.tracking.fluent: Experiment with name 'Multi-Class Classification Experiment' does not exist. Creating a new experiment.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 543
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 34
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.727221
[LightGBM] [Info] Start training from score -1.860752
[LightGBM] [Info] Start training from score -1.727221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1 Score,Model Size (KB)
0,Random Forest,1.0,1.0,1.0,1.0,170.993164
1,SVM,0.9,0.94,0.9,0.8875,8.393555
2,KNN,0.95,0.9625,0.95,0.948571,14.06543
3,Logistic Regression,0.9,0.94,0.9,0.8875,3.459961
4,XGBoost,1.0,1.0,1.0,1.0,419.129883
5,Decision Tree,1.0,1.0,1.0,1.0,3.461914
6,Naive Bayes,0.95,0.9625,0.95,0.948571,4.991211
7,Gradient Boosting,1.0,1.0,1.0,1.0,588.106445
8,AdaBoost,0.35,0.228125,0.35,0.247368,39.878906
9,LightGBM,1.0,1.0,1.0,1.0,212.660156


In [9]:
!zip -r /content/mlflow_experiment_tracking.zip /content/mlruns-new

updating: content/mlruns-new/ (stored 0%)
updating: content/mlruns-new/.trash/ (stored 0%)
  adding: content/mlruns-new/898608651850578358/ (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/ (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/params/ (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/params/algorithm (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/meta.yaml (deflated 45%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/tags/ (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/tags/mlflow.source.name (deflated 5%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/tags/mlflow.runName (stored 0%)
  adding: content/mlruns-new/898608651850578358/8e064c4a8ee647fc8da61f1dfc72f0b5/tags/mlflow.user (stored 

In [10]:
%rm -rf /content/mlruns-new