In [4]:
import null

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Voice Emotion Detection - Model Comparison\n",
    "\n",
    "This notebook compares different machine learning models for emotion classification from audio features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Set plot style\n",
    "plt.style.use('ggplot')\n",
    "sns.set_palette(\"deep\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Explore the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Function to find a file in various possible locations\n",
    "def find_file(filename, search_paths=None):\n",
    "    \"\"\"Locate `filename` by probing a list of candidate directories.\n",
    "\n",
    "    Args:\n",
    "        filename: Name (or relative path) of the file to look for.\n",
    "        search_paths: Optional iterable of directory prefixes to try, in\n",
    "            order. Defaults to the current directory plus a few common\n",
    "            `data/` locations.\n",
    "\n",
    "    Returns:\n",
    "        The first `os.path.join(path, filename)` that is an existing\n",
    "        regular file, or None (after printing every location tried)\n",
    "        when nothing matches.\n",
    "    \"\"\"\n",
    "    if search_paths is None:\n",
    "        # Default search paths\n",
    "        search_paths = [\n",
    "            \"\",  # Current directory\n",
    "            \"data/\",\n",
    "            \"../data/\",\n",
    "            \"scripts/data/\",\n",
    "            os.path.join(os.getcwd(), \"data/\")\n",
    "        ]\n",
    "\n",
    "    # Build the candidates once so the probe loop and the error report\n",
    "    # always agree (and a one-shot iterable is only consumed once).\n",
    "    candidates = [os.path.join(path, filename) for path in search_paths]\n",
    "\n",
    "    # Try each path\n",
    "    for full_path in candidates:\n",
    "        # isfile rather than exists: a directory that happens to share the\n",
    "        # name must not be returned, because pd.read_csv would fail on it.\n",
    "        if os.path.isfile(full_path):\n",
    "            print(f\"✅ Found file at: {os.path.abspath(full_path)}\")\n",
    "            return full_path\n",
    "\n",
    "    # If we get here, file wasn't found\n",
    "    print(f\"❌ Error: Could not find {filename}. Searched in:\")\n",
    "    for full_path in candidates:\n",
    "        print(f\"  - {os.path.abspath(full_path)}\")\n",
    "    return None\n",
    "\n",
    "# Find and load the features.csv file\n",
    "features_path = find_file(\"features.csv\")\n",
    "df = None\n",
    "\n",
    "if features_path is not None:\n",
    "    try:\n",
    "        df = pd.read_csv(features_path)\n",
    "        print(f\"📊 Loaded dataset with {len(df)} samples and {df.shape[1]} columns\")\n",
    "    except Exception as e:\n",
    "        # Report and continue with df = None; later cells check for that.\n",
    "        print(f\"❌ Error loading dataset: {str(e)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Display the first few rows of the dataset\n",
    "if df is not None:\n",
    "    print(f\"Dataset shape: {df.shape}\")\n",
    "\n",
    "# Jupyter only auto-renders the last *top-level* expression of a cell, so\n",
    "# the preview has to live outside the `if` block. When no dataset was\n",
    "# loaded the expression evaluates to None, which renders nothing.\n",
    "df.head() if df is not None else None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Prepare Data for Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Prepare features and target\n",
    "# X and y stay None when the dataset is missing or has no 'emotion'\n",
    "# column; later cells check for that before using them.\n",
    "X = None\n",
    "y = None\n",
    "\n",
    "if df is not None:\n",
    "    try:\n",
    "        # Check if required columns exist\n",
    "        if \"emotion\" not in df.columns:\n",
    "            print(\"❌ Error: The dataset does not contain an 'emotion' column.\")\n",
    "        else:\n",
    "            # Prepare features (X) by dropping non-feature columns.\n",
    "            # errors=\"ignore\" tolerates datasets without a \"file\" column.\n",
    "            X = df.drop(columns=[\"file\", \"emotion\"], errors=\"ignore\")\n",
    "            y = df[\"emotion\"]\n",
    "\n",
    "            # Print dataset info\n",
    "            print(f\"\\n📊 Dataset Information:\")\n",
    "            print(f\"  - Total samples: {len(df)}\")\n",
    "            print(f\"  - Features: {X.shape[1]}\")\n",
    "            # str() every label first: str.join raises TypeError on\n",
    "            # non-string labels (e.g. integer-coded emotions or NaN).\n",
    "            print(f\"  - Emotion classes: {', '.join(map(str, y.unique()))}\")\n",
    "            print(f\"  - Class distribution:\")\n",
    "            for emotion, count in y.value_counts().items():\n",
    "                print(f\"    - {emotion}: {count} samples ({count/len(y)*100:.1f}%)\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error preparing data: {str(e)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Visualize class distribution\n",
    "if y is not None:\n",
    "    try:\n",
    "        plt.figure(figsize=(10, 6))\n",
    "        ax = sns.countplot(x=y)\n",
    "        plt.title('Emotion Class Distribution', fontsize=16)\n",
    "        plt.xlabel('Emotion', fontsize=14)\n",
    "        plt.ylabel('Count', fontsize=14)\n",
    "\n",
    "        # Add count labels on top of bars\n",
    "        for p in ax.patches:\n",
    "            # Bar heights may come back as floats; format without decimals\n",
    "            # so a count is labelled \"12\" rather than \"12.0\".\n",
    "            ax.annotate(f'{p.get_height():.0f}',\n",
    "                        (p.get_x() + p.get_width() / 2., p.get_height()),\n",
    "                        ha='center', va='bottom',\n",
    "                        fontsize=12)\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error plotting class distribution: {str(e)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Split data into training and testing sets\n",
    "# All four stay None when no data is available or the split fails.\n",
    "X_train, X_test, y_train, y_test = None, None, None, None\n",
    "\n",
    "if X is not None and y is not None:\n",
    "    try:\n",
    "        try:\n",
    "            # Stratify on the label so every emotion keeps its share in\n",
    "            # both partitions; with a plain random split a rare class can\n",
    "            # be missing from the test set entirely.\n",
    "            X_train, X_test, y_train, y_test = train_test_split(\n",
    "                X, y, test_size=0.2, random_state=42, stratify=y\n",
    "            )\n",
    "        except ValueError as e:\n",
    "            # Stratification is not always possible (e.g. a class with a\n",
    "            # single sample); fall back to the plain random split.\n",
    "            print(f\"⚠️ Stratified split not possible ({str(e)}); using a random split instead\")\n",
    "            X_train, X_test, y_train, y_test = train_test_split(\n",
    "                X, y, test_size=0.2, random_state=42\n",
    "            )\n",
    "        print(f\"\\n🔪 Data split: {len(X_train)} training samples, {len(X_test)} test samples\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error splitting data: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Define Models to Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Define the models to compare\n",
    "# Written as one conditional expression: the candidate classifiers are only\n",
    "# instantiated when a training split exists, otherwise models is None.\n",
    "models = (\n",
    "    {\n",
    "        \"Random Forest\": RandomForestClassifier(n_estimators=100, random_state=42),\n",
    "        \"SVM (Linear Kernel)\": SVC(kernel='linear', C=1, random_state=42),\n",
    "        \"KNN\": KNeighborsClassifier(n_neighbors=5),\n",
    "        \"Logistic Regression\": LogisticRegression(max_iter=1000, random_state=42)\n",
    "    }\n",
    "    if X_train is not None and y_train is not None\n",
    "    else None\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Train and Evaluate Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Function to plot confusion matrix\n",
    "def plot_confusion_matrix(y_true, y_pred, title, labels=None):\n",
    "    \"\"\"Draw a labelled heatmap of the confusion matrix for one model.\n",
    "\n",
    "    Args:\n",
    "        y_true: Ground-truth class labels.\n",
    "        y_pred: Predicted class labels, aligned with `y_true`.\n",
    "        title: Figure title.\n",
    "        labels: Optional ordered class labels for the rows/columns.\n",
    "            Defaults to the sorted labels found in `y_true` or `y_pred`.\n",
    "\n",
    "    Returns:\n",
    "        None. Any exception raised while plotting is caught and printed.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Use provided labels or every label that appears in either input\n",
    "        if labels is None:\n",
    "            labels = sorted(set(y_true) | set(y_pred))\n",
    "        else:\n",
    "            labels = list(labels)\n",
    "\n",
    "        # Hand the same labels to confusion_matrix so the matrix has one\n",
    "        # row/column per tick label, in the same order. Without this the\n",
    "        # matrix only covers labels present in the data, and the ticks\n",
    "        # end up misaligned whenever a class is absent from the test set.\n",
    "        cm = confusion_matrix(y_true, y_pred, labels=labels)\n",
    "        plt.figure(figsize=(10, 8))\n",
    "\n",
    "        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n",
    "                    xticklabels=labels,\n",
    "                    yticklabels=labels)\n",
    "        plt.title(title, fontsize=16)\n",
    "        plt.ylabel('True Label', fontsize=14)\n",
    "        plt.xlabel('Predicted Label', fontsize=14)\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error plotting confusion matrix: {str(e)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Train and evaluate each model\n",
    "results = []\n",
    "best_model = None\n",
    "best_accuracy = 0\n",
    "best_model_name = \"\"\n",
    "\n",
    "# The loop body only runs when both the candidate models and the training\n",
    "# split exist; otherwise it iterates over an empty tuple.\n",
    "ready_to_train = models is not None and X_train is not None and y_train is not None\n",
    "\n",
    "for name, model in (models.items() if ready_to_train else ()):\n",
    "    print(f\"\\n🔍 Training: {name}\")\n",
    "    try:\n",
    "        # Fit on the training split, score on the held-out split\n",
    "        model.fit(X_train, y_train)\n",
    "        y_pred = model.predict(X_test)\n",
    "        acc = accuracy_score(y_test, y_pred)\n",
    "\n",
    "        print(f\"✅ Accuracy: {acc * 100:.2f}%\")\n",
    "        print(\"📋 Classification Report:\")\n",
    "        report = classification_report(y_test, y_pred)\n",
    "        print(report)\n",
    "\n",
    "        # Plot confusion matrix\n",
    "        plot_confusion_matrix(y_test, y_pred, f\"Confusion Matrix - {name}\", sorted(y.unique()))\n",
    "\n",
    "        results.append(dict(model=name, accuracy=acc, report=report))\n",
    "\n",
    "        # Keep track of the strongest model seen so far (strictly better\n",
    "        # only, so the first model wins a tie)\n",
    "        if acc > best_accuracy:\n",
    "            best_accuracy, best_model, best_model_name = acc, model, name\n",
    "    except Exception as e:\n",
    "        # One failing model must not stop the remaining comparisons\n",
    "        print(f\"❌ Error training {name}: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Compare Model Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Visualize model comparison\n",
    "if results:\n",
    "    try:\n",
    "        # Rank the models from most to least accurate\n",
    "        ranked = sorted(results, key=lambda entry: entry[\"accuracy\"], reverse=True)\n",
    "\n",
    "        # Create bar chart\n",
    "        fig, ax = plt.subplots(figsize=(12, 6))\n",
    "        model_names = [entry[\"model\"] for entry in ranked]\n",
    "        accuracies = [entry[\"accuracy\"] * 100 for entry in ranked]\n",
    "\n",
    "        bars = ax.bar(model_names, accuracies, color=sns.color_palette(\"deep\", len(model_names)))\n",
    "\n",
    "        # Add accuracy values on top of bars\n",
    "        for bar in bars:\n",
    "            height = bar.get_height()\n",
    "            ax.text(bar.get_x() + bar.get_width()/2., height + 0.5,\n",
    "                    f'{height:.2f}%', ha='center', va='bottom', fontsize=12)\n",
    "\n",
    "        ax.set_title('Model Accuracy Comparison', fontsize=16)\n",
    "        ax.set_xlabel('Model', fontsize=14)\n",
    "        ax.set_ylabel('Accuracy (%)', fontsize=14)\n",
    "        ax.set_ylim(0, 100)\n",
    "        ax.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "\n",
    "        # Print summary\n",
    "        print(\"\\n📊 Model Comparison Summary:\")\n",
    "        for entry in ranked:\n",
    "            print(f\"  - {entry['model']}: {entry['accuracy'] * 100:.2f}%\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error visualizing model comparison: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Save the Best Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Function to ensure a directory exists\n",
    "def ensure_dir_exists(directory):\n",
    "    \"\"\"Make sure `directory` exists, creating it (and parents) if needed.\n",
    "\n",
    "    Args:\n",
    "        directory: Path of the directory to guarantee.\n",
    "\n",
    "    Returns:\n",
    "        The same `directory` value that was passed in.\n",
    "\n",
    "    Raises:\n",
    "        OSError: If the path cannot be created, e.g. FileExistsError when\n",
    "            a regular file already occupies it.\n",
    "    \"\"\"\n",
    "    already_present = os.path.isdir(directory)\n",
    "    # exist_ok=True closes the check-then-create race: no error if the\n",
    "    # directory appears between the check above and this call.\n",
    "    os.makedirs(directory, exist_ok=True)\n",
    "    if not already_present:\n",
    "        print(f\"✅ Created directory: {os.path.abspath(directory)}\")\n",
    "    return directory\n",
    "\n",
    "# Save the best performing model\n",
    "if best_model is not None:\n",
    "    try:\n",
    "        # Ensure models directory exists\n",
    "        models_dir = ensure_dir_exists(\"models\")\n",
    "\n",
    "        # Save the model\n",
    "        best_model_path = os.path.join(models_dir, \"best_model.pkl\")\n",
    "        joblib.dump(best_model, best_model_path)\n",
    "        print(f\"\\n💾 Best model ({best_model_name}) saved to {os.path.abspath(best_model_path)}\")\n",
    "        print(f\"   Accuracy: {best_accuracy * 100:.2f}%\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error saving best model: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Feature Importance Analysis (for Random Forest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# If the best model is Random Forest, visualize feature importance\n",
    "if best_model is not None and X is not None and isinstance(best_model, RandomForestClassifier):\n",
    "    try:\n",
    "        # Get feature importances and the column order from most to least important\n",
    "        importances = best_model.feature_importances_\n",
    "        indices = np.argsort(importances)[::-1]\n",
    "\n",
    "        # Plot the top 15 features (or fewer if there aren't 15)\n",
    "        num_features = min(15, X.shape[1])\n",
    "        top_idx = indices[:num_features]\n",
    "        bar_positions = range(num_features)\n",
    "        plt.figure(figsize=(12, 8))\n",
    "        plt.title('Feature Importance (Random Forest)', fontsize=16)\n",
    "        plt.bar(bar_positions, importances[top_idx], align='center')\n",
    "        plt.xticks(bar_positions, [X.columns[i] for i in top_idx], rotation=90)\n",
    "        plt.xlabel('Features', fontsize=14)\n",
    "        plt.ylabel('Importance', fontsize=14)\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "\n",
    "        # Print top 10 features (or fewer if there aren't 10)\n",
    "        num_to_print = min(10, X.shape[1])\n",
    "        print(\"\\n🔝 Top 10 Most Important Features:\")\n",
    "        for rank, idx in enumerate(indices[:num_to_print], start=1):\n",
    "            print(f\"{rank}. {X.columns[idx]}: {importances[idx]:.4f}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error analyzing feature importance: {str(e)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

ImportError: cannot import name 'MutableMapping' from 'collections' (C:\Users\jorge\AppData\Local\Programs\Python\Python313\Lib\collections\__init__.py)