In [4]:
import os 
# Placeholder only: replace the value with the location of your own Google Cloud
# credentials before running the Text-to-Speech cell, and never commit a real key.
# NOTE(review): Google's client libraries normally expect this variable to hold the
# path to a service-account JSON key file, not the key itself - confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "PUT YOUR KEY HERE" # Placeholder for your Google Cloud Text-to-Speech credentials
import ast  # Module to convert string representations of lists into actual lists
import pandas as pd
from pathlib import Path
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from google.cloud import texttospeech
import boto3 #For this one you have to set up your key already in your console

---

## First Step: Processing the Audio Files

The goal of this script is to **automate the process of extracting meaningful audio features** from a collection of `.wav` audio files located within a specified directory. By leveraging the `librosa` library, it accomplishes the following:

- **Extracts a set of audio features** for each file, including:
  - **Mel-frequency cepstral coefficients (MFCCs):** Capture the timbre of the audio.
  - **Zero-crossing rate:** Measures the number of times the signal crosses the horizontal axis.
  - **Pitch statistics:** Includes mean and standard deviation of the audio's pitch.
  - **Spectral centroid:** Indicates where the center of mass of the spectrum is located.
  - **Spectral bandwidth:** Describes the width of the spectral energy distribution.
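  - **Spectral rolloff:** The frequency below which most of the spectral energy lies (85% with librosa's default settings), which summarizes the shape of the spectrum.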
  - **Harmonic and percussive components:** Separate the harmonic and percussive elements of the audio signal.

These features are **critical in various applications** such as audio classification, music information retrieval, and sound analysis.

- **Compiles the extracted features** into a structured format, along with the file and folder names for context, and **saves them into a CSV file**. This streamlined process facilitates easy analysis and modeling by providing a **ready-to-use dataset of audio features**.

---


In [7]:
def extract_features(file_path):
    """Summarise one audio file as a flat dictionary of librosa features.

    The file is loaded with ``sr=None`` and every descriptor is reduced to a
    single number, except the MFCCs, which keep one mean per coefficient
    (``n_mfcc=13``).

    Parameters
    ----------
    file_path : str or path-like
        Audio file passed straight to ``librosa.load``.

    Returns
    -------
    dict or None
        A dict with the keys ``MFCCs`` (list), ``ZCR``, ``PitchMean``,
        ``PitchStd``, ``SpectralCentroid``, ``SpectralBandwidth``,
        ``SpectralRolloff``, ``Harmonic`` and ``Percussive`` (plain numbers),
        in that order. ``None`` if any step raises: the error is printed
        rather than re-raised so that one bad file does not stop a batch.
    """
    def scalar_mean(values):
        # Collapse an array to a single plain Python number.
        return np.mean(values).item()

    try:
        signal, sample_rate = librosa.load(file_path, sr=None)

        mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
        zcr_frames = librosa.feature.zero_crossing_rate(y=signal)
        # piptrack returns two arrays; only the first (pitch) one is used.
        # NOTE(review): the pitch statistics below run over every cell of that
        # array - presumably including cells where no pitch was detected, which
        # would pull the mean towards zero. Confirm before interpreting them.
        pitch_matrix, _ = librosa.piptrack(y=signal, sr=sample_rate)
        centroid_frames = librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
        bandwidth_frames = librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate)
        rolloff_frames = librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)
        # NOTE(review): 'Harmonic' / 'Percussive' are plain means of the two
        # separated signals - presumably close to zero for audio; confirm.
        harmonic_part, percussive_part = librosa.effects.hpss(signal)

        return {
            'MFCCs': np.mean(mfcc_frames, axis=1).tolist(),
            'ZCR': scalar_mean(zcr_frames),
            'PitchMean': scalar_mean(pitch_matrix),
            'PitchStd': np.std(pitch_matrix).item(),
            'SpectralCentroid': scalar_mean(centroid_frames),
            'SpectralBandwidth': scalar_mean(bandwidth_frames),
            'SpectralRolloff': scalar_mean(rolloff_frames),
            'Harmonic': scalar_mean(harmonic_part),
            'Percussive': scalar_mean(percussive_part),
        }
    except Exception as e:
        print(f"Could not process file {file_path}: {e}")
        return None

def process_audio_directory(directory_path, extensions=('.wav',)):
    """Extract features from every matching audio file under a directory tree.

    Parameters
    ----------
    directory_path : str or path-like
        Root directory; all sub-directories are visited via ``os.walk``.
    extensions : str or iterable of str, optional
        File-name suffix(es) to accept, compared case-insensitively (so
        ``clip.WAV`` matches ``'.wav'``). Defaults to ``('.wav',)``.

    Returns
    -------
    list of dict
        One dict per successfully processed file: whatever
        ``extract_features`` (defined elsewhere in this notebook) returned,
        plus ``'Folder'`` (name of the directory holding the file) and
        ``'File'`` (the file name). Files for which ``extract_features``
        returns nothing are reported and skipped.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    audio_features = []
    for root, dirs, files in os.walk(directory_path):
        # os.walk yields entries in filesystem order; sorting in place makes
        # the traversal (and therefore the row order of the result) repeatable.
        dirs.sort()
        folder_name = os.path.basename(root)
        for file in sorted(files):
            if not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            features = extract_features(file_path)
            if not features:
                print(f"Failed to process {file_path}")
                continue
            features['Folder'] = folder_name  # Add folder name
            features['File'] = file  # Add file name
            audio_features.append(features)
    return audio_features

# Walk the recorded-speech folder and collect one feature dict per audio file.
ccdata_path = 'ccdata'
audio_features = process_audio_directory(ccdata_path)

if not audio_features:
    # Nothing usable came back, so there is nothing to write.
    print("No audio features were extracted. Please check the files and their format.")
else:
    # Identifying columns go first; the remaining columns keep the key order
    # of the first extracted record.
    column_order = ['Folder', 'File'] + [
        key for key in audio_features[0] if key not in ('Folder', 'File')
    ]
    df_audio_features = pd.DataFrame(audio_features, columns=column_order)
    output_file = 'enhanced_audio_features.csv'  # relative to the working directory
    df_audio_features.to_csv(output_file, index=False)
    print(f"Feature extraction complete. Results saved to {output_file}")

Processing file: ccdata/An individual_s culture/citizen_audio_0c4e7b6c-6f28-4efa-bc30-85fefb055b2b.wav
Processing file: ccdata/An individual_s culture/citizen_audio_9eb31fe4-a6ab-448e-843a-02424613e681.wav
Processing file: ccdata/An individual_s culture/citizen_audio_0a66b438-9651-4748-ae5e-33db97b7b5c0.wav
Processing file: ccdata/An individual_s culture/citizen_audio_459d5457-2a8e-4812-bbd3-de9cb15b9094.wav
Processing file: ccdata/An individual_s culture/citizen_audio_11ebd1b3-b4b1-4b04-a134-a958feae4cdc.wav
Processing file: ccdata/An individual_s culture/citizen_audio_2bd82592-e0b8-426b-8b3c-50d8589708f6.wav
Processing file: ccdata/An individual_s culture/citizen_audio_12175079-597a-4854-a039-2cf3ad937bc2.wav
Processing file: ccdata/An individual_s culture/citizen_audio_64b6d0cd-237a-481e-888e-6ac308d2b83a.wav
Processing file: ccdata/An individual_s culture/citizen_audio_0413c4f8-2eef-4c15-8a68-88db002575a2.wav
Processing file: ccdata/An individual_s culture/citizen_audio_630e0f8e-4c

  audio, sr = librosa.load(file_path, sr=None)


Processing file: ccdata/To build a house/citizen_audio_710e01c2-2617-414f-8489-c4bf2c88e1f1.wav
Processing file: ccdata/To build a house/citizen_audio_865b63c8-a0a7-4846-b6ca-26a85bb15828.wav
Processing file: ccdata/To build a house/citizen_audio_98aa52a3-e72f-480f-83d6-e010cff49d24.wav
Processing file: ccdata/To build a house/citizen_audio_de324724-de22-4f4f-8121-0d4243e9286b.wav
Processing file: ccdata/To build a house/citizen_audio_29ad9fc5-1191-472b-a331-f9c60001c327.wav
Processing file: ccdata/To build a house/citizen_audio_bf37ed3e-dcad-4fb4-a67b-cc775e680292.wav
Processing file: ccdata/To build a house/citizen_audio_49ef82f0-497a-422c-8533-598d02265ada.wav
Processing file: ccdata/To build a house/citizen_audio_8821c0d8-1df8-4afe-a83a-846d0cf208bc.wav
Processing file: ccdata/To build a house/citizen_audio_20f28680-b251-4076-a45c-7d3305388a3d.wav
Processing file: ccdata/To build a house/citizen_audio_80bdafb6-984b-425e-80ca-f526f10cde57.wav
Processing file: ccdata/To build a house

---

## Second Step: Classification
After successfully completing the initial phase of audio feature extraction and securing the "enhanced_audio_features.csv", a crucial enhancement was introduced to the dataset. I personally augmented the data with five pivotal columns: "Clarity", "Confidence", "Agreeableness", "Pace", and "Volume Dynamics", alongside a binary "Compelling (0 or 1)" indicator. This manual enrichment was driven by a comprehensive auditory analysis, wherein I meticulously listened to each voice recording, discerning and annotating these nuanced traits. This labor-intensive process was not merely for data augmentation but aimed at crafting a robust foundation for our classification endeavor.

With this enriched dataset, saved as "final_classificaiton_features.csv" (the file name is spelled this way on disk, and the loading code below relies on that exact spelling), the journey towards developing a predictive model commenced. The preparatory steps involved ensuring the dataset's integrity, which included verifying the presence and accurate spelling of each column, especially the critical "Compelling" column which serves as our target variable. The non-essential "Folder" and "File" columns, primarily used for tracking and organization, were excluded from the model training dataset to maintain focus on the features directly influencing voice compellingness.

Transforming the "MFCCs" column from a string representation to a list and subsequently expanding it into individual features was a pivotal preprocessing step. This transformation facilitated a detailed feature analysis, allowing each Mel-frequency cepstral coefficient to contribute independently to the model's learning process.

The classification model's development phase encompassed the standard practices of splitting the dataset into training and testing subsets, feature scaling for normalization, and finally, model training and evaluation. A Gradient Boosting Classifier, known for its effectiveness in handling diverse datasets, was chosen as the predictive model. This model was then rigorously trained on the scaled training data.

Evaluation metrics and a confusion matrix provided insights into the model's performance, revealing its strengths and areas for improvement. Additionally, an analysis of feature importance was conducted, offering a window into the attributes most influential in determining the compellingness of a voice. This not only informed the model refinement process but also deepened the understanding of what makes a voice compelling from an auditory perspective.

---

In [8]:
# Read the hand-labelled feature table and list its columns so the presence and
# spelling of the 'Compelling' target can be checked by eye.
df = pd.read_csv('final_classificaiton_features.csv')
print(df.columns)

# 'Folder' and 'File' are treated as bookkeeping columns, not model inputs.
df = df.drop(columns=['Folder', 'File'])

# Each 'MFCCs' cell holds the string form of a list: parse it, then replace the
# column with one MFCC_<i> column per coefficient.
df['MFCCs'] = df['MFCCs'].apply(ast.literal_eval)
mfccs_df = pd.DataFrame(df['MFCCs'].tolist(), index=df.index)
mfccs_df.columns = [f'MFCC_{i}' for i in range(mfccs_df.shape[1])]
df = pd.concat([df.drop(columns=['MFCCs']), mfccs_df], axis=1)

# Target is 'Compelling'; every other remaining column is a predictor.
# NOTE(review): because only 'Compelling' is removed, any hand-labelled trait
# columns present in the file are used as predictors too - confirm intended.
y = df['Compelling']
X = df.drop(columns=['Compelling'])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training rows only, so nothing
# about the held-out rows leaks into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier (seeded for repeatability) and score it on the held-out rows.
model = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Rank the predictors by the importance the fitted model assigns to them.
feature_importances = (
    pd.DataFrame({'importance': model.feature_importances_}, index=X_train.columns)
    .sort_values('importance', ascending=False)
)
print("Feature Importances:\n", feature_importances)

Index(['Folder', 'File', 'MFCCs', 'ZCR', 'PitchMean', 'PitchStd',
       'SpectralCentroid', 'SpectralBandwidth', 'SpectralRolloff', 'Harmonic',
       'Percussive', 'Clarity', 'Confidence', 'Agreeableness', 'Pace',
       'Volume Dynamics', 'Compelling'],
      dtype='object')
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.95      0.82        19
           1       0.86      0.46      0.60        13

    accuracy                           0.75        32
   macro avg       0.79      0.70      0.71        32
weighted avg       0.78      0.75      0.73        32

Confusion Matrix:
 [[18  1]
 [ 7  6]]
Feature Importances:
                      importance
Agreeableness      7.240002e-01
Volume Dynamics    4.050539e-02
MFCC_12            3.072972e-02
MFCC_8             2.424984e-02
Clarity            2.371334e-02
SpectralRolloff    2.268341e-02
MFCC_10            1.907508e-02
MFCC_9             1.628832e-02
SpectralCentroid   1.2

---

## Enhancing the Project: Introducing Text-to-Speech Synthesis

Building upon the insights gained from the classification model, the next step in our exploration involves leveraging Google Cloud's Text-to-Speech (TTS) API to synthesize speech from text. This process not only enriches our dataset but also provides a practical avenue to assess the accuracy and applicability of our classification results in real-world scenarios.

### Objective

The objective here is to dynamically generate audio files using various voice configurations to simulate different speaking styles and accents. This allows us to:

- Evaluate the robustness of our classification model across a diverse range of voices.
- Understand how different voice characteristics (e.g., pitch, speaking rate) might influence audience engagement, as predicted by our model.

### Implementation

1. **Initialize the Text-to-Speech Client:** A crucial first step to access Google Cloud's TTS service, enabling us to convert text into natural-sounding speech.

2. **Define the Text:** We select a piece of text intended for an engaging presentation, serving as the base content for our speech synthesis.

3. **Configure Voices:** A series of voice configurations are defined to cover a broad spectrum of languages, accents, genders, pitches, and speaking rates. These variations aim to mimic real-world speaking conditions as closely as possible.

4. **Synthesize Speech:** For each voice configuration, we:
   - Convert the specified text into speech.
   - Adjust the voice parameters according to our predefined configurations.
   - Generate an MP3 file for each configuration, allowing for a practical and accessible way to review the synthesized speech.

By integrating text-to-speech synthesis into our project, we bridge the gap between theoretical data analysis and tangible, audible results. This step not only enhances our dataset but also lays the groundwork for a deeper understanding of how voice attributes impact listener perception and engagement.

---

In [12]:
# Client used for every synthesis request in this cell.
client = texttospeech.TextToSpeechClient()

# Sentence that each voice variant will read.
text = "Hi, today I'm going to teach you guys how to make an engaging presentation. In order to get people engaged, we need to make eye contact with our audience."

# One tuple per variant: (language code, voice name, SSML gender, pitch, speaking rate).
# The comprehension turns each tuple into the dict the loop below reads from.
voice_configs = [
    {"language_code": language, "name": voice_name, "ssml_gender": gender, "pitch": pitch, "speaking_rate": rate}
    for language, voice_name, gender, pitch, rate in [
        ("en-US", "en-US-Wavenet-A", texttospeech.SsmlVoiceGender.MALE, 0, 1.0),
        ("en-US", "en-US-Wavenet-D", texttospeech.SsmlVoiceGender.MALE, 5, 1.2),
        ("en-GB", "en-GB-Wavenet-A", texttospeech.SsmlVoiceGender.FEMALE, -5, 0.9),
        ("en-AU", "en-AU-Wavenet-C", texttospeech.SsmlVoiceGender.FEMALE, 2, 1.1),
        ("en-IN", "en-IN-Wavenet-A", texttospeech.SsmlVoiceGender.MALE, -2, 0.95),
        ("en-US", "en-US-Wavenet-E", texttospeech.SsmlVoiceGender.FEMALE, 10, 0.8),
        ("en-US", "en-US-Wavenet-F", texttospeech.SsmlVoiceGender.NEUTRAL, -10, 1.3),
        ("en-GB", "en-GB-Wavenet-D", texttospeech.SsmlVoiceGender.MALE, 3, 1.05),
        ("en-AU", "en-AU-Wavenet-A", texttospeech.SsmlVoiceGender.MALE, -3, 1.15),
        ("en-US", "en-US-Wavenet-B", texttospeech.SsmlVoiceGender.FEMALE, 6, 0.85),
    ]
]

# The spoken text is the same for every variant, so the input object is built once.
synthesis_input = texttospeech.SynthesisInput(text=text)

# Request one MP3 per configuration; files are numbered from 1 in list order.
for i, config in enumerate(voice_configs, start=1):
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=config["language_code"],
        name=config["name"],
        ssml_gender=config["ssml_gender"],
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        pitch=config["pitch"],
        speaking_rate=config["speaking_rate"],
    )

    response = client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=audio_config,
    )

    # Written to the working directory; adjust the path as needed.
    filename = f'output_voice_{i}.mp3'
    with open(filename, 'wb') as out:
        out.write(response.audio_content)
        print(f'Audio content written to file "{filename}"')

Audio content written to file "output_voice_1.mp3"
Audio content written to file "output_voice_2.mp3"
Audio content written to file "output_voice_3.mp3"
Audio content written to file "output_voice_4.mp3"
Audio content written to file "output_voice_5.mp3"
Audio content written to file "output_voice_6.mp3"
Audio content written to file "output_voice_7.mp3"
Audio content written to file "output_voice_8.mp3"
Audio content written to file "output_voice_9.mp3"
Audio content written to file "output_voice_10.mp3"


---

## Expanding the Project: Leveraging Amazon Polly for Text-to-Speech

In this phase, we utilize Amazon Polly, a service that turns text into lifelike speech, to further enhance our project. Amazon Polly allows us to explore an extensive range of voices and languages, providing a rich set of options for speech synthesis.

### Objective

The goal is to experiment with different voices and configurations, including the neural engine for more natural-sounding speech. This exploration includes:

- Verifying which voices support the neural text-to-speech (TTS) engine for superior audio quality.
- Demonstrating various speaking styles, rates, and volumes to simulate real-life scenarios and presentations.

### Implementation

1. **Initialize Amazon Polly Client:** Set up the client using AWS SDK for Python (Boto3), enabling access to the Polly service.

2. **Check Neural Engine Support:** Before synthesizing speech, we ensure the selected voice supports the neural engine, promising a more natural and lifelike audio output.

3. **Define Text and Voice Configurations:** A predefined text is ready to be synthesized, and a set of voice configurations is prepared, including one that demonstrates the 'newscaster' speaking style with the voice 'Matthew'.

4. **Synthesize and Save Speech:** For each configuration, we:
   - Create SSML (Speech Synthesis Markup Language) text to include specific speaking styles if required.
   - Call Amazon Polly's `synthesize_speech` method to generate speech.
   - Save the synthesized speech to an MP3 file for each voice configuration.

Through integrating Amazon Polly, we can produce a diverse array of speech samples from text, enhancing our project's ability to model and analyze speech patterns. This step is not just about expanding our dataset but also about understanding the impact of speech synthesis technologies on communication and presentation skills.

---

In [29]:
# Amazon Polly client shared by the helper and the synthesis loop below.
# No credentials are passed here; as noted next to the boto3 import, the AWS
# key must already be configured in your environment before this cell runs.
polly_client = boto3.client('polly')

# Lazily filled cache of the voice IDs Polly offers on the neural engine.
# Re-running this cell resets it to None and forces a fresh lookup.
_neural_voice_ids = None


def _fetch_neural_voice_ids():
    """Return the set of all voice IDs Polly lists for the neural engine.

    ``describe_voices`` may split its answer across pages; each page that
    carries a ``NextToken`` is followed until the listing is complete.
    """
    voice_ids = set()
    request = {'Engine': 'neural'}
    while True:
        page = polly_client.describe_voices(**request)
        voice_ids.update(voice['Id'] for voice in page['Voices'])
        next_token = page.get('NextToken')
        if not next_token:
            return voice_ids
        request['NextToken'] = next_token


def voice_supports_neural_engine(voice_id):
    """Tell whether ``voice_id`` can be synthesised with Polly's neural engine.

    The voice list is fetched from the service on the first call only and
    reused afterwards, so checking several voices in a loop costs one lookup
    instead of one per voice.

    Parameters
    ----------
    voice_id : str
        Polly voice identifier, e.g. ``"Joanna"``.

    Returns
    -------
    bool
        ``True`` if the voice appears in the neural-engine listing.
    """
    global _neural_voice_ids
    if _neural_voice_ids is None:
        _neural_voice_ids = _fetch_neural_voice_ids()
    return voice_id in _neural_voice_ids

# Sentence that each Polly voice will read.
text = "Hi, today I'm going to teach you guys how to make an engaging presentation. To get people engaged, we need to make eye contact with our audience."

# Include a demonstration of the newscaster speaking style for 'Matthew' as an example.
# For an entry with style 'newscaster' the loop below uses the news-domain SSML
# and does not apply that entry's 'speaking_rate' / 'volume'.
# NOTE(review): 'speaking_rate' is passed verbatim as the prosody rate; confirm
# that Polly accepts bare numbers such as '1.2' rather than only percentages
# or keywords.
voice_configs = [
    {"voice_id": "Joanna", "speaking_rate": "1.0", "volume": "medium"},
    {"voice_id": "Matthew", "speaking_rate": "1.2", "volume": "loud", "style": "newscaster"},
    {"voice_id": "Salli", "speaking_rate": "0.9", "volume": "soft"},
    {"voice_id": "Kimberly", "speaking_rate": "1.05", "volume": "medium"},
    {"voice_id": "Joey", "speaking_rate": "1.1", "volume": "loud"}
]

for i, config in enumerate(voice_configs, start=1):
    # Only voices available on the neural engine are synthesised; the file
    # number still follows the position in voice_configs, so a skipped voice
    # leaves a gap in the numbering.
    if not voice_supports_neural_engine(config["voice_id"]):
        print(f"Voice {config['voice_id']} does not support Neural engine. Skipping.")
        continue

    # Adjust SSML to include the newscaster speaking style if specified
    if config.get('style') == 'newscaster':
        ssml_text = f"""
        <speak>
            <amazon:domain name="news">
                {text}
            </amazon:domain>
        </speak>
        """.strip()
    else:
        ssml_text = f"""
        <speak>
            <prosody rate='{config['speaking_rate']}' volume='{config['volume']}'>
                {text}
            </prosody>
            <break time="500ms"/>
            Remember, the key to a great presentation is not just what you say, but how you say it.
        </speak>
        """.strip()

    try:
        response = polly_client.synthesize_speech(
            Engine='neural',
            Text=ssml_text,
            TextType='ssml',
            OutputFormat='mp3',
            VoiceId=config["voice_id"]
        )

        # Read the whole stream before touching the disk, so a failed download
        # does not leave an empty MP3 behind, and always release the stream.
        audio_stream = response['AudioStream']
        try:
            audio_bytes = audio_stream.read()
        finally:
            audio_stream.close()

        filename = f'polly_neural_voice_{i}.mp3'
        with open(filename, 'wb') as file:
            file.write(audio_bytes)
        print(f'Amazon Polly audio content written to file "{filename}"')
    except (boto3.exceptions.Boto3Error, polly_client.exceptions.ClientError) as e:
        # Errors returned by the Polly service (rejected SSML, unsupported
        # voice/engine combination, ...) are raised as ClientError, which is
        # not a Boto3Error subclass; catch both so one bad configuration is
        # reported and the remaining voices are still processed.
        print(f"An error occurred: {e}")

Amazon Polly audio content written to file "polly_neural_voice_1.mp3"
Amazon Polly audio content written to file "polly_neural_voice_2.mp3"
Amazon Polly audio content written to file "polly_neural_voice_3.mp3"
Amazon Polly audio content written to file "polly_neural_voice_4.mp3"
Amazon Polly audio content written to file "polly_neural_voice_5.mp3"


---

## Audio Processing for the Generated Voices
#### Here I did the same processing for the audio files but this time for the generated ones and got them in a dataset

---

In [5]:
def extract_features(file_path):
    """Summarise one audio file as a flat dictionary of librosa features.

    The file is loaded with ``sr=None`` and every descriptor is reduced to a
    single number, except the MFCCs, which keep one mean per coefficient
    (``n_mfcc=13``).

    Parameters
    ----------
    file_path : str or path-like
        Audio file passed straight to ``librosa.load``.

    Returns
    -------
    dict or None
        A dict with the keys ``MFCCs`` (list), ``ZCR``, ``PitchMean``,
        ``PitchStd``, ``SpectralCentroid``, ``SpectralBandwidth``,
        ``SpectralRolloff``, ``Harmonic`` and ``Percussive`` (plain numbers),
        in that order. ``None`` if any step raises: the error is printed
        rather than re-raised so that one bad file does not stop a batch.
    """
    def scalar_mean(values):
        # Collapse an array to a single plain Python number.
        return np.mean(values).item()

    try:
        signal, sample_rate = librosa.load(file_path, sr=None)

        mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
        zcr_frames = librosa.feature.zero_crossing_rate(y=signal)
        # piptrack returns two arrays; only the first (pitch) one is used.
        # NOTE(review): the pitch statistics below run over every cell of that
        # array - presumably including cells where no pitch was detected, which
        # would pull the mean towards zero. Confirm before interpreting them.
        pitch_matrix, _ = librosa.piptrack(y=signal, sr=sample_rate)
        centroid_frames = librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
        bandwidth_frames = librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate)
        rolloff_frames = librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)
        # NOTE(review): 'Harmonic' / 'Percussive' are plain means of the two
        # separated signals - presumably close to zero for audio; confirm.
        harmonic_part, percussive_part = librosa.effects.hpss(signal)

        return {
            'MFCCs': np.mean(mfcc_frames, axis=1).tolist(),
            'ZCR': scalar_mean(zcr_frames),
            'PitchMean': scalar_mean(pitch_matrix),
            'PitchStd': np.std(pitch_matrix).item(),
            'SpectralCentroid': scalar_mean(centroid_frames),
            'SpectralBandwidth': scalar_mean(bandwidth_frames),
            'SpectralRolloff': scalar_mean(rolloff_frames),
            'Harmonic': scalar_mean(harmonic_part),
            'Percussive': scalar_mean(percussive_part),
        }
    except Exception as e:
        print(f"Could not process file {file_path}: {e}")
        return None

def process_audio_directory(directory_path, extensions=('.mp3',)):
    """Extract features from every matching audio file under a directory tree.

    Parameters
    ----------
    directory_path : str or path-like
        Root directory; all sub-directories are visited via ``os.walk``.
    extensions : str or iterable of str, optional
        File-name suffix(es) to accept, compared case-insensitively (so
        ``clip.MP3`` matches ``'.mp3'``). Defaults to ``('.mp3',)``.

    Returns
    -------
    list of dict
        One dict per successfully processed file: whatever
        ``extract_features`` (defined elsewhere in this notebook) returned,
        plus ``'Folder'`` (name of the directory holding the file) and
        ``'File'`` (the file name). Files for which ``extract_features``
        returns nothing are reported and skipped.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    audio_features = []
    for root, dirs, files in os.walk(directory_path):
        # os.walk yields entries in filesystem order; sorting in place makes
        # the traversal (and therefore the row order of the result) repeatable.
        dirs.sort()
        folder_name = os.path.basename(root)
        for file in sorted(files):
            if not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            features = extract_features(file_path)
            if not features:
                print(f"Failed to process {file_path}")
                continue
            features['Folder'] = folder_name  # Add folder name
            features['File'] = file  # Add file name
            audio_features.append(features)
    return audio_features

# Folder holding the synthesised voice files; adjust to where yours are located.
audio_folder_path = 'audio_folder'
audio_features = process_audio_directory(audio_folder_path)

if not audio_features:
    # Nothing usable came back, so there is nothing to write.
    print("No audio features were extracted. Please check the files and their format.")
else:
    # Identifying columns go first; the remaining columns keep the key order
    # of the first extracted record.
    column_order = ['Folder', 'File'] + [
        key for key in audio_features[0] if key not in ('Folder', 'File')
    ]
    df_audio_features = pd.DataFrame(audio_features, columns=column_order)
    output_file = 'enhanced_audio_features_for_generated_voices.csv'
    df_audio_features.to_csv(output_file, index=False)
    print(f"Feature extraction complete. Results saved to {output_file}")

Processing file: audio_folder\output_voice_1.mp3
Processing file: audio_folder\output_voice_10.mp3
Processing file: audio_folder\output_voice_2.mp3
Processing file: audio_folder\output_voice_3.mp3
Processing file: audio_folder\output_voice_4.mp3
Processing file: audio_folder\output_voice_5.mp3
Processing file: audio_folder\output_voice_6.mp3
Processing file: audio_folder\output_voice_7.mp3
Processing file: audio_folder\output_voice_8.mp3
Processing file: audio_folder\output_voice_9.mp3
Processing file: audio_folder\polly_neural_voice_1.mp3
Processing file: audio_folder\polly_neural_voice_2.mp3
Processing file: audio_folder\polly_neural_voice_3.mp3
Processing file: audio_folder\polly_neural_voice_4.mp3
Processing file: audio_folder\polly_neural_voice_5.mp3
Feature extraction complete. Results saved to enhanced_audio_features_for_generated_voices.csv


---

## Audio Processing for the Generated Voices

### Objective

I computed the average of every column in our "final_classificaiton_features.csv", excluding the hand-labeled columns, and compared those averages with each row (one per audio file) of the newly generated voices.

---

In [23]:
def preprocess_mfccs(df):
    """Return a copy of ``df`` with the 'MFCCs' column expanded into MFCC_<i> columns.

    Each 'MFCCs' cell is expected to be either the string form of a list
    (as read back from CSV) or an already-parsed list. Strings are parsed with
    ``ast.literal_eval``; anything else is left as is, so calling the function
    again on its own output is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'MFCCs' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'MFCCs' holding lists and one column per list
        position named 'MFCC_0', 'MFCC_1', ... An empty input comes back as
        an empty copy with no MFCC_<i> columns added.
    """
    expanded = df.copy()  # keep the caller's frame untouched

    # Parse only cells that are still strings so a second run does not fail.
    expanded['MFCCs'] = expanded['MFCCs'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
    if expanded.empty:
        return expanded

    # One column per coefficient. The column count comes from the expanded
    # table itself rather than from the first row alone.
    mfcc_values = pd.DataFrame(expanded['MFCCs'].tolist(), index=expanded.index)
    mfcc_values.columns = ['MFCC_' + str(i) for i in range(mfcc_values.shape[1])]
    for column in mfcc_values.columns:
        expanded[column] = mfcc_values[column]

    return expanded

# Load the datasets
df_classification = pd.read_csv('final_classificaiton_features.csv')
df_generated_voices = pd.read_csv('enhanced_audio_features_for_generated_voices.csv')

# Preprocess 'MFCCs' for both datasets
df_classification = preprocess_mfccs(df_classification)
df_generated_voices = preprocess_mfccs(df_generated_voices)

# Acoustic features present in both datasets (the hand-labelled columns are
# left out). The order - MFCCs first, then the scalar features - is the order
# the distance cell further down expects.
# NOTE(review): `feature_columns` and `avg_compelling_features` were not
# defined anywhere in the notebook; they are reconstructed here from the
# surrounding narrative (averages over the voices labelled Compelling == 1)
# and from the column names used by the distance cell - confirm this matches
# the original analysis.
feature_columns = [f'MFCC_{i}' for i in range(13)] + [
    'ZCR', 'PitchMean', 'PitchStd', 'SpectralCentroid',
    'SpectralBandwidth', 'SpectralRolloff', 'Harmonic', 'Percussive',
]

# Per-feature mean over the recordings labelled as compelling.
avg_compelling_features = df_classification.loc[
    df_classification['Compelling'] == 1, feature_columns
].mean()
assert not avg_compelling_features.isna().any(), \
    "no usable rows with Compelling == 1 to average over"

# Create a DataFrame with just the averages of the compelling voice features,
# repeated to match the number of rows in the generated voices dataset
compelling_averages_df = pd.DataFrame([avg_compelling_features] * len(df_generated_voices),
                                      columns=avg_compelling_features.index).reset_index(drop=True)

# Include 'File' column from df_generated_voices in the reset index step
df_generated_voices_reset = df_generated_voices[['File'] + feature_columns].reset_index(drop=True)

# Concatenate the two DataFrames side by side for direct comparison
# Now including 'File' column as part of the generated voices' features
comparison_df = pd.concat([df_generated_voices_reset.add_suffix('_Generated'),
                           compelling_averages_df.add_suffix('_CompellingAvg')], axis=1)

# Note: The '_Generated' suffix will also be applied to the 'File' column,
# resulting in 'File_Generated' to distinguish it clearly

# Save the final DataFrame to a CSV file for external analysis or review
comparison_df.to_csv('comparison_generated_vs_compelling_with_files.csv', index=False)

---

## Audio Processing for the Generated Voices cont.

### Objective

After building the new dataset, which holds one row per generated voice plus columns containing the averages over
all the compelling (1) voices, I computed the Euclidean distance from each generated voice to those averages to find which voices are closest to the average compelling voice,
and compared the resulting ranking with our survey results.

---

In [36]:
# Select the feature columns for the generated voices and compelling averages.
# Both lists name the same features in the same order, so position k in one
# list corresponds to position k in the other.
features_generated = [f'MFCC_{i}_Generated' for i in range(13)] + ['ZCR_Generated', 'PitchMean_Generated', 'PitchStd_Generated', 'SpectralCentroid_Generated', 'SpectralBandwidth_Generated', 'SpectralRolloff_Generated', 'Harmonic_Generated', 'Percussive_Generated']
features_compelling_avg = [f'MFCC_{i}_CompellingAvg' for i in range(13)] + ['ZCR_CompellingAvg', 'PitchMean_CompellingAvg', 'PitchStd_CompellingAvg', 'SpectralCentroid_CompellingAvg', 'SpectralBandwidth_CompellingAvg', 'SpectralRolloff_CompellingAvg', 'Harmonic_CompellingAvg', 'Percussive_CompellingAvg']

# Normalize the feature values.
# A dedicated scaler is used so the `scaler` fitted for the classifier in the
# earlier cell is not overwritten. Plain arrays are passed on both sides
# because the two column sets carry different names; the pairing is by
# position. The averages are read from comparison_df itself, so this cell
# needs nothing beyond that frame.
distance_scaler = StandardScaler()
generated_features_scaled = distance_scaler.fit_transform(comparison_df[features_generated].to_numpy())
compelling_avg_scaled = distance_scaler.transform(comparison_df[features_compelling_avg].to_numpy())

# Calculate Euclidean distances from each generated voice to the average compelling voice
distances = np.sqrt(((generated_features_scaled - compelling_avg_scaled) ** 2).sum(axis=1))

# Add distances to the comparison DataFrame
comparison_df['Distance_to_Avg_Compelling'] = distances

# Sort the DataFrame by distances (ascending order so that the smallest distances are at the top)
comparison_df_sorted = comparison_df.sort_values('Distance_to_Avg_Compelling')

# Show the ranking: file name and its distance, closest first
print(comparison_df_sorted[['File_Generated', 'Distance_to_Avg_Compelling']])

              File_Generated  Distance_to_Avg_Compelling
12  polly_neural_voice_3.mp3                    7.577815
10  polly_neural_voice_1.mp3                    9.117620
6         output_voice_6.mp3                   10.580027
3         output_voice_3.mp3                   11.309343
4         output_voice_4.mp3                   11.760292
9         output_voice_9.mp3                   11.867753
7         output_voice_7.mp3                   11.932012
13  polly_neural_voice_4.mp3                   12.212530
11  polly_neural_voice_2.mp3                   12.249074
2         output_voice_2.mp3                   12.462378
1        output_voice_10.mp3                   12.598452
0         output_voice_1.mp3                   12.599167
8         output_voice_8.mp3                   13.407799
5         output_voice_5.mp3                   13.692681
14  polly_neural_voice_5.mp3                   14.634078


