 # Deep Learning - Project

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import librosa.util
import IPython.display as ipd
import soundfile as sf
import torch
from torchaudio.transforms import MFCC
from IPython.display import Audio


 ## Load Data

In [None]:
# Show what the working directory contains so the relative data paths used
# below can be checked by eye.
current_dir = os.getcwd()
directory_entries = os.listdir(current_dir)

print("Contents of the current directory:")
for item in directory_entries:
    print(item)


In [None]:
# Root folder of the dataset; each language is looked up as a sub-folder of it.
data_dir = "data_subsample/"

# Language folder names; the loaders below also use them as class labels.
languages = [
    "dutch",
    "english",
    "chinese",
    "italian",
]


In [None]:
# Report, for every language, which sub-folders exist and how many entries
# its "clips" folder holds.
for language in languages:
    language_path = os.path.join(data_dir, language)
    print("Language path:", language_path)

    # Nothing more to report when the language folder itself is missing
    if not os.path.isdir(language_path):
        print(f"No directory found for {language}")
        continue

    # Keep only the entries of the language folder that are directories
    subdirectories = [
        entry
        for entry in os.listdir(language_path)
        if os.path.isdir(os.path.join(language_path, entry))
    ]
    if subdirectories:
        print("Subdirectories:", subdirectories)
    else:
        print(f"No subdirectories found for {language}")

    # The audio files are expected in a folder literally named "clips"
    clips_path = os.path.join(language_path, "clips")
    if os.path.isdir(clips_path):
        num_files = len(os.listdir(clips_path))
        print(f"Number of files in {language} clips folder:", num_files)
    else:
        print(f"No clips folder found for {language}")

    print(f"\n")



In [None]:
def calculate_audio_length(audio_path):
    """
    Return the duration of an audio file in seconds.

    Args:
    - audio_path (str): Path to the audio file.

    Returns:
    - duration (float): Length of the clip in seconds.
    """
    # sr=None makes librosa keep the file's own sampling rate instead of resampling
    samples, native_sr = librosa.load(audio_path, sr=None)
    return librosa.get_duration(y=samples, sr=native_sr)

def calculate_average_length(language_folder, max_instances=None):
    """
    Compute, print and return the mean clip duration for one language.

    Every sub-directory of ``data_dir/<language_folder>`` is scanned and each
    file in it is measured with ``calculate_audio_length``. Files that cannot
    be processed are reported and skipped.

    Args:
    - language_folder (str): Name of the language folder inside ``data_dir``.
    - max_instances (int, optional): Stop once this many clips have been measured
      successfully, counted across all sub-directories. None (or 0) means no limit.

    Returns:
    - average_length (float or None): Mean duration in seconds, or None if no
      clip could be measured.
    """
    durations = []
    language_path = os.path.join(data_dir, language_folder)

    if os.path.isdir(language_path):
        # Iterate over clips folders
        for clips_folder in os.listdir(language_path):
            clips_path = os.path.join(language_path, clips_folder)

            # Plain files next to the clip folders are ignored
            if not os.path.isdir(clips_path):
                continue

            for audio_file in os.listdir(clips_path):
                # Checked before each file so the limit also holds when the
                # clips are spread over several sub-directories
                if max_instances and len(durations) >= max_instances:
                    break

                audio_path = os.path.join(clips_path, audio_file)
                try:
                    durations.append(calculate_audio_length(audio_path))
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan
                    print(f"Error processing audio file {audio_file}: {e}")

            # Leave the outer loop as well once the limit has been reached
            if max_instances and len(durations) >= max_instances:
                break

    if not durations:
        print(f"No audio files found for language {language_folder}")
        return None

    average_length = sum(durations) / len(durations)
    print(f"Average audio length for language {language_folder}: {average_length:.2f} seconds")
    # Returned so that callers can store the value, not only read the printout
    return average_length


In [None]:
# Dictionary mapping each language name to the value returned by
# calculate_average_length for that language
avg_lengths = {}

for language in languages:
    # Only the first 10 clips are measured (max_instances=10), so this is a
    # quick estimate rather than a statistic over the full dataset
    avg_length=calculate_average_length(language, max_instances=10)
    avg_lengths[language] = avg_length


In [None]:
def get_sampling_rate(language_folder, data_dir, max_instances=None, max_length=None):
    """
    Get the most common sampling rate of the audio files within the clips directory
    of the specified language.

    The directory ``data_dir/<language_folder>/clips`` is walked recursively; only
    files ending in .wav, .mp3 or .flac are inspected. Files whose header cannot be
    read are reported and skipped.

    Args:
    - language_folder (str): Name of the language folder.
    - data_dir (str): Path to the data directory containing language folders.
    - max_instances (int, optional): Maximum number of files to read successfully,
      counted across all walked directories. None (or 0) means no limit.
    - max_length (int, optional): Currently unused; kept so existing calls keep working.

    Returns:
    - sampling_rate (int or None): Most frequent sampling rate among the inspected
      files, or None if the clips directory is missing or holds no readable audio file.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Construct the path to the clips directory for the specified language
    language_path = os.path.join(data_dir, language_folder, "clips")

    if not os.path.isdir(language_path):
        print(f"No clips directory found for language {language_folder}")
        return None

    audio_extensions = ('.wav', '.mp3', '.flac')
    rate_counts = Counter()
    files_read = 0
    limit_reached = False

    for root, _dirs, files in os.walk(language_path):
        for file in files:
            if not file.endswith(audio_extensions):
                continue

            file_path = os.path.join(root, file)
            try:
                # Only the file header is needed to obtain the sampling rate
                with sf.SoundFile(file_path) as f:
                    rate_counts[f.samplerate] += 1
            except Exception as e:
                print(f"Error reading sampling rate from {file}: {e}")
                continue

            files_read += 1
            if max_instances and files_read >= max_instances:
                limit_reached = True
                break

        # A plain `break` above only leaves the per-directory loop; stop the
        # walk too so the limit holds across sub-directories
        if limit_reached:
            break

    # If there are no readable audio files, return None
    if not rate_counts:
        return None

    # Most frequent sampling rate among the inspected files
    return rate_counts.most_common(1)[0][0]


In [None]:
# Print the sampling rate reported by get_sampling_rate for every language.
# NOTE: after the loop, `sampling_rate` keeps the value obtained for the last
# entry of `languages`; later cells pass this variable to
# normalize_and_transform and compute_mel_spectograms.
for language in languages:
    sampling_rate = get_sampling_rate(language, data_dir)
    print(f"Sampling rate for {language} language: {sampling_rate}")


In [None]:
def load_audio_data(language_folder, max_instances=None, max_length=None):
    """
    Load the waveforms of one language together with their labels.

    Every sub-directory of ``data_dir/<language_folder>`` is treated as a clips
    folder. Files are decoded with librosa at their native sampling rate; files
    that fail to load are reported and skipped.

    Args:
    - language_folder (str): Name of the language folder inside ``data_dir``;
      it is also used as the label of every loaded clip.
    - max_instances (int, optional): Maximum number of directory entries to try
      per clips folder. None (or 0) means no limit.
    - max_length (int, optional): If given, every waveform is padded or truncated
      to exactly this many samples.

    Returns:
    - data (numpy.ndarray): Loaded waveforms. A regular 2-D array when all clips
      have the same length (always the case with ``max_length``); otherwise a 1-D
      object array holding one waveform per element.
    - labels (numpy.ndarray): One label per loaded waveform.
    """
    data = []
    labels = []
    found_clips_dir = False

    language_path = os.path.join(data_dir, language_folder)

    # Check if it's a directory
    if os.path.isdir(language_path):
        # Iterate over clips folders
        for clips_folder in os.listdir(language_path):
            clips_path = os.path.join(language_path, clips_folder)

            # Plain files next to the clips folder are skipped quietly; the
            # "not found" message is printed once below, not per file
            if not os.path.isdir(clips_path):
                continue
            found_clips_dir = True

            for idx, audio_file in enumerate(os.listdir(clips_path)):
                if max_instances and idx >= max_instances:
                    break  # Stop reading instances if limit reached

                audio_path = os.path.join(clips_path, audio_file)
                try:
                    # sr=None keeps the file's own sampling rate
                    y, sr = librosa.load(audio_path, sr=None)

                    # Optionally, pad or truncate the waveform array
                    if max_length:
                        y = librosa.util.fix_length(y, size=max_length)

                    data.append(y)
                    labels.append(language_folder)
                except Exception as e:
                    print(f"Error loading audio file {audio_file}: {e}")

        if not found_clips_dir:
            print(f"No clips directory found in {language_folder}")

    if len({clip.shape for clip in data}) > 1:
        # Clips of different lengths cannot be stacked into a rectangular array
        # (recent NumPy versions raise on this), so store them one per element
        waveforms = np.empty(len(data), dtype=object)
        for i, clip in enumerate(data):
            waveforms[i] = clip
    else:
        waveforms = np.array(data)

    return waveforms, np.array(labels)


In [None]:
MAX_INSTANCES = 10  # max number of instances to read
MAX_LENGTH = 3 * 60 * 48000  # 3 minutes of audio, in samples (48000 is the assumed sampling rate, in Hz)

# MAX_LENGTH is a number of samples, so it has to be chosen in relation to the
# sampling rate of the audio.

X_all, y_all = [], []

for language in languages:
    # Each clip is padded or truncated to MAX_LENGTH samples while loading
    X_lang, y_lang = load_audio_data(
        language,
        max_instances=MAX_INSTANCES,
        max_length=MAX_LENGTH,
    )

    # Collect the clips and labels of this language with those already loaded
    X_all.extend(X_lang)
    y_all.extend(y_lang)

# Turn the accumulated lists into numpy arrays
X_all = np.array(X_all)
y_all = np.array(y_all)


In [None]:
# check
# Sanity check: show the first five waveforms and the first five labels
print("First few elements of X_all:", X_all[:5])
print("First few elements of y_all:", y_all[:5])


 # Inspect the Data Directly from Disk


In [None]:


def inspect_audio_data(data_dir, languages, language_dict):
    """
    Inspects the audio data on disk by printing language paths, subdirectories, and the
    number of files in clips folders. Also plots the waveform of a randomly selected
    audio sample for each language and shows an audio player for it.

    Languages whose folder or clips folder is missing (or empty) are reported and skipped
    without plotting.

    Args:
    - data_dir (str): Path to the data directory containing language folders.
    - languages (list): List of language names.
    - language_dict (dict): Dictionary mapping language names to integers.
      Not used by this function.

    Returns:
    None
    """
    for language in languages:
        # Language path
        language_path = os.path.join(data_dir, language)
        print("Language path:", language_path)

        if not os.path.isdir(language_path):
            print(f"No directory found for {language}")
            continue

        # Get all directories inside the language folder
        subdirectories = [
            d for d in os.listdir(language_path)
            if os.path.isdir(os.path.join(language_path, d))
        ]
        if subdirectories:
            print("Subdirectories:", subdirectories)
        else:
            print(f"No subdirectories found for {language}")

        # The directory is listed once and reused for the count and the random
        # pick. An empty list stands in for a missing folder, so the check below
        # never sees an unset or stale count from a previous language.
        clips_path = os.path.join(language_path, "clips")
        if os.path.isdir(clips_path):
            audio_files = os.listdir(clips_path)
            print(f"Number of files in {language} clips folder:", len(audio_files))
        else:
            audio_files = []
            print(f"No clips folder found for {language}")

        print(f"\n")

        # Nothing to plot for this language
        if not audio_files:
            continue

        # Randomly select an audio file
        random_audio_file = np.random.choice(audio_files)
        audio_path = os.path.join(clips_path, random_audio_file)

        # Load audio file using librosa, keeping its own sampling rate
        audio, sr = librosa.load(audio_path, sr=None)

        # Plot waveform against time in seconds
        plt.figure(figsize=(10, 4))
        plt.plot(np.arange(len(audio)) / sr, audio)
        plt.title(f"Waveform of a randomly selected audio sample for {language} language")
        plt.xlabel("Time (seconds)")
        plt.ylabel("Amplitude")
        plt.show()

        # Play audio
        display(Audio(audio, rate=sr))




In [None]:
# Example usage: inspect the files on disk for every language
data_dir = "data_subsample"
languages = ["dutch", "english", "chinese", "italian"]

# Map each language name to its position in `languages`
language_dict = {name: index for index, name in enumerate(languages)}

inspect_audio_data(data_dir, languages, language_dict)


 # Inspect the Loaded Data

In [None]:
def inspect_audio_data(X, y, languages):
    """
    Inspects the loaded audio data.

    Prints a short summary of the dataset, plots the number of samples per label as a
    bar chart, and plots up to three example waveforms for each language. Languages
    without any sample are skipped in the waveform plots.

    Args:
    - X (numpy.ndarray): Array containing audio data; it is indexed with a boolean mask,
      so it must support numpy-style indexing.
    - y (numpy.ndarray): Array containing labels corresponding to the audio data.
    - languages (list): List of language labels.

    Returns:
    None
    """
    max_examples = 3  # waveforms plotted per language

    # Label statistics are computed once and shared by the summary and the plot
    unique_labels, label_counts = np.unique(y, return_counts=True)

    # Print basic information about the dataset
    print("Dataset Information:")
    print("Number of audio samples:", len(X))
    print("Number of unique languages:", len(unique_labels))
    print("Languages:", unique_labels)

    # Plot distribution of classes. A bar chart puts exactly one bar on each
    # label tick, which a numeric histogram over string labels does not.
    plt.figure(figsize=(10, 6))
    plt.bar([str(label) for label in unique_labels], label_counts, edgecolor='black', alpha=0.7)
    plt.title("Distribution of Classes")
    plt.xlabel("Language")
    plt.ylabel("Number of Samples")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

    # Plot example waveforms for each language
    for language in languages:
        samples = X[y == language]
        num_samples = min(len(samples), max_examples)

        # Avoid creating an empty figure when a language has no samples
        if num_samples == 0:
            continue

        plt.figure(figsize=(10, 2))
        for i in range(num_samples):
            plt.subplot(1, num_samples, i + 1)
            plt.plot(samples[i])
            plt.title(f"{language} Sample {i+1}")
            plt.xlabel("Sample Index")
            plt.ylabel("Amplitude")
        plt.tight_layout()
        plt.show()



In [None]:
# Uses the (X, y, languages) version of inspect_audio_data defined in the cell
# above, which replaces the earlier function of the same name.
inspect_audio_data(X_all, y_all, languages)


 ## Normalize Input

In [None]:
def normalize_and_transform(X, sampling_rate):
    """
    Z-score normalize each waveform and convert it to MFCC features.

    Both the MFCC transform and the Mel spectrogram represent the frequency content of
    an audio clip, but they serve different purposes. MFCCs are commonly used as features
    for tasks like speech recognition, while Mel spectrograms are useful for visualizing
    and analyzing the frequency content of audio signals.

    Args:
    - X (iterable): Waveforms, one array of samples per clip.
    - sampling_rate (int): Sampling rate, in Hz, passed to the MFCC transform.

    Returns:
    - normalized_data (numpy.ndarray): MFCC features, one entry per clip in X
      (an empty array when X is empty).
    """
    normalized_data = []
    mfcc_transform = None

    for x in X:
        # The transform depends only on the sampling rate, so it is built once
        # (on first use) instead of once per clip
        if mfcc_transform is None:
            mfcc_transform = MFCC(sample_rate=sampling_rate)

        # float32 matches the default dtype of the transform's internal buffers
        x = np.asarray(x, dtype=np.float32)

        # Z-score normalization. A constant clip (e.g. pure silence) has zero
        # spread; dividing by 1 instead yields zeros rather than NaN.
        mean = np.mean(x)
        std = np.std(x)
        if std == 0:
            std = 1.0
        x_normalized = (x - mean) / std

        # Compute MFCC transform
        x_mfcc = mfcc_transform(torch.as_tensor(x_normalized, dtype=torch.float32))

        # Append the normalized and transformed data
        normalized_data.append(x_mfcc.numpy())

    return np.array(normalized_data)


In [None]:
# Normalize every loaded waveform and compute its MFCC features.
# NOTE(review): `sampling_rate` is the variable left over from the earlier
# per-language sampling-rate loop (value for the last language) — confirm that
# all clips actually share this rate.
normalized_data = normalize_and_transform(X_all, sampling_rate)
print(normalized_data)


 ## Create Spectrograms

In [None]:
def compute_mel_spectograms(X, sampling_rate):
    """
    Computes a dB-scaled Mel spectrogram for every audio clip in X.

    Args:
    - X (iterable): Waveforms, one array of samples per clip.
    - sampling_rate (int): Sampling rate passed to librosa for every clip.

    Returns:
    - spectograms (list): One dB-scaled Mel spectrogram per clip, in input order.
    """
    def to_db_mel(clip):
        # Power Mel spectrogram, then dB relative to the clip's own maximum
        power_spec = librosa.feature.melspectrogram(y=clip, sr=sampling_rate)
        return librosa.power_to_db(power_spec, ref=np.max)

    return [to_db_mel(clip) for clip in X]


def visualize_mel_spectograms(spectograms, sampling_rate):
    """
    Shows one figure per Mel spectrogram in spectograms.

    Args:
    - spectograms (iterable): dB-scaled Mel spectrograms to display.
    - sampling_rate (int): Sampling rate handed to librosa's specshow for the axes.

    Returns:
    None
    """
    figure_size = (10, 4)

    for mel_db in spectograms:
        # Each spectrogram gets its own figure with a dB colour bar
        plt.figure(figsize=figure_size)
        librosa.display.specshow(
            mel_db,
            sr=sampling_rate,
            x_axis='time',
            y_axis='mel',
        )
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel spectrogram')
        plt.show()


In [None]:

# Compute a Mel spectrogram for every loaded clip and show one figure per clip.
# NOTE(review): `sampling_rate` is the value left over from the earlier
# per-language sampling-rate loop — confirm it matches the loaded clips.
specs = compute_mel_spectograms(X_all, sampling_rate)
visualize_mel_spectograms(specs, sampling_rate)


