# Data preprocessing
In this part of the notebook, we preprocess the data to make it ready for training. We will perform the following steps:
1. Visualize the RTTM data
2. Check the sampling rate of the audio files and resample them to a consistent 16 kHz to ensure uniformity across the dataset
3. Extract MFCC features from the audio data and align them with the RTTM data to create the final dataset
 


In [77]:
# Visualize the RTM data

# All imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import librosa as librosa
import sys


# Parse the dummy RTTM file
def parse_rttm(file_path):
    """
    Read an RTTM file and keep only the turn timing and speaker columns.

    Args:
        file_path (str): Path to a whitespace-delimited RTTM file with the
            ten standard fields and no header row.

    Returns:
        pandas.DataFrame: An independent frame with the columns
        "Turn Onset", "Turn Duration" and "Speaker Name".
    """
    columns = [
        "Type",
        "File ID",
        "Channel ID",
        "Turn Onset",
        "Turn Duration",
        "Orthography Field",
        "Speaker Type",
        "Speaker Name",
        "Confidence Score",
        "Signal Lookahead Time",
    ]
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    df = pd.read_csv(file_path, sep=r"\s+", names=columns)
    # Return a copy so callers can add or overwrite columns without pandas
    # raising SettingWithCopyWarning on a slice of the parsed frame.
    return df[["Turn Onset", "Turn Duration", "Speaker Name"]].copy()


# Casts the timing columns of the RTTM data to float and creates the End Time column
def prepare_data(rttm_data):
    """
    Cast the timing columns to float and add an "End Time" column.

    The frame is modified in place and the same object is returned.

    Args:
        rttm_data (pandas.DataFrame): Frame with "Turn Onset" and
            "Turn Duration" columns.

    Returns:
        pandas.DataFrame: The input frame, now with float timing columns and
        "End Time" equal to onset plus duration.
    """
    for timing_column in ("Turn Onset", "Turn Duration"):
        rttm_data[timing_column] = rttm_data[timing_column].astype(float)

    onset = rttm_data["Turn Onset"]
    rttm_data["End Time"] = onset.add(rttm_data["Turn Duration"])
    return rttm_data


# Plots the timeline of the speakers
def plot_timeline(data):
    """
    Draw every speaker turn as a thick horizontal bar on a shared time axis.

    Args:
        data (pandas.DataFrame): Turns with "Turn Onset", "End Time" and
            "Speaker Name" columns.

    Returns:
        None
    """
    figure, axes = plt.subplots(figsize=(12, 8))

    # One y-axis lane per speaker, in order of first appearance
    names = data["Speaker Name"].unique()
    lane_of = dict(zip(names, range(len(names))))

    for turn in data.to_dict("records"):
        onset = turn["Turn Onset"]
        finish = turn["End Time"]
        lane = lane_of[turn["Speaker Name"]]
        # A flat line from onset to finish on the speaker's lane
        axes.plot([onset, finish], [lane, lane], linewidth=10)

    axes.set_yticks(range(len(names)))
    axes.set_yticklabels(names)
    axes.set_xlabel("Time (s)")
    axes.set_ylabel("Speakers")
    axes.set_title("Speaker Timeline")

    plt.show()


def debugger_test():
    """Parse, prepare and plot a single Dev RTTM file as a quick check."""
    sample_path = "../Dataset/RTMS/Dev/abjxc.rttm"
    plot_timeline(prepare_data(parse_rttm(sample_path)))


def debugger_run_all():
    """Plot the speaker timeline of every .rttm file in the Dev RTTM folder."""
    rttm_dir = "../Dataset/RTMS/Dev/"

    for entry in os.listdir(rttm_dir):
        # Anything in the folder that is not an RTTM file is ignored
        if not entry.endswith(".rttm"):
            continue
        turns = prepare_data(parse_rttm(os.path.join(rttm_dir, entry)))
        plot_timeline(turns)



In [85]:
# Get the sampling rate of the audio files and align the MFCCs with the RTTM data so they can be used for training

# Get the MFCC of the audio file using librosa
def get_mfcc(file_path, sr=22050, hop_length=220):
    """
    Compute the Mel-frequency cepstral coefficients (MFCC) for an audio file.

    Parameters:
    file_path (str): The path to the audio file.
    sr (int, optional): Sample rate the audio is resampled to when loaded.
        Defaults to 22050, which is librosa's own default, so existing
        callers keep loading at the same rate. Pass 16000 to work at 16 kHz.
    hop_length (int, optional): Number of samples between successive MFCC
        frames. Defaults to 220, the default hop length align_mfcc uses to
        turn RTTM times into frame indices; the two must agree or the
        segments are cut at the wrong frames.

    Returns:
    mfcc (ndarray): The computed MFCC coefficients (13 per frame).
    sr (int): The sample rate of the loaded audio.
    """
    y, sr = librosa.load(file_path, sr=sr)
    # Without an explicit hop_length librosa frames the signal every 512
    # samples, which does not match the 220-sample hop assumed downstream.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    return mfcc, sr


# Align the MFCC data with the RTM data -- Speaker segmentation for training
def align_mfcc(mfcc_data, sr, rttm_path, hop_length=220):
    """
    Aligns MFCC data with RTTM data using a specified hop length.

    Each RTTM turn is converted to a [start, end) range of MFCC frames and the
    matching columns of ``mfcc_data`` are returned together with the speaker.

    Args:
        mfcc_data (numpy.ndarray): MFCC data; axis 1 is indexed by frame.
        sr (int): Sampling rate.
        rttm_path (str): Path to the RTTM file.
        hop_length (int, optional): Number of samples between successive frames.
            Must equal the hop length used when ``mfcc_data`` was computed,
            otherwise the frame indices do not line up. Defaults to 220.

    Returns:
        list of dicts: Each dictionary contains 'Speaker Name' and 'MFCC Segment'.
        Turns that start at or beyond the last frame are skipped; turns that
        run past the end are truncated to the available frames.

    Raises:
        KeyError: If required columns are missing in the DataFrame.
    """
    rttm_data = prepare_data(parse_rttm(rttm_path))

    # Check if necessary columns exist
    if "End Time" not in rttm_data.columns or "Turn Onset" not in rttm_data.columns:
        raise KeyError("Necessary columns are missing from RTTM data.")

    # Convert RTTM times (seconds) to frame indices
    start_frames = (rttm_data["Turn Onset"] * sr / hop_length).astype(int)
    end_frames = (rttm_data["End Time"] * sr / hop_length).astype(int)

    num_frames = mfcc_data.shape[1]
    segments = []

    # Iterate the columns directly instead of iterrows(): iterrows() builds one
    # Series per row and upcasts it to a common dtype, so with numeric speaker
    # names the integer frame indices become floats and the slice below fails.
    turns = zip(rttm_data["Speaker Name"], start_frames, end_frames)
    for speaker, start_frame, end_frame in turns:
        if start_frame >= num_frames:
            continue  # Skip segments that start beyond the audio length
        end_frame = min(end_frame, num_frames)

        segments.append(
            {
                "Speaker Name": speaker,
                "MFCC Segment": mfcc_data[:, start_frame:end_frame],
            }
        )

    return segments

# Visualize the MFCC data with speaker labels --> Useless for now
def visualize_MFCC(mfcc_data, speaker_labels, sr, hop_length=512, duration=1.0):
    """
    Plot the first ``duration`` seconds of MFCC frames, one line per frame.

    Each frame's coefficients are drawn as a line that is shifted upwards by
    20 per frame index and labelled with that frame's speaker.

    Args:
        mfcc_data (numpy.ndarray): MFCC data.
        speaker_labels (list): Speaker labels for each MFCC frame.
        sr (int): Sample rate of the audio file.
        hop_length (int, optional): Hop length used in the MFCC calculation. Defaults to 512.
        duration (float, optional): Duration in seconds to display. Defaults to 1.0.

    Returns:
        None
    """
    # How many frames fit into the requested duration
    frame_limit = int((duration * sr) / hop_length)

    shown_frames = mfcc_data[:, :frame_limit].T
    shown_labels = speaker_labels[:frame_limit]

    figure, axes = plt.subplots(figsize=(12, 8))

    vertical_shift = 0
    for coefficients, label in zip(shown_frames, shown_labels):
        axes.plot(coefficients + vertical_shift, label=label)
        vertical_shift += 20  # stack consecutive frames above each other

    axes.set_xlabel("MFCC Coefficients")
    axes.set_ylabel("Frame Index")
    axes.set_title("MFCC Visualization")
    axes.legend()

    plt.show()


# Quick check: run MFCC extraction and alignment on a single Dev recording
path_wave = "../Dataset/Audio/Dev/afjiv.wav"
path_rttm = "../Dataset/RTMS/Dev/afjiv.rttm"

# Extract the MFCCs, load the matching RTTM turns, then cut the MFCCs per turn
mfcc_test, sampling_rate = get_mfcc(path_wave)
test_rttm_data = prepare_data(parse_rttm(path_rttm))
test_alligned = align_mfcc(mfcc_test, sampling_rate, path_rttm)


# Trailing expression so the notebook displays the sample rate (as a 1-tuple)
(sampling_rate,)


## Parse the entire dataset for training and testing
Here we go through all the Dev audio (.wav) files, extract the MFCC features, and align them with the RTTM data to create the final dataset for training and validation. 20% of the data is held out for validation.

In [None]:
# Goes to the 
def get_training_and_validation(dev_audio_path, dev_RTM_path):
