# Data preprocessing
In this part of the notebook, we will preprocess the data to make it ready for training. We will do the following steps:
1. Visualize the RTTM data
2. Check the sampling rate of the data and resample everything to a consistent 16 kHz to ensure uniformity across the dataset
3. Extract MFCC features from the audio data and align them with the RTTM data to create the final dataset
 


In [4]:
# Visualize the RTM data

# All imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import librosa

# Parse the dummy RTTM file
def parse_rttm(file_path):
    """Read a whitespace-delimited RTTM file and return its turn columns.

    The file is parsed as ten header-less, whitespace-separated fields which
    are given the names listed in ``columns`` below.

    Args:
        file_path: Path (or any object accepted by ``pandas.read_csv``) of
            the RTTM file to read.

    Returns:
        A new ``pandas.DataFrame`` holding only the "Turn Onset",
        "Turn Duration" and "Speaker Name" columns, one row per line of the
        file.
    """
    columns = [
        "Type",
        "File ID",
        "Channel ID",
        "Turn Onset",
        "Turn Duration",
        "Orthography Field",
        "Speaker Type",
        "Speaker Name",
        "Confidence Score",
        "Signal Lookahead Time",
    ]
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True flag, which newer pandas releases warn about.
    df = pd.read_csv(file_path, sep=r"\s+", names=columns)
    # Hand back an independent copy so that callers adding or overwriting
    # columns do not operate on a selection of the full parsed frame.
    return df[["Turn Onset", "Turn Duration", "Speaker Name"]].copy()


# Prepare the parsed data
def prepare_data(rttm_data):
    """Return the turn table with float timings and an added "End Time".

    The input frame is left untouched; a new frame is built instead, so the
    function can safely be called on a selection of another DataFrame.

    Args:
        rttm_data: DataFrame with at least "Turn Onset" and "Turn Duration"
            columns whose values can be converted to ``float``.

    Returns:
        A new DataFrame with the same columns as ``rttm_data`` where
        "Turn Onset" and "Turn Duration" are cast to float, plus an
        "End Time" column equal to onset + duration.
    """
    onset = rttm_data["Turn Onset"].astype(float)
    duration = rttm_data["Turn Duration"].astype(float)
    # assign() returns a new frame: existing columns keep their position and
    # "End Time" is appended last, matching the original column order.
    return rttm_data.assign(
        **{
            "Turn Onset": onset,
            "Turn Duration": duration,
            "End Time": onset + duration,
        }
    )


# Plots the timeline of the speakers
def plot_timeline(data):
    """Show a timeline with one horizontal lane per speaker.

    Every row of ``data`` is drawn as a thick line segment running from its
    "Turn Onset" to its "End Time", placed on the lane of its
    "Speaker Name". Lanes are numbered in order of first appearance of each
    speaker. The figure is displayed with ``plt.show()``; nothing is
    returned.

    Args:
        data: DataFrame providing "Turn Onset", "End Time" and
            "Speaker Name" columns.
    """
    _fig, ax = plt.subplots(figsize=(12, 8))

    speakers = data["Speaker Name"].unique()
    lane_of = {name: lane for lane, name in enumerate(speakers)}

    # Walk the three relevant columns in lockstep, one turn at a time.
    turns = zip(data["Turn Onset"], data["End Time"], data["Speaker Name"])
    for onset, finish, name in turns:
        lane = lane_of[name]
        ax.plot([onset, finish], [lane, lane], linewidth=10)

    # Label each lane with the speaker it represents.
    ax.set_yticks(range(len(speakers)))
    ax.set_yticklabels(speakers)

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Speakers")
    ax.set_title("Speaker Timeline")

    plt.show()


def debugger_test():
    """Parse, prepare and plot the single sample RTTM file as a smoke test."""
    sample_path = "../Dataset/RTMS/Dev/abjxc.rttm"
    prepared = prepare_data(parse_rttm(sample_path))
    plot_timeline(prepared)


def debugger_run_all(rttm_dir="../Dataset/RTMS/Dev/"):
    """Visualize every RTTM file found directly inside a folder.

    Each entry of ``rttm_dir`` whose name ends with ".rttm" is parsed,
    prepared and plotted in turn; all other entries are ignored.

    Args:
        rttm_dir: Folder to scan. Defaults to the development RTTM folder
            that was previously hard-coded.
    """
    # os.listdir returns entries in arbitrary order; sort them so the plots
    # always appear in the same sequence.
    for file_name in sorted(os.listdir(rttm_dir)):
        if not file_name.endswith(".rttm"):
            continue
        file_path = os.path.join(rttm_dir, file_name)
        rttm_data = prepare_data(parse_rttm(file_path))
        plot_timeline(rttm_data)

def 


Downloading file 'sorohanro_-_solo-trumpet-06.ogg' from 'https://librosa.org/data/audio/sorohanro_-_solo-trumpet-06.ogg' to 'C:\Users\rakin\AppData\Local\librosa\librosa\Cache'.


MFCC shape: (13, 230)
librosa is working correctly.
