In [1]:
%pip install numpy pandas -q
import numpy as np
import pandas as pd
from constants import *
import sys
from python_speech_features import delta
from time import time
from tqdm.notebook import tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
def getFormattedEntry(line, coeff=ALL_COEFFS):
    """
    Processes a line of text from the data file and extracts the relevant coefficients.

    :param line: The line from the data file containing space-separated coefficients.
    :param coeff: The coefficients to load. If all coefficients are required, use `ALL_COEFFS`.
    :return: A numpy array of the selected coefficients, as type float64.
    """
    # Strip leading and trailing whitespace from the line
    line = line.strip()

    # Convert the line into a numpy array of floats
    arr = np.array(line.split(), dtype=np.float64)

    if np.array_equal(coeff, ALL_COEFFS):
        # If the entire set of coefficients is needed, return the whole array
        return arr
    else:
        # If only a subset of coefficients is required, return the corresponding elements
        return arr[coeff].astype(np.float64)

In [3]:
def loadTrain(filepath, coeff=ALL_COEFFS):
    """
    :param filepath: path to the file to load
    Organizes the data in the input file into a list of matrices. Each row in the resulting DataFrame
    includes metadata (block, utterance, speaker, gender, digit) and the MFCC coefficients for each frame.
    
    :param coeff: The coefficients to load. If all coefficients are required, use `ALL_COEFFS`.
    
    :return: A DataFrame containing the loaded data, with columns for block, utter
    """
    
    timeStart = time()  # Start time for loading data
    
    try:
        with open(filepath) as f:
            print(f"Loading {filepath}")
    except FileNotFoundError:
        print(f"File {filepath} not found")
        return None
    
    # Define columns for the output DataFrame
    labels = ['block', 'utterance', 'speaker', 'gender', 'digit', 'time_window'] + list(map(str, coeff))
    
    # Initialize metadata variables
    utterances = 1
    speaker = 1
    gender = 'm'
    digit = 0
    block = 1
    time_window = 0
    
    rows = []  # List to accumulate rows

    # Open file again to process lines with tqdm
    print(f"Processing {filepath}")
    
    with open(filepath, 'r') as f:
        for idx, line in tqdm(enumerate(f), desc="Processing training data", file=sys.stdout, colour="green", position=0, dynamic_ncols=True):
            # Skip the first line if it's a header
            if idx == 0:
                continue
            
            # Check for blank lines indicating a new block
            if line.isspace() or line == "\n":
                utterances += 1
                block += 1
                digit = (block - 1) // 660  # Compute digit index based on block count
                
                # Reset utterances and handle speaker/gender transitions
                if utterances > 10:
                    utterances = 1
                    speaker += 1
                
                # Determine gender based on the speaker
                if (speaker - 1) // 33 == 1:
                    gender = 'f'
                elif (speaker - 1) // 66 == 1:
                    gender = 'm'
                    speaker = 1
                
                # Reset time window for the new block
                time_window = 0
                continue

            entry = getFormattedEntry(line, coeff)  # Convert line to array of coefficients
            
            new_row = [block, utterances, speaker, gender, digit, time_window] + entry.tolist()
            rows.append(new_row)
            
            time_window += 1
    
    df = pd.DataFrame(rows, columns=labels)
    
    timeEnd = time()
    print("COMPLETED LOADING TRAINING DATA")
    print(f"\t\tLoaded {len(df)} entries in {timeEnd - timeStart:.2f} seconds")
    return df


In [4]:
def loadTest(filepath, coeff=ALL_COEFFS):
    """
    Optimized version for loading test data into a DataFrame.
    
    :param filepath: path to the file to load
    :param coeff: The coefficients to load. If all coefficients are required, use `ALL_COEFFS`.
    
    :return: A DataFrame containing the loaded data, with columns for block, utterance, speaker, gender, digit, and time_window, along with the selected MFCC coefficients.
    """
    
    timeStart = time()  # Start timer
    
    try:
        with open(filepath) as f:
            print(f"Loading {filepath}")
    except FileNotFoundError:
        print(f"File {filepath} not found")
        return None
    
    # Define the labels for the DataFrame columns
    labels = ['block', 'utterance', 'speaker', 'gender', 'digit', 'time_window'] + list(map(str, coeff))
    
    # Initialize variables
    utterances = 1
    speaker = 1
    gender = 'm'
    digit = 0
    block = 1
    time_window = 0

    # Use a list to collect rows
    rows = []

    # Efficiently count lines once before processing
    with open(filepath, 'r') as f:
        total_lines = sum(1 for _ in f)

    # Load data with progress bar using the pre-counted total_lines
    print(f"Processing {filepath}")
    with open(filepath, 'r') as f:
        for idx, line in tqdm(enumerate(f), desc="Processing lines", total=total_lines, colour="green", position=0, dynamic_ncols=True):
            # Skip header
            if idx == 0:
                continue

            # Handle blank lines indicating a new block
            if line.isspace() or line == "\n":
                utterances += 1
                block += 1
                digit = (block - 1) // 220  # Update digit after every 220 blocks
                
                # Reset utterances and handle speaker/gender transitions
                if utterances > 10:
                    utterances = 1
                    speaker += 1

                # Determine gender by speaker group (male/female alternation every 11 speakers)
                gender = 'f' if (speaker - 1) // 11 % 2 else 'm'

                # Reset time window for new block
                time_window = 0
                continue

            # Process the line (frame) to get the coefficients
            entry = getFormattedEntry(line, coeff)  # Convert line to array of coefficients
            
            # Append the row with metadata and coefficients
            rows.append([block, utterances, speaker, gender, digit, time_window] + entry.tolist())

            # Increment time window for the current block
            time_window += 1

    # Convert list of rows into a DataFrame
    df = pd.DataFrame(rows, columns=labels)

    timeEnd = time()
    print("COMPLETED LOADING TEST DATA")
    print(f"\t\tLoaded {len(df)} entries in {timeEnd - timeStart:.2f} seconds")
    return df

In [5]:
if __name__ == "__main__":
    trainDF = loadTrain(TRAIN_FILE, coeff=ALL_COEFFS)
    testDF = loadTest(TEST_FILE, coeff=ALL_COEFFS)

Loading ../data/Train_Arabic_Digit.txt
Processing ../data/Train_Arabic_Digit.txt


Processing training data: 0it [00:00, ?it/s]

COMPLETED LOADING TRAINING DATA
		Loaded 263256 entries in 1.86 seconds
Loading ../data/Test_Arabic_Digit.txt
Processing ../data/Test_Arabic_Digit.txt


Processing lines:   0%|          | 0/89263 [00:00<?, ?it/s]

COMPLETED LOADING TEST DATA
		Loaded 87063 entries in 0.55 seconds


In [6]:
trainDF.head()

Unnamed: 0,block,utterance,speaker,gender,digit,time_window,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,1,1,m,0,0,-0.81101,-7.2382,1.5429,-0.64774,1.4271,0.61356,0.36516,0.088906,0.47031,0.98844,0.044692,0.20817,0.5114
1,1,1,1,m,0,1,-0.37028,-7.1336,1.8856,-0.34316,0.96733,0.32763,0.42988,0.50479,0.41533,0.28804,0.086109,0.6269,0.78115
2,1,1,1,m,0,2,0.59659,-8.3059,1.6943,-0.66611,0.34967,-0.17425,0.82077,1.2611,0.41653,0.5005,0.57163,0.45316,0.64465
3,1,1,1,m,0,3,1.4585,-8.1957,1.8454,-1.1496,0.8266,-0.51313,0.067443,0.25637,0.115,-0.10915,0.085991,0.69064,0.33769
4,1,1,1,m,0,4,2.0824,-8.667,1.1995,-1.124,1.2445,-0.10251,0.99867,0.57174,1.0384,0.17564,-0.032857,0.53229,0.32941


In [7]:
testDF.head()

Unnamed: 0,block,utterance,speaker,gender,digit,time_window,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,1,1,m,0,0,1.2572,-8.2449,0.8483,-1.5782,0.4736,-0.063273,0.42481,0.50017,0.7042,0.28973,0.076053,0.025883,-0.22968
1,1,1,1,m,0,1,3.3638,-9.0154,1.4104,-1.5884,1.3725,-0.33481,1.0529,0.89804,0.79525,0.74112,-0.15351,0.51718,0.44204
2,1,1,1,m,0,2,3.4461,-9.4871,1.3425,-1.4066,1.4422,0.12447,0.58199,0.88984,0.9026,0.11521,-0.047091,0.40989,0.81545
3,1,1,1,m,0,3,4.773,-10.007,1.2143,-2.0118,2.0864,0.28562,0.52868,0.33971,1.1688,0.42569,-0.47099,0.5086,-0.33061
4,1,1,1,m,0,4,5.3317,-9.6834,1.5131,-2.3545,1.6933,0.066311,-0.088666,0.16826,0.24546,-0.34749,-0.098748,0.81093,-0.12837


In [8]:
# print the shape of the dataframes and statistics
print("Train data shape: ", trainDF.shape)
print("Test data shape: ", testDF.shape)
print("Train data statistics: ", trainDF.describe())
print("Test data statistics: ", testDF.describe())
print("Train data info: ", trainDF.info())
print("Test data info: ", testDF.info())

Train data shape:  (263256, 19)
Test data shape:  (87063, 19)
Train data statistics:                 block      utterance        speaker          digit  \
count  263256.000000  263256.000000  263256.000000  263256.000000   
mean     3375.131932       5.498534      34.104510       4.603922   
std      1877.991353       2.873385      18.801597       2.830455   
min         1.000000       1.000000       1.000000       0.000000   
25%      1795.000000       3.000000      18.000000       2.000000   
50%      3326.000000       5.000000      35.000000       5.000000   
75%      5045.000000       8.000000      50.000000       7.000000   
max      6600.000000      10.000000      66.000000       9.000000   

         time_window              0              1              2  \
count  263256.000000  263256.000000  263256.000000  263256.000000   
mean       20.395831       1.980291      -3.079727      -0.360437   
std        13.092223       2.750261       2.127967       1.736583   
min         0.00

In [9]:
trainDF.to_csv('../data/train.csv', index=False)
testDF.to_csv('../data/test.csv', index=False)
print("Data saved to CSV files.")

Data saved to CSV files.


### Data aggregation
This dataset have the MFCC features of the audio files. To get the full understanding of the audio files, we need to aggregate the MFCC features that are part of a same audio file (same speaker, same utterance, same digit).

In [10]:
# Group by 'utterance', 'speaker', and 'digit', and sort by 'time_window' to preserve the time series
trainDF_grouped = trainDF.sort_values(by='time_window').groupby(['utterance', 'speaker', 'digit', 'gender'])

# Apply the transformation for each group to get a list of coefficients
trainDF_grouped_df = trainDF_grouped.apply(lambda x: x[[str(i) for i in ALL_COEFFS]].values.tolist()).reset_index(name='coefficients')

# The resulting DataFrame will have 'utterance', 'speaker', 'digit' and the list of coefficients as 'coefficients'
trainDF_grouped_df.head()


  trainDF_grouped_df = trainDF_grouped.apply(lambda x: x[[str(i) for i in ALL_COEFFS]].values.tolist()).reset_index(name='coefficients')


Unnamed: 0,utterance,speaker,digit,gender,coefficients
0,1,1,0,m,"[[-0.81101, -7.2382, 1.5429, -0.64774, 1.4271,..."
1,1,1,1,m,"[[0.088296, -1.3841, 1.5712, 0.015122, -0.7615..."
2,1,1,2,m,"[[2.4273, -3.0185, 1.977, 1.5598, -2.4448, -1...."
3,1,1,3,m,"[[1.3635, -5.9484, 0.22766, -0.31943, -0.16718..."
4,1,1,4,m,"[[0.48353, -0.96949, -0.030561, -2.4044, 0.583..."


In [11]:
grouped = testDF.sort_values(by='time_window').groupby(['utterance', 'speaker', 'digit', 'gender'])
testDF_grouped_df = grouped.apply(lambda x: x[[str(i) for i in ALL_COEFFS]].values.tolist()).reset_index(name='coefficients')
testDF_grouped_df.head()

  testDF_grouped_df = grouped.apply(lambda x: x[[str(i) for i in ALL_COEFFS]].values.tolist()).reset_index(name='coefficients')


Unnamed: 0,utterance,speaker,digit,gender,coefficients
0,1,1,0,m,"[[1.2572, -8.2449, 0.8483, -1.5782, 0.4736, -0..."
1,1,2,0,m,"[[-0.62166, -6.809, 1.1914, -0.24391, 0.71214,..."
2,1,3,0,m,"[[-1.5878, -5.7215, 2.5979, -0.40329, 1.8357, ..."
3,1,4,0,m,"[[-2.3541, -4.4249, 2.3098, 0.32279, 0.52037, ..."
4,1,5,0,m,"[[-0.57648, -6.4786, 1.5552, -0.84299, 0.61214..."


In [14]:
print(f"Train data shape: {trainDF_grouped_df.shape}")
print(f"Test data shape: {testDF_grouped_df.shape}")

Train data shape: (6600, 5)
Test data shape: (2200, 5)


Therefore, we have 8800 recordings as described in the dataset description.

In [15]:
print("Saving aggregated data to CSV files...")
trainDF_grouped_df.to_csv('../data/train_grouped.csv', index=False)
testDF_grouped_df.to_csv('../data/test_grouped.csv', index=False)
print("Data saved to CSV files.")

Saving aggregated data to CSV files...
Data saved to CSV files.
