In [None]:
%pip install numpy pandas -q
import numpy as np
import pandas as pd
from constants import *
import sys
from python_speech_features import delta
from time import time

Note: you may need to restart the kernel to use updated packages.


In [None]:
def loadTrain(filepath, coeff=ALL_COEFFS):
    """
    :param filepath: path to the file to load
    Organizes the data in the input file into a list of matrices. Each matrix is the data for a spoken digit,
    where the columns are the coefficients and each row is the index of the time window
    """
    timeStart = time()
    try:
        f = open(filepath)
        print(f"Loading {filepath}")
    except FileNotFoundError:
        print(f"File {filepath} not found")
        return None
    
    digits = []
    labels = np.append(['block','utterance','speaker','gender','digit'],coeff)
    utterances = 1
    speaker = 1
    gender = 'm'
    digit = 0
    block = 1
    
    
    df = pd.DataFrame(columns=labels)
    
    # load the data
    with f:
        print(f"Loading {filepath}")
        for idx, line in enumerate(f):
            if idx == 0:
                continue
            if line.isspace() or line=="\n":
                utterances+=1
                block+=1
                digit = (block-1)//660
                if utterances>10:
                    utterances = 1
                    speaker += 1
                
                if (speaker-1)//33==1:
                    gender = 'f'
                elif (speaker-1)//66==1:
                    gender = 'm'
                    speaker = 1
                
                continue
            entry = getFormattedEntry(line, coeff)
            new_row = [block,utterances, speaker, gender, digit] + entry.tolist()
            entry_df = pd.DataFrame([new_row], columns=df.columns)

            # Append the new row to the main DataFrame
            df = pd.concat([df, entry_df], ignore_index=True)
    
    timeEnd = time()
    print("COMPLETED LOADING TRAINING DATA")
    print(f"\t\tLoaded {len(df)} entries in {timeEnd-timeStart:.2f} seconds")
    return df
                

In [14]:
def loadTest(filepath, coeff=ALL_COEFFS):
    """
    :param filepath: path to the file to load
    Organizes the test data in the input file into a list of matrices. Each matrix is the data for a spoken digit,
    where the columns are the coefficients and each row is the index of the time window.
    Handles test-specific structure: 220 blocks per digit (110 male, 110 female).
    """
    try:
        f = open(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found")
        return None

    digits = []
    labels = np.append(['block', 'utterance', 'speaker', 'gender', 'digit'], coeff)
    utterances = 1
    speaker = 1
    gender = 'm'
    digit = 0
    block = 1

    df = pd.DataFrame(columns=labels)

    # Load the data
    with f:
        print(f"Loading {filepath}")
        for idx, line in enumerate(f):
            if idx == 0:
                continue
            if line.isspace() or line == "\n":
                utterances += 1
                block += 1
                digit = (block - 1) // 220  # Each digit has 220 blocks in total

                if utterances > 10:
                    utterances = 1
                    speaker += 1

                # Determine gender based on speaker index
                if (speaker - 1) // 11 == 1:  # First 11 speakers are male, next 11 are female
                    gender = 'f'
                elif (speaker - 1) // 22 == 1:  # Reset to male after every set of speakers
                    gender = 'm'
                    speaker = 1

                continue

            # Process each line and add it to the DataFrame
            entry = getFormattedEntry(line, coeff)
            new_row = [block, utterances, speaker, gender, digit] + entry.tolist()
            entry_df = pd.DataFrame([new_row], columns=df.columns)

            # Append the new row to the main DataFrame
            df = pd.concat([df, entry_df], ignore_index=True)

    print("COMPLETED LOADING TEST DATA")
    print(f"\t\tLoaded {len(df)} entries")
    return df

In [15]:
def getFormattedEntry(line, coeff=ALL_COEFFS):
    """
    :param coeff: the coefficients to load
    :return: the columns to load from the data file
    """
    if coeff.all() == ALL_COEFFS.all():
        line = line.strip() # remove leading and trailing whitespace
        arr = np.array(line.split(" "))
        return arr.astype(np.float64)
    else:
        return [i+1 for i in coeff]

In [16]:
if __name__ == "__main__":
    trainDF = loadTrain(TRAIN_FILE, coeff=ALL_COEFFS)
    testDF = loadTest(TEST_FILE, coeff=ALL_COEFFS)

Loading ../data/Train_Arabic_Digit.txt


  df = pd.concat([df, entry_df], ignore_index=True)


COMPLETED LOADING TRAINING DATA
		Loaded 263256 entries
Loading ../data/Test_Arabic_Digit.txt


  df = pd.concat([df, entry_df], ignore_index=True)


COMPLETED LOADING TEST DATA
		Loaded 87063 entries


In [17]:
trainDF.head()

Unnamed: 0,block,utterance,speaker,gender,digit,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,1,1,m,0,-0.81101,-7.2382,1.5429,-0.64774,1.4271,0.61356,0.36516,0.088906,0.47031,0.98844,0.044692,0.20817,0.5114
1,1,1,1,m,0,-0.37028,-7.1336,1.8856,-0.34316,0.96733,0.32763,0.42988,0.50479,0.41533,0.28804,0.086109,0.6269,0.78115
2,1,1,1,m,0,0.59659,-8.3059,1.6943,-0.66611,0.34967,-0.17425,0.82077,1.2611,0.41653,0.5005,0.57163,0.45316,0.64465
3,1,1,1,m,0,1.4585,-8.1957,1.8454,-1.1496,0.8266,-0.51313,0.067443,0.25637,0.115,-0.10915,0.085991,0.69064,0.33769
4,1,1,1,m,0,2.0824,-8.667,1.1995,-1.124,1.2445,-0.10251,0.99867,0.57174,1.0384,0.17564,-0.032857,0.53229,0.32941


In [18]:
testDF.head()

Unnamed: 0,block,utterance,speaker,gender,digit,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,1,1,m,0,1.2572,-8.2449,0.8483,-1.5782,0.4736,-0.063273,0.42481,0.50017,0.7042,0.28973,0.076053,0.025883,-0.22968
1,1,1,1,m,0,3.3638,-9.0154,1.4104,-1.5884,1.3725,-0.33481,1.0529,0.89804,0.79525,0.74112,-0.15351,0.51718,0.44204
2,1,1,1,m,0,3.4461,-9.4871,1.3425,-1.4066,1.4422,0.12447,0.58199,0.88984,0.9026,0.11521,-0.047091,0.40989,0.81545
3,1,1,1,m,0,4.773,-10.007,1.2143,-2.0118,2.0864,0.28562,0.52868,0.33971,1.1688,0.42569,-0.47099,0.5086,-0.33061
4,1,1,1,m,0,5.3317,-9.6834,1.5131,-2.3545,1.6933,0.066311,-0.088666,0.16826,0.24546,-0.34749,-0.098748,0.81093,-0.12837


In [19]:
# print the shape of the dataframes and statistics
print("Train data shape: ", trainDF.shape)
print("Test data shape: ", testDF.shape)
print("Train data statistics: ", trainDF.describe())
print("Test data statistics: ", testDF.describe())
print("Train data info: ", trainDF.info())
print("Test data info: ", testDF.info())

Train data shape:  (263256, 18)
Test data shape:  (87063, 18)
Train data statistics:                     0              1              2              3  \
count  263256.000000  263256.000000  263256.000000  263256.000000   
mean        1.980291      -3.079727      -0.360437      -1.146497   
std         2.750261       2.127967       1.736583       1.295992   
min       -10.435000     -12.957000      -6.661500      -5.600300   
25%         0.542065      -4.031600      -1.546500      -2.162800   
50%         2.382500      -2.764100      -0.050925      -1.134700   
75%         3.903800      -1.834200       0.911198      -0.105610   
max         9.155000       9.341600       5.324700       4.501400   

                   4              5              6              7  \
count  263256.000000  263256.000000  263256.000000  263256.000000   
mean       -0.688541      -0.617507      -0.281008      -0.431595   
std         1.071546       0.872268       0.729381       0.835536   
min        -5.49

In [20]:
trainDF.to_csv('../data/train.csv', index=False)
testDF.to_csv('../data/test.csv', index=False)
print("Data saved to CSV files.")

Data saved to CSV files.
