In [1]:
%pip install numpy pandas -q
import numpy as np
import pandas as pd
from constants import *
import sys

Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.DataFrame()

In [3]:
def load(filepath, coeff=ALL_COEFFS):
    """
    :param filepath: path to the file to load
    Organizes the data in the input file into a list of matrices. Each matrix is the data for a spoken digit,
    where the columns are the coefficients and each row is the index of the time window
    """
    
    
    try:
        f = open(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found")
        return None
    
    digits = []
    labels = np.append(['block','utterance','speaker','gender','digit'],coeff)
    utterances = 1
    speaker = 1
    gender = 'm'
    digit = 0
    block = 1
    
    
    global df
    df = pd.DataFrame(columns=labels)
    
    # load the data
    with f:
        print(f"Loading {filepath}")
        for idx, line in enumerate(f):
            if idx == 0:
                continue
            if line.isspace() or line=="\n":
                utterances+=1
                block+=1
                digit = (block-1)//660
                if utterances>10:
                    utterances = 1
                    speaker += 1
                
                if (speaker-1)//33==1:
                    gender = 'f'
                elif (speaker-1)//66==1:
                    gender = 'm'
                    speaker = 1
                
                continue
            entry = getFormattedEntry(line, coeff)
            new_row = [block,utterances, speaker, gender, digit] + entry.tolist()
            entry_df = pd.DataFrame([new_row], columns=df.columns)

            # Append the new row to the main DataFrame
            df = pd.concat([df, entry_df], ignore_index=True)
                

In [4]:
def getFormattedEntry(line, coeff=ALL_COEFFS):
    """
    :param coeff: the coefficients to load
    :return: the columns to load from the data file
    """
    if coeff.all() == ALL_COEFFS.all():
        line = line.strip() # remove leading and trailing whitespace
        arr = np.array(line.split(" "))
        return arr.astype(np.float64)
    else:
        return [i+1 for i in coeff]

In [5]:
def getTrainDataset(coeffs=ALL_COEFFS):
    """
    Loads the training dataset
    """
    return load(TRAIN_FILE,coeffs)

In [6]:
1320 // 660

2

In [7]:
if __name__ == "__main__":
    trainDF = getTrainDataset(coeffs=ALL_COEFFS)

Loading ../data/Train_Arabic_Digit.txt


  df = pd.concat([df, entry_df], ignore_index=True)


In [10]:
df.to_csv("../data/dataframe.csv")

In [23]:
df.columns

Index(['block', 'utterance', 'speaker', 'gender', 'digit', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '10', '11', '12'],
      dtype='object')

In [22]:
df_0 = df[df['digit']==9]
df_0['block'].unique()

array([5941, 5942, 5943, 5944, 5945, 5946, 5947, 5948, 5949, 5950, 5951,
       5952, 5953, 5954, 5955, 5956, 5957, 5958, 5959, 5960, 5961, 5962,
       5963, 5964, 5965, 5966, 5967, 5968, 5969, 5970, 5971, 5972, 5973,
       5974, 5975, 5976, 5977, 5978, 5979, 5980, 5981, 5982, 5983, 5984,
       5985, 5986, 5987, 5988, 5989, 5990, 5991, 5992, 5993, 5994, 5995,
       5996, 5997, 5998, 5999, 6000, 6001, 6002, 6003, 6004, 6005, 6006,
       6007, 6008, 6009, 6010, 6011, 6012, 6013, 6014, 6015, 6016, 6017,
       6018, 6019, 6020, 6021, 6022, 6023, 6024, 6025, 6026, 6027, 6028,
       6029, 6030, 6031, 6032, 6033, 6034, 6035, 6036, 6037, 6038, 6039,
       6040, 6041, 6042, 6043, 6044, 6045, 6046, 6047, 6048, 6049, 6050,
       6051, 6052, 6053, 6054, 6055, 6056, 6057, 6058, 6059, 6060, 6061,
       6062, 6063, 6064, 6065, 6066, 6067, 6068, 6069, 6070, 6071, 6072,
       6073, 6074, 6075, 6076, 6077, 6078, 6079, 6080, 6081, 6082, 6083,
       6084, 6085, 6086, 6087, 6088, 6089, 6090, 60