In [1]:
import numpy as np
import pandas as pd

### scaling for one feature

In [None]:
# scaling for sequential data (pitch & hnr)
def get_train_seq_minMax(train_data):
    """Find the global minimum and maximum over a collection of sequences.

    NaN entries are ignored. Sequences that are empty or consist only of NaNs
    are skipped, so they neither raise (``np.nanmax`` fails on zero-size
    input) nor trigger an "All-NaN slice" warning.

    Args:
        train_data: iterable of array-likes, each holding one sequence of
            numbers (used in this notebook for the PITCH and HNR columns).

    Returns:
        tuple: ``(min, max)`` over every non-NaN value seen. If no such value
        exists, the initial sentinels ``(inf, -inf)`` are returned unchanged.
    """
    overall_max = float('-inf')
    overall_min = float('inf')

    for line in train_data:
        values = np.asarray(line)
        # Drop NaNs up front; what remains may be empty for silent sequences.
        valid = values[~np.isnan(values)]
        if valid.size == 0:
            continue
        line_max = valid.max()
        line_min = valid.min()
        if line_max > overall_max:
            overall_max = line_max
        if line_min < overall_min:
            overall_min = line_min

    print(f"the max value is {overall_max}")
    print(f"the min value is {overall_min}")
    return overall_min, overall_max

def scaling_seq(feature, min, max):
    """Min-max scale every sequence of a feature with one shared pair of bounds.

    Each value ``v`` becomes ``(v - min) / (max - min)``. NaN values stay NaN.
    If ``max == min`` the range is replaced by 1 (the same convention that
    ``scaling_mfcc`` uses), so a constant feature maps to 0 instead of
    producing NaN/inf or a ZeroDivisionError.

    Args:
        feature: iterable of array-likes, one sequence per item.
        min: scalar lower bound (normally fitted on the training split).
        max: scalar upper bound (normally fitted on the training split).

    Returns:
        list of np.ndarray: the scaled sequences, in input order.
    """
    value_range = max - min
    if value_range == 0:
        value_range = 1  # avoid division by zero for a constant feature

    # One vectorized expression per sequence instead of a per-element loop.
    scaled_feature = [(np.asarray(line) - min) / value_range for line in feature]

    return scaled_feature


# scaling for non-sequential data (jitter & shimmer)
def get_train_nonseq_minMax(train_data):
    """Find the per-dimension minimum and maximum of fixed-length vectors.

    The number of dimensions is taken from the first vector produced by
    iterating ``train_data``. This replaces the former ``train_data[0]``
    lookup, which is label-based on a pandas Series and therefore failed on
    any index that does not contain the label 0.

    Comparisons with NaN are always false, so NaN entries never update a
    bound.

    Args:
        train_data: iterable of array-likes, each holding one value per
            dimension (used in this notebook for JITTER and SHIMMER).
            Vectors are expected to be no longer than the first one.

    Returns:
        tuple: ``(min, max)``, two lists with one entry per dimension.

    Raises:
        ValueError: if ``train_data`` contains no vectors.
    """
    dim_max = None
    dim_min = None

    # Track the extremes of every dimension across all vectors.
    for line in train_data:
        if dim_max is None:
            # First vector fixes the dimensionality.
            dim_max = [float('-inf')] * len(line)
            dim_min = [float('inf')] * len(line)
        for i, value in enumerate(line):
            if value > dim_max[i]:
                dim_max[i] = value
            if value < dim_min[i]:
                dim_min[i] = value

    if dim_max is None:
        raise ValueError("train_data is empty; cannot compute min/max")

    print(f"the max value is {dim_max}")
    print(f"the min value is {dim_min}")

    return dim_min, dim_max

def scaling_nonseq(feature, min, max):
    """Min-max scale fixed-length vectors with per-dimension bounds.

    Dimension ``i`` of every vector becomes
    ``(v[i] - min[i]) / (max[i] - min[i])``. A dimension whose range is zero
    is divided by 1 instead (the same convention that ``scaling_mfcc`` uses),
    so a constant dimension maps to 0 rather than NaN/inf.

    Args:
        feature: iterable of array-likes, one vector per item. A vector may
            be shorter than the bounds; only the leading bounds are then used.
        min: sequence of per-dimension lower bounds.
        max: sequence of per-dimension upper bounds.

    Returns:
        list of np.ndarray: the scaled vectors, in input order.
    """
    lower = np.asarray(min)
    span = np.asarray(max) - lower   # new array, safe to modify in place
    span[span == 0] = 1              # avoid division by zero

    scaled_feature = []
    for line in feature:
        row = np.asarray(line)
        used = len(row)
        scaled_feature.append((row - lower[:used]) / span[:used])

    return scaled_feature
    

### scaling for the entire dataframe

In [3]:
# replace the values in the dataframe with the scaled ones
def processing(df, min, max):
    """Return the columns of ``df`` with the four prosody features scaled.

    Args:
        df: DataFrame with the columns AUDIO_ID, LABEL, ATTACK_TYPE, PITCH,
            HNR, JITTER and SHIMMER.
        min: lower bounds, indexed 0..3 in the order pitch, hnr, jitter,
            shimmer.
        max: upper bounds in the same order as ``min``.

    Returns:
        dict: column name -> values. The three identifier columns are passed
        through untouched; the four feature columns hold the scaled data.
    """
    # (progress message, column, is the column sequential?) in bound order.
    plan = (
        ("---processing pitch---", 'PITCH', True),
        ("---processing hnr---", 'HNR', True),
        ("---processing jitter---", 'JITTER', False),
        ("---processing shimmer---", 'SHIMMER', False),
    )

    scaled_columns = {}
    for position, (message, column, is_sequence) in enumerate(plan):
        print(message)
        scale = scaling_seq if is_sequence else scaling_nonseq
        scaled_columns[column] = scale(df[column], min[position], max[position])

    data = {name: df[name] for name in ('AUDIO_ID', 'LABEL', 'ATTACK_TYPE')}
    data.update(scaled_columns)

    return data

### produce new data files

In [4]:
# Parquet files with the prosody features of the train / dev / eval splits.
train_path = '~/TeamLab_phonetics/prosody_features_train_wlabel.parquet'
dev_path = '~/TeamLab_phonetics/prosody_features_dev_wlabel.parquet'
eval_path = '~/TeamLab_phonetics/prosody_features_eval_wlabel.parquet'

# Load each split into its own DataFrame via the pyarrow engine.
df_train = pd.read_parquet(train_path, engine='pyarrow')
df_dev = pd.read_parquet(dev_path, engine='pyarrow')
df_eval = pd.read_parquet(eval_path, engine='pyarrow')

In [5]:
# Fit the min/max bounds on the training split only; the same bounds are
# later applied to every split.
pitch_min, pitch_max = get_train_seq_minMax(df_train['PITCH'])
hnr_min, hnr_max = get_train_seq_minMax(df_train['HNR'])
jitter_min, jitter_max = get_train_nonseq_minMax(df_train['JITTER'])
shimmer_min, shimmer_max = get_train_nonseq_minMax(df_train['SHIMMER'])
# Bundle the bounds in the order processing() indexes them:
# 0 = pitch, 1 = hnr, 2 = jitter, 3 = shimmer.
# NOTE(review): these two assignments rebind the built-in names `min` and
# `max` for the rest of the notebook session, so calls like min(a, b) stop
# working once this cell has run.
min = [pitch_min, hnr_min, jitter_min, shimmer_min]
max = [pitch_max, hnr_max, jitter_max, shimmer_max]

the max value is 599.9943456812225
the min value is 74.92041345964803
the max value is 58.399494463862524
the min value is -200.0
the max value is [0.09882632, 0.0010225201, 0.048410866, 0.07295901, 0.1452326]
the min value is [0.004070737, 2.1154045e-05, 0.00074533327, 0.0013134623, 0.0022359998]
the max value is [0.30648893, 2.348075, 0.22382307, 0.28442547, 0.8268665, 0.6714692]
the min value is [0.028642662, 0.33427465, 0.0052041854, 0.007886035, 0.0037046608, 0.015612557]


In [6]:
# Scale the training split with the bounds stored in `min` / `max` and wrap
# the returned dict in a DataFrame.
train_scaled = processing(df_train, min, max)
df_train_scaled= pd.DataFrame(train_scaled)
# Writing the scaled training split to parquet is currently disabled:
#df_train_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_train_scaled.parquet", engine='pyarrow')

---processing pitch---
---processing hnr---
---processing jitter---
---processing shimmer---


In [20]:
# Scale the dev split with the same `min` / `max` bounds, then write the
# result to parquet.
dev_scaled = processing(df_dev, min, max)
df_dev_scaled= pd.DataFrame(dev_scaled)
df_dev_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_dev_scaled.parquet", engine='pyarrow')

---processing pitch---
---processing hnr---
---processing jitter---
---processing shimmer---


In [21]:
# Scale the eval split with the same `min` / `max` bounds, then write the
# result to parquet.
eval_scaled = processing(df_eval, min, max)
df_eval_scaled= pd.DataFrame(eval_scaled)
df_eval_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_eval_scaled.parquet", engine='pyarrow')

---processing pitch---
---processing hnr---
---processing jitter---
---processing shimmer---


### Inspection of the new dataframes

In [None]:
# Print pandas' describe() summary of the original (unscaled) training PITCH column.
print(df_train['PITCH'].describe())

In [None]:
# Print pandas' describe() summary of the scaled training PITCH column for comparison.
print(df_train_scaled['PITCH'].describe())

In [7]:
# Print the first rows and the describe() summary of the scaled training DataFrame.
print(df_train_scaled.head())
print(df_train_scaled.describe())

       AUDIO_ID  LABEL ATTACK_TYPE  \
0  LA_T_1000137      0         A04   
1  LA_T_1000406      1           -   
2  LA_T_1000648      0         A01   
3  LA_T_1000824      0         A04   
4  LA_T_1001074      0         A03   

                                               PITCH  \
0  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
1  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2  [nan, nan, nan, nan, nan, 0.35835335634967486,...   
3  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
4  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   

                                                 HNR  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7987432090825...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                                              JITTER  \
0  [0.30073947, 0.23022015, 0.26707897, 0

### Scaling for MFCC

In [None]:
def get_row_minMax(train_data_row):
    """Find the overall minimum and maximum of a collection of numeric lines.

    NaN entries are ignored. Lines that are empty or consist only of NaNs
    are skipped, so they neither raise (``np.nanmax`` fails on zero-size
    input) nor trigger an "All-NaN slice" warning.

    Args:
        train_data_row: iterable of array-likes (for example the rows of a
            2-D array, or a list of 1-D arrays).

    Returns:
        tuple: ``(min, max)`` over every non-NaN value seen. If no such value
        exists, the initial sentinels ``(inf, -inf)`` are returned unchanged.
    """
    overall_max = float('-inf')
    overall_min = float('inf')
    for line in train_data_row:
        values = np.asarray(line)
        valid = values[~np.isnan(values)]  # ignore nans
        if valid.size == 0:
            continue
        line_max = valid.max()
        line_min = valid.min()
        if line_max > overall_max:
            overall_max = line_max
        if line_min < overall_min:
            overall_min = line_min
    return overall_min, overall_max

def get_train_mfcc_minMax(train_data):
    """Collect per-row min/max bounds over all training MFCC arrays.

    ``scaling_mfcc`` indexes the returned lists by the *row* of an MFCC array
    (``min_list[:num_rows]``), so entry ``k`` must be the bound of row ``k``
    across the whole training set. The previous implementation appended one
    min/max per training *array*, so row ``k`` ended up being scaled with the
    bounds of training sample ``k``.

    Assumes every entry of ``train_data['MFCC']`` is a 2-D array laid out as
    (rows, frames), like the arrays ``scaling_mfcc`` processes -- TODO confirm
    for the training pickle.

    NaN values are ignored. A row that never holds a non-NaN value keeps the
    sentinels ``inf`` / ``-inf``.

    Args:
        train_data: mapping/DataFrame with an ``'MFCC'`` entry that iterates
            over the training MFCC arrays.

    Returns:
        tuple: ``(min_list, max_list)``, each with one entry per array row.
    """
    min_list = []
    max_list = []
    for mfcc in train_data['MFCC']:
        mfcc = np.asarray(mfcc)
        missing = np.isnan(mfcc)
        # Replace NaNs by the reduction's identity so they can never win;
        # `initial` also keeps zero-frame arrays from raising.
        row_mins = np.min(np.where(missing, np.inf, mfcc), axis=1, initial=np.inf)
        row_maxs = np.max(np.where(missing, -np.inf, mfcc), axis=1, initial=-np.inf)

        for row_idx in range(len(row_mins)):
            if row_idx == len(min_list):
                # First time this row index is seen: start tracking it.
                min_list.append(row_mins[row_idx])
                max_list.append(row_maxs[row_idx])
                continue
            if row_mins[row_idx] < min_list[row_idx]:
                min_list[row_idx] = row_mins[row_idx]
            if row_maxs[row_idx] > max_list[row_idx]:
                max_list[row_idx] = row_maxs[row_idx]

    print(min_list)
    print(max_list)
    return min_list, max_list

In [None]:
def scaling_mfcc(train, mfcc_df, verbose=True):
    """Min-max scale every MFCC array of ``mfcc_df`` row by row.

    The bounds come from ``get_train_mfcc_minMax(train)`` and are recomputed
    on every call. Row ``k`` of each array is scaled with entry ``k`` of the
    returned min/max lists; a zero range is replaced by 1 to avoid division
    by zero.

    NOTE(review): this relies on ``get_train_mfcc_minMax`` returning one
    min/max per array row -- verify against that function.

    Args:
        train: training data passed on to ``get_train_mfcc_minMax``.
        mfcc_df: DataFrame with the columns ``'MFCC'`` (arrays exposing
            ``.shape``, scaled along their first axis) and ``'AUDIO_ID'``.
        verbose: print one progress line per array (default keeps the
            original output).

    Returns:
        dict: ``{"AUDIO_ID": mfcc_df["AUDIO_ID"], "MFCC": [scaled arrays]}``.
        For an empty ``mfcc_df`` the ``"MFCC"`` list is empty.

    Raises:
        ValueError: if an array has more rows than there are bounds.
    """
    min_list, max_list = get_train_mfcc_minMax(train)
    # Convert once; the loop only takes slices of these arrays.
    train_mins = np.asarray(min_list)
    train_maxs = np.asarray(max_list)

    mfcc_scaled = []
    for i, arr in enumerate(mfcc_df['MFCC']):
        # Determine how many rows this specific array has
        num_rows = arr.shape[0]
        if verbose:
            print(f"\nProcessing array #{i+1} with {num_rows} rows.")

        if num_rows > len(train_mins):
            raise ValueError(
                f"array #{i+1} has {num_rows} rows but only "
                f"{len(train_mins)} min/max values are available"
            )

        # Column vectors of shape (num_rows, 1) broadcast across the frames.
        mins_col = train_mins[:num_rows, np.newaxis]
        maxs_col = train_maxs[:num_rows, np.newaxis]

        # The subtraction yields a fresh array, so editing it in place is safe.
        range_col = maxs_col - mins_col
        range_col[range_col == 0] = 1  # Avoid division by zero

        mfcc_scaled.append((arr - mins_col) / range_col)

    # Built once after the loop so it also exists when mfcc_df is empty.
    data = {"AUDIO_ID": mfcc_df["AUDIO_ID"],
            "MFCC": mfcc_scaled}
    return data

In [4]:
# Pickled DataFrames with the MFCC features of the train and eval splits.
# NOTE(review): these are absolute, user-specific paths, whereas the parquet
# cells use '~/TeamLab_phonetics/...'.
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles from a trusted source.
mfcc_train_path = "/home/users1/liqe/TeamLab_phonetics/mfcc_train_df.pkl"
mfcc_eval_path = "/home/users1/liqe/TeamLab_phonetics/mfcc_eval_df.pkl"
mfcc_train_df = pd.read_pickle(mfcc_train_path)
mfcc_eval_df = pd.read_pickle(mfcc_eval_path)

In [10]:
# Sanity check: print whether any MFCC entry is a str (eval first, then train).
print(mfcc_eval_df['MFCC'].apply(lambda x: isinstance(x, str)).any())
print(mfcc_train_df['MFCC'].apply(lambda x: isinstance(x, str)).any())

False
False


In [6]:
# Scale the eval MFCCs with bounds taken from the training MFCCs and wrap the
# returned dict in a DataFrame right away, so the `_df` name never holds a dict.
mfcc_scaled_df = pd.DataFrame(scaling_mfcc(mfcc_train_df, mfcc_eval_df))
# Store the scaled arrays as float32.
mfcc_scaled_df['MFCC'] = mfcc_scaled_df['MFCC'].apply(lambda arr: arr.astype(np.float32))
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (doing so appended a stray "None" line to the output).
mfcc_scaled_df.info()
# print(mfcc_scaled_df.describe())
print(mfcc_scaled_df.head())


Processing array #1 with 60 rows.

Processing array #2 with 60 rows.

Processing array #3 with 60 rows.

Processing array #4 with 60 rows.

Processing array #5 with 60 rows.

Processing array #6 with 60 rows.

Processing array #7 with 60 rows.

Processing array #8 with 60 rows.

Processing array #9 with 60 rows.

Processing array #10 with 60 rows.

Processing array #11 with 60 rows.

Processing array #12 with 60 rows.

Processing array #13 with 60 rows.

Processing array #14 with 60 rows.

Processing array #15 with 60 rows.

Processing array #16 with 60 rows.

Processing array #17 with 60 rows.

Processing array #18 with 60 rows.

Processing array #19 with 60 rows.

Processing array #20 with 60 rows.

Processing array #21 with 60 rows.

Processing array #22 with 60 rows.

Processing array #23 with 60 rows.

Processing array #24 with 60 rows.

Processing array #25 with 60 rows.

Processing array #26 with 60 rows.

Processing array #27 with 60 rows.

Processing array #28 with 60 rows.



In [8]:
# Persist the scaled eval MFCC DataFrame as a pickle file.
mfcc_scaled_df.to_pickle("/home/users1/liqe/TeamLab_phonetics/mfcc_eval_scaled.pkl")