In [1]:
import numpy as np
import pandas as pd

### scaling for one feature

In [2]:
# scaling for sequential data (pitch & hnr)
def scaling_seq(feature):
    max = float('-inf')
    min = float('inf')

    # find the max and min
    # each line is np.ndarray containing a sequence of floats
    for line in feature:
        max_line = np.nanmax(line)  # ignore nans
        min_line = np.nanmin(line)
        if max_line > max:
            max = max_line
        if min_line < min:
            min = min_line
    print(f"the max value is {max}")
    print(f"the min value is {min}")

    ### faster if the size of arrays is not that big
    # all_values = np.concatenate([arr for arr in feature if arr.size > 0])
    # min = np.min(all_values)
    # max = np.max(all_values)
    
    # scaling
    scaled_feature = []
    difference = max - min
    for line in feature:
        scaled_list = []
        for i in range(len(line)):
            scaled_list.append((line[i] - min) / difference)
        scaled_feature.append(np.array(scaled_list))

    return(scaled_feature)


# scaling for non-sequential data (jitter & shimmer)
def scaling_nonseq(feature):
    dim = len(feature[0])
    max = [float('-inf')]*dim
    min = [float('inf')]*dim

    # find the max and min for each dimasion
    # each line is np.array containing floats in diffrent dim
    for line in feature:
        for i in range(len(line)):
            if line[i] > max[i]:
                max[i] = line[i]
            if line[i] < min[i]:
                min[i] = line[i]
    print(f"the max value is {max}")
    print(f"the min value is {min}")

    # scaling
    scaled_feature = []
    difference = [a - b for a,b in zip(max, min)]
    for line in feature:
        scaled_list = []
        for i in range(len(line)):
            scaled_list.append((line[i] - min[i]) / difference[i])
        scaled_feature.append(np.array(scaled_list))

    return scaled_feature
    

### scaling for the entire dataframe

In [3]:
# replace the values in the dataframe with the scaled ones
def processing(df):
    print(f"---processing pitch---")
    pitch_scaled = scaling_seq(df['PITCH'])
    print(f"---processing hnr---")
    hnr_scaled = scaling_seq(df['HNR'])
    print(f"---processing jitter---")
    jitter_scaled = scaling_nonseq(df['JITTER'])
    print(f"---processing shimmer---")
    shimmer_scaled = scaling_nonseq(df['SHIMMER'])

    data = {'AUDIO_ID': df['AUDIO_ID'],
            'PITCH': pitch_scaled,
            'HNR': hnr_scaled,
            'JITTER': jitter_scaled,
            'SHIMMER': shimmer_scaled}
    
    return data

### produce new data files

In [6]:
train_path = '~/TeamLab_phonetics/prosody_features_train.parquet'
df_train = pd.read_parquet(train_path, engine='pyarrow')
train_scaled = processing(df_train)
df_train_scaled= pd.DataFrame(train_scaled)
#df_train_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_train_scaled.parquet", engine='pyarrow')

---processing pitch---
the max value is 599.9943456812225
the min value is 74.92041345964803
---processing hnr---
the max value is 58.399494463862524
the min value is -200.0
---processing jitter---
the max value is [0.09882632, 0.0010225201, 0.048410866, 0.07295901, 0.1452326]
the min value is [0.004070737, 2.1154045e-05, 0.00074533327, 0.0013134623, 0.0022359998]
---processing shimmer---
the max value is [0.30648893, 2.348075, 0.22382307, 0.28442547, 0.8268665, 0.6714692]
the min value is [0.028642662, 0.33427465, 0.0052041854, 0.007886035, 0.0037046608, 0.015612557]


In [5]:

dev_path = '~/TeamLab_phonetics/prosody_features_dev.parquet'
df_dev = pd.read_parquet(dev_path, engine='pyarrow')
dev_scaled = processing(df_dev)
df_dev_scaled= pd.DataFrame(dev_scaled)
df_dev_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_dev_scaled.parquet", engine='pyarrow')

---processing pitch---
the max value is 599.9929872589257
the min value is 74.93587631992636
---processing hnr---
the max value is 59.098423647801646
the min value is -200.0
---processing jitter---
the max value is [0.08257678, 0.0007544664, 0.049969174, 0.053996816, 0.14990751]
the min value is [0.0038994069, 1.7135175e-05, 0.0009003427, 0.0010258049, 0.0027010283]
---processing shimmer---
the max value is [0.2777213, 2.1647418, 0.1822776, 0.26462653, 0.73973536, 0.5468328]
the min value is [0.03461318, 0.41222075, 0.004212566, 0.008448172, 0.003971915, 0.012637698]


In [6]:
eval_path = '~/TeamLab_phonetics/prosody_features_eval.parquet'
df_eval = pd.read_parquet(eval_path, engine='pyarrow')
eval_scaled = processing(df_eval)
df_eval_scaled= pd.DataFrame(eval_scaled)
df_eval_scaled.to_parquet("~/TeamLab_phonetics/prosody_features_eval_scaled.parquet", engine='pyarrow')

---processing pitch---


  max_line = np.nanmax(line)  # ignore nans
  min_line = np.nanmin(line)


the max value is 599.9972615174114
the min value is 74.87789784576898
---processing hnr---
the max value is 72.20490462705462
the min value is -200.0
---processing jitter---
the max value is [0.115006894, 0.00097466557, 0.054002304, 0.083621025, 0.16200691]
the min value is [0.0028957126, 1.42644185e-05, 0.0010090551, 0.0014599963, 0.003027165]
---processing shimmer---
the max value is [0.286747, 2.5312984, 0.1651646, 0.30076027, 0.8187512, 0.49549383]
the min value is [0.031257696, 0.304777, 0.0063738464, 0.0011641324, 0.0012013044, 0.019121539]


### Inspection of the new dataframes

In [None]:
print(df_train['PITCH'].describe())

In [None]:
print(df_train_scaled['PITCH'].describe())

In [None]:
print(df_train_scaled.head())
print(df_train_scaled.describe())

       AUDIO_ID                                              PITCH  \
0  LA_T_1000137  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
1  LA_T_1000406  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2  LA_T_1000648  [nan, nan, nan, nan, nan, 0.35835335634967486,...   
3  LA_T_1000824  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
4  LA_T_1001074  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   

                                                 HNR  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7987432090825...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                                              JITTER  \
0  [0.30073947, 0.23022015, 0.26707897, 0.2538646...   
1  [0.1494679, 0.09972349, 0.086890295, 0.0791289...   
2  [0.13972145, 0.05122372, 0.123514704, 0.096691...   
3  [0.28089172, 0.