In [1]:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt


# Labels applied to the columns of the raw data file, in file order.
# NOTE(review): 'Harmonocity_15' is spelled differently from 'Harmonicity_13' /
# 'Harmonicity_14'; it is left untouched here because the label is runtime data.
column_names = [
    'Subject_identifier',
    'Jitter_2', 'Jitter_3', 'Jitter_4', 'Jitter_5', 'Jitter_6',
    'Shimmer_7', 'Shimmer_8', 'Shimmer_9', 'Shimmer_10', 'Shimmer_11', 'Shimmer_12',
    'Harmonicity_13', 'Harmonicity_14', 'Harmonocity_15',
    'Pitch_16', 'Pitch_17', 'Pitch_18', 'Pitch_19', 'Pitch_20',
    'Pulse_21', 'Pulse_22', 'Pulse_23', 'Pulse_24',
    'Voice_25', 'Voice_26', 'Voice_27',
    'UPDRS', 'PD_indicator',
]

# Load the data file, labelling its columns with the list above.
df = pd.read_csv("po1_data.txt", names=column_names)


In [3]:
# feature engineering
def _add_feature_after(anchor, name, values):
    """Store `values` in `df` as column `name`, placed right after `anchor`.

    Parameters
    ----------
    anchor : str
        Existing column of `df`; the new column is inserted directly after it.
    name : str
        Label of the engineered column.
    values : pandas.Series
        Per-row values, aligned with the index of `df`.

    The helper is safe to call again when this cell is re-run: an existing
    column is refreshed in place instead of `df.insert` raising on a duplicate,
    and `name` is registered in `column_names` only once.
    """
    # The row-wise loops this replaces produced float values for every
    # feature; the cast keeps the resulting dtype the same even when both
    # input columns are integer typed.
    values = values.astype("float64")
    if name in df.columns:
        df[name] = values
    else:
        df.insert(df.columns.get_loc(anchor) + 1, name, values)
    if name not in column_names:
        column_names.append(name)


# Pitch range: Pitch_20 (the maximum pitch) minus Pitch_19
# (presumably the minimum pitch -- confirm against the data dictionary).
# NOTE(review): the label 'Pitch Range' contains a space, unlike the other
# engineered columns; kept as-is because later cells look columns up by label.
pitch_range = df['Pitch_20'] - df['Pitch_19']
_add_feature_after('Pitch_20', 'Pitch Range', pitch_range)

# Voice break impact: (100 - Voice_25) * Voice_26, placed after Voice_27.
voice_break_impact = (100 - df['Voice_25']) * df['Voice_26']
_add_feature_after('Voice_27', 'Voice_Break_Impact', voice_break_impact)

# Pulse period deviation: Pulse_21 - Pulse_22, placed after Pulse_24.
pulse_period_deviation = df['Pulse_21'] - df['Pulse_22']
_add_feature_after('Pulse_24', 'Pulse_Period_Deviation', pulse_period_deviation)


In [7]:
# print information of df
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Subject_identifier      1040 non-null   int64  
 1   Jitter_2                1040 non-null   float64
 2   Jitter_3                1040 non-null   float64
 3   Jitter_4                1040 non-null   float64
 4   Jitter_5                1040 non-null   float64
 5   Jitter_6                1040 non-null   float64
 6   Shimmer_7               1040 non-null   float64
 7   Shimmer_8               1040 non-null   float64
 8   Shimmer_9               1040 non-null   float64
 9   Shimmer_10              1040 non-null   float64
 10  Shimmer_11              1040 non-null   float64
 11  Shimmer_12              1040 non-null   float64
 12  Harmonicity_13          1040 non-null   float64
 13  Harmonicity_14          1040 non-null   float64
 14  Harmonocity_15          1040 non-null   

In [9]:
# Collapse the recordings to one row per subject: every remaining column
# (including UPDRS and PD_indicator) is replaced by its per-subject mean,
# and Subject_identifier becomes the index of the result.
grouped_df = (
    df
    .groupby(by='Subject_identifier')
    .mean()
)
print(grouped_df)

In [11]:
# Partition the per-subject averages on the averaged PD_indicator value:
# rows equal to 1 form the PPD group, rows equal to 0 the non_PPD group.
PPD = grouped_df.loc[grouped_df['PD_indicator'] == 1]
non_PPD = grouped_df.loc[grouped_df['PD_indicator'] == 0]


# Preview the first rows of both groups, separated by one blank line.
print(PPD.head(), non_PPD.head(), sep="\n\n")


                    Jitter_2  Jitter_3  Jitter_4  Jitter_5  Jitter_6  \
Subject_identifier                                                     
1                   2.319462  0.000115  1.108269  1.262692  3.324808   
2                   2.688038  0.000215  1.274769  1.453346  3.824231   
3                   3.006423  0.000131  1.631077  1.689115  4.892885   
4                   1.545038  0.000063  0.806769  0.828808  2.419923   
5                   2.749600  0.000250  1.234960  1.275200  3.704840   

                    Shimmer_7  Shimmer_8  Shimmer_9  Shimmer_10  Shimmer_11  \
Subject_identifier                                                            
1                   10.548308   1.067231   4.189692    6.317538   11.896231   
2                   12.371192   1.182192   5.537654    7.136808   12.699423   
3                   17.155731   1.494769   8.115385   10.962200   15.306125   
4                   10.647423   1.015308   5.114423    6.672269    9.102962   
5                   1

In [13]:
# State shared with the per-feature significance test in the cells below.
# Sample holders for the two groups; they start out as empty lists.
PPD_samples, non_PPD_samples = [], []
# Features found to be salient so far, and how many of them there are.
salient_features, count = [], 0
# Smallest p-value seen so far and the feature it belongs to.
# NOTE(review): `min` shadows the builtin of the same name; the name is kept
# because other cells read and update it.
min, min_variable = 0.5, None


In [15]:
# Test every feature for a difference in means between the PPD and non_PPD
# groups (t-test from summary statistics, unequal variances, two-sided).
#
# The accumulators are reset here so the cell is safe to re-run: without the
# reset a second run would append every salient feature again and inflate
# `count`.
# NOTE(review): `min` shadows the builtin; the name is kept because other
# cells use it. 0.5 is above the 0.05 threshold, so the first salient p-value
# always replaces it.
# NOTE(review): each feature is tested at alpha = 0.05 with no correction for
# multiple comparisons -- confirm that this is intended.
salient_features = []
count = 0
min = 0.5
min_variable = None

# loop through each feature (column) except "subject identifier", "PD indicator" and "UPDRS"
for column in column_names:
    if column in ("Subject_identifier", "PD_indicator", "UPDRS"):
        continue
    PPD_samples = PPD[column].to_numpy()
    non_PPD_samples = non_PPD[column].to_numpy()

    # calculate means of the feature of 2 groups
    PPD_samples_mean = st.tmean(PPD_samples)
    non_PPD_samples_mean = st.tmean(non_PPD_samples)

    # calculate standard deviation values of the feature of 2 groups
    PPD_samples_std = st.tstd(PPD_samples)
    non_PPD_samples_std = st.tstd(non_PPD_samples)

    # calculate size of the feature of 2 groups
    PPD_samples_size = len(PPD_samples)
    non_PPD_samples_size = len(non_PPD_samples)

    # calculate p_value for this feature from 2 groups
    t_stats, p_value = st.ttest_ind_from_stats(
        PPD_samples_mean, PPD_samples_std, PPD_samples_size,
        non_PPD_samples_mean, non_PPD_samples_std, non_PPD_samples_size,
        equal_var=False, alternative='two-sided',
    )

    # p < 0.05: reject the null hypothesis, the feature is salient
    if p_value < 0.05:
        # remember the feature with the smallest p-value seen so far
        if p_value < min:
            min = p_value
            min_variable = column
        print(f"Column {column}: p-value = {p_value}")
        print(f"PPD: {PPD_samples_mean} and non_PPD: {non_PPD_samples_mean}\n")
        salient_features.append(column)
        count += 1
    

Column Voice_26: p-value = 0.01040110175348114
PPD: 1.0057692307692307 and non_PPD: 1.2634615384615384

Column Voice_27: p-value = 0.0030743687317130015
PPD: 10.458273076923076 and non_PPD: 14.28181153846154

Column Voice_Break_Impact: p-value = 0.02811659730772488
PPD: 64.3948076923077 and non_PPD: 77.53358076923078



In [None]:
# Report how many salient features the per-feature test found, and list them.
print(f"Number of salient features: {count}")
if count != 0:
    print("Set of salient acoustic features:")
    for feature in salient_features:
        print(feature)
else:
    print("There's no salient acoustic features from the dataset to distinguish between PPD and non PPD")

print()

# The most significant feature is the one with the smallest p-value.
# `min` holds that smallest p-value; `p_value` is only whatever the last loop
# iteration left behind, so it must not be reported here.
if min_variable is not None:
    print(f"The most significant feature: {min_variable} with p_value = {min}")
    