In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
import csv

# Load the data from a csv
The CSV has two levels of column names; the top level groups the features by extraction method.
Although the file can be read directly into a DataFrame with a two-row header, that produces a lot of "Unnamed" columns, because the group name only appears above the first column of each group.
So the first two lines are read from the CSV separately, and the `fill_empty_with_previous` function is used to build the (group, feature) tuples that pandas needs to create a multi-level column index.

https://archive.ics.uci.edu/dataset/470/parkinson+s+disease+classification
https://www.sciencedirect.com/science/article/pii/S1568494618305799

In [3]:
def fill_empty_with_previous(lst: list[str], skip_last: bool = True) -> list[str]:
    """Forward-fill empty strings in a list with the last non-empty value.

    The list is modified in place and the same list object is returned.
    Empty strings that appear before any non-empty value stay empty.

    Args:
        lst (list[str]): A list of strings where some elements may be empty strings ('').
        skip_last (bool): When True (the default), the final element is never
            touched, even if it is empty. For the pd_speech_features header row
            this keeps the trailing class column from being labelled with the
            preceding feature group. Pass False to fill every element.

    Returns:
        list[str]: ``lst`` itself, with empty strings replaced by the last
        non-empty value seen before them.

    Example:
        fill_empty_with_previous(['a', '', 'b', '', '']) -> ['a', 'a', 'b', 'b', '']
        fill_empty_with_previous(['a', '', 'b', '', ''], skip_last=False) -> ['a', 'a', 'b', 'b', 'b']
    """
    last_value = ''  # Most recent non-empty string seen so far

    # Stop one short of the end when the last element must be left alone.
    # For an empty list this yields an empty range, so nothing happens.
    stop = len(lst) - 1 if skip_last else len(lst)

    for index in range(stop):
        item = lst[index]
        if item != '':
            last_value = item
        else:
            # Replace the empty string with the last non-empty value
            lst[index] = last_value

    return lst

In [2]:
# Path to the dataset; used both for the manual header read and for read_csv.
csv_path = 'pd_speech_features.csv'

# Read only the two header rows by hand. Row 1 holds the feature-group names
# (filled in only above the first column of each group); row 2 holds the
# individual column names.
with open(csv_path, newline='') as csvfile:
    header_reader = csv.reader(csvfile, delimiter=',')
    feature_set = next(header_reader)
    features = next(header_reader)

# Propagate each group name across the blank cells that follow it, then pair
# every group name with its column name to form the two-level labels.
feature_set = fill_empty_with_previous(feature_set)
columns = list(zip(feature_set, features))

# header=1 makes pandas take the second line as the header and skip the first;
# the labels it derives are then replaced by the two-level index built above.
speech = pd.read_csv(csv_path, header=1)
speech.columns = pd.MultiIndex.from_tuples(columns)
speech

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Baseline Features,Baseline Features,Baseline Features,Baseline Features,Baseline Features,Baseline Features,Baseline Features,Baseline Features,...,TQWT Features,TQWT Features,TQWT Features,TQWT Features,TQWT Features,TQWT Features,TQWT Features,TQWT Features,TQWT Features,Unnamed: 21_level_0
Unnamed: 0_level_1,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,0.000087,0.00218,...,1.5620,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,0.000073,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.1780,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.008340,0.000060,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.8460,6.2650,4.0603,1
4,1,0,0.32790,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,250,0,0.80903,0.56355,0.28385,417,416,0.004627,0.000052,0.00064,...,3.0706,3.0190,3.1212,2.4921,3.5844,3.5400,3.3805,3.2003,6.8671,0
752,250,0,0.16084,0.56499,0.59194,415,413,0.004550,0.000220,0.00143,...,1.9704,1.7451,1.8277,2.4976,5.2981,4.2616,6.3042,10.9058,28.4170,0
753,251,0,0.88389,0.72335,0.46815,381,380,0.005069,0.000103,0.00076,...,51.5607,44.4641,26.1586,6.3076,2.8601,2.5361,3.5377,3.3545,5.0424,0
754,251,0,0.83782,0.74890,0.49823,340,339,0.005679,0.000055,0.00092,...,19.1607,12.8312,8.9434,2.2044,1.9496,1.9664,2.6801,2.8332,3.7131,0


In [4]:
# Full column labels of `speech`. When the frame has two-level columns these
# are (group, name) tuples rather than plain strings.
column_list = list(speech.columns)

# Columns that are identifiers or the target, not input features.
non_feature_names = {'id', 'gender', 'class'}

# Compare against the innermost level of the column index so the filter works
# on the plain names. Calling list.remove('id') on a list of (group, name)
# tuples would raise ValueError because no element equals the bare string.
features = [
    name
    for name in speech.columns.get_level_values(-1)
    if name not in non_feature_names
]
features

['PPE',
 'DFA',
 'RPDE',
 'numPulses',
 'numPeriodsPulses',
 'meanPeriodPulses',
 'stdDevPeriodPulses',
 'locPctJitter',
 'locAbsJitter',
 'rapJitter',
 'ppq5Jitter',
 'ddpJitter',
 'locShimmer',
 'locDbShimmer',
 'apq3Shimmer',
 'apq5Shimmer',
 'apq11Shimmer',
 'ddaShimmer',
 'meanAutoCorrHarmonicity',
 'meanNoiseToHarmHarmonicity',
 'meanHarmToNoiseHarmonicity',
 'minIntensity',
 'maxIntensity',
 'meanIntensity',
 'f1',
 'f2',
 'f3',
 'f4',
 'b1',
 'b2',
 'b3',
 'b4',
 'GQ_prc5_95',
 'GQ_std_cycle_open',
 'GQ_std_cycle_closed',
 'GNE_mean',
 'GNE_std',
 'GNE_SNR_TKEO',
 'GNE_SNR_SEO',
 'GNE_NSR_TKEO',
 'GNE_NSR_SEO',
 'VFER_mean',
 'VFER_std',
 'VFER_entropy',
 'VFER_SNR_TKEO',
 'VFER_SNR_SEO',
 'VFER_NSR_TKEO',
 'VFER_NSR_SEO',
 'IMF_SNR_SEO',
 'IMF_SNR_TKEO',
 'IMF_SNR_entropy',
 'IMF_NSR_SEO',
 'IMF_NSR_TKEO',
 'IMF_NSR_entropy',
 'mean_Log_energy',
 'mean_MFCC_0th_coef',
 'mean_MFCC_1st_coef',
 'mean_MFCC_2nd_coef',
 'mean_MFCC_3rd_coef',
 'mean_MFCC_4th_coef',
 'mean_MFCC_5th_co

- Check whether the values are roughly the same before taking the mean
- PCA
- Heat map (Euclidean distance)

think about imbalance

- Check for all data errors in code and show them
