In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from tqdm import trange
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv("parkinsons.data")

Attribute Information

- name - ASCII subject name and recording number
- MDVP:Fo(Hz) - Average vocal fundamental frequency
- MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
- MDVP:Flo(Hz) - Minimum vocal fundamental frequency
- MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several 
- measures of variation in fundamental frequency
- MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
- NHR,HNR - Two measures of ratio of noise to tonal components in the voice
- status - Health status of the subject (one) - Parkinson's, (zero) - healthy
- RPDE,D2 - Two nonlinear dynamical complexity measures
- DFA - Signal fractal scaling exponent
- spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation 

In [4]:
data_samp = data.iloc[:, :-6]

In [5]:
# Parkinsons = 1, No Parkinsons = 0
data_samp['status'].value_counts()

status
1    147
0     48
Name: count, dtype: int64

In [6]:
park = data_samp[data_samp['status'] == 1]
no_park = data_samp[data_samp['status'] == 0]

In [7]:
park.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,0.017676,0.020285,0.0276,0.053027,0.029211,20.974048,1.0
std,32.34805,88.33918,32.274358,0.00524,3.7e-05,0.003241,0.002998,0.009724,0.01997,0.207798,0.010797,0.012847,0.018062,0.032391,0.044447,4.339143,0.0
min,88.333,102.145,65.476,0.00168,1e-05,0.00068,0.00092,0.00204,0.01022,0.09,0.00455,0.0057,0.00811,0.01364,0.00231,8.441,1.0
25%,117.572,133.7765,80.8755,0.004005,3e-05,0.00203,0.00219,0.006085,0.018295,0.168,0.009135,0.010575,0.015555,0.0274,0.008445,18.782,1.0
50%,145.174,163.335,99.77,0.00544,4e-05,0.00284,0.00314,0.00853,0.02838,0.263,0.01484,0.0165,0.02157,0.04451,0.01658,21.414,1.0
75%,170.071,207.1605,129.24,0.00767,6e-05,0.0041,0.00436,0.0123,0.042525,0.3945,0.022815,0.024935,0.034825,0.068455,0.02796,24.1645,1.0
max,223.361,588.518,199.02,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,0.05647,0.0794,0.13778,0.16942,0.31482,29.928,1.0


In [8]:
no_park.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,0.009504,0.010509,0.013305,0.028511,0.011483,24.67875,0.0
std,52.731067,96.727067,58.75707,0.002055,1.5e-05,0.001066,0.000943,0.003199,0.005544,0.057822,0.003456,0.003231,0.003825,0.010368,0.019088,3.43454,0.0
min,110.739,113.597,74.287,0.00178,7e-06,0.00092,0.00106,0.00276,0.00954,0.085,0.00468,0.00606,0.00719,0.01403,0.00065,17.883,0.0
25%,120.9475,139.41325,98.24375,0.002655,1e-05,0.001332,0.00148,0.003998,0.014475,0.129,0.00735,0.008193,0.011235,0.02206,0.004188,22.99325,0.0
50%,198.996,231.1615,113.9385,0.003355,2.5e-05,0.001625,0.001775,0.004875,0.016705,0.154,0.008775,0.010225,0.013015,0.02633,0.004825,24.997,0.0
75%,229.077,251.23925,199.183,0.00453,3e-05,0.001908,0.002228,0.005725,0.02021,0.18925,0.011512,0.01198,0.015945,0.03454,0.009213,26.13925,0.0
max,260.105,592.03,239.17,0.0136,8e-05,0.00624,0.00564,0.01873,0.04087,0.405,0.02336,0.02498,0.02745,0.07008,0.10715,33.047,0.0


Based on comparing the patients with parkinsons to without from the two statistic breakdowns above it looks like the
important features will be the ones based on measuring the variations in frequency and amplitude. I think that just vocal
fundamental frequency alone varies too much according to peoples voices, however, it still could be useful as a tiebreaker
for the model perhaps.

In [9]:
# Ran this cell a few times to analyze some samples, pretty difficult to pick up on any patterns
data_samp.sample(n=10)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1
107,phon_R01_S25_5,151.884,157.765,133.751,0.00258,2e-05,0.00115,0.00122,0.00346,0.0145,0.137,0.00633,0.00788,0.01267,0.01898,0.00659,26.833,1
20,phon_R01_S05_3,153.848,165.738,65.782,0.0084,5e-05,0.00428,0.0045,0.01285,0.0381,0.328,0.01667,0.02383,0.04055,0.05,0.03871,17.536,1
21,phon_R01_S05_4,153.88,172.86,78.128,0.0048,3e-05,0.00232,0.00267,0.00696,0.04137,0.37,0.02021,0.02591,0.04525,0.06062,0.01849,19.493,1
141,phon_R01_S34_2,208.083,253.792,91.802,0.00757,4e-05,0.00428,0.00428,0.01285,0.06725,0.571,0.04016,0.04003,0.04464,0.12047,0.04238,15.648,1
158,phon_R01_S37_6,126.144,154.284,97.543,0.00975,8e-05,0.00593,0.00454,0.01778,0.02852,0.266,0.015,0.01581,0.02157,0.04499,0.03828,21.534,1
34,phon_R01_S07_5,203.184,211.526,196.16,0.00178,9e-06,0.00094,0.00106,0.00283,0.00958,0.085,0.00468,0.0061,0.00726,0.01403,0.00065,33.047,0
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,3e-05,0.00331,0.00292,0.00994,0.02751,0.263,0.01604,0.01657,0.01879,0.04812,0.0181,19.147,0
162,phon_R01_S39_4,114.554,126.778,91.121,0.00651,6e-05,0.00366,0.0034,0.01097,0.03658,0.369,0.01864,0.0194,0.03091,0.05592,0.02707,18.954,1
9,phon_R01_S02_4,95.056,120.103,91.226,0.00532,6e-05,0.00268,0.00332,0.00803,0.02838,0.255,0.01441,0.01725,0.02444,0.04324,0.01022,21.862,1


In [10]:
# Removing data in the dataset that doesn't seem to contribute to the task at hand
# Also because I don't understand what the values are representing
labels = data.iloc[:, -7]
data = data.iloc[:, :-7]

In [11]:
data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649


In [12]:
# Not going to normalize the data because I want to maintain the relationships across features.
# i.e. don't want Fhi to end up being lower than Fo after normalizing against each column
data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,0.015664,0.017878,0.024081,0.046993,0.024847,21.885974
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,0.010153,0.012024,0.016947,0.030459,0.040418,4.425764
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,0.00455,0.0057,0.00719,0.01364,0.00065,8.441
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,0.008245,0.00958,0.01308,0.024735,0.005925,19.198
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,0.01279,0.01347,0.01826,0.03836,0.01166,22.085
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,0.020265,0.02238,0.0294,0.060795,0.02564,25.0755
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,0.05647,0.0794,0.13778,0.16942,0.31482,33.047


In [13]:
# No missing data
missing = data.isnull().sum()
missing

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
dtype: int64

In [14]:
# Quick random forest classifier for baseline values

# Load the dataset
X = data.iloc[:, 1:].values
y = labels.values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced_subsample")

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.57      0.62         7
           1       0.91      0.94      0.92        32

    accuracy                           0.87        39
   macro avg       0.79      0.75      0.77        39
weighted avg       0.87      0.87      0.87        39



In [15]:
class Net(torch.nn.Module):
    def __init__(self, inputs, n_hidden1, n_hidden2):
        super(Net, self).__init__()
        self.l1 = torch.nn.Linear(inputs, n_hidden1, bias=False)
        self.bn1 = torch.nn.BatchNorm1d(n_hidden1)
        self.relu = torch.nn.ReLU()
        self.l2 = torch.nn.Linear(n_hidden1, n_hidden2, bias=False)
        self.bn2 = torch.nn.BatchNorm1d(n_hidden2)
        self.relu = torch.nn.ReLU()
        self.l3 = torch.nn.Linear(n_hidden2, 1, bias=True)
        
        # Apply Kaiming initialization to the linear layers with ReLU activation
        torch.nn.init.kaiming_normal_(self.l1.weight, nonlinearity='relu')
        torch.nn.init.kaiming_normal_(self.l2.weight, nonlinearity='relu')
        torch.nn.init.xavier_normal_(self.l3.weight, gain=1.0)
        
    def forward(self, x):
        x = self.l1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.l2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.l3(x)
        return x

In [16]:
# Pre-training definitions
inputs = 16
n_hidden1 = 32
n_hidden2 = 8

model = Net(inputs, n_hidden1, n_hidden2)

torch_data = torch.tensor(data.iloc[:, 1:].values, dtype=torch.float32)
torch_labels = torch.tensor(labels.values, dtype=torch.float32).unsqueeze(1)

dataset = TensorDataset(torch_data, torch_labels)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

batch_size = 32
epochs = 500
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

opt = torch.optim.Adam(model.parameters(), lr=0.01)

lossi = []
accuracies = []
# 0.1 200 epochs

model.train()
for i in (t := trange(epochs)):
    for inputs, targets in train_loader:
        
        # forward pass
        outputs = model(inputs)
        loss = F.binary_cross_entropy_with_logits(outputs, targets, reduction='mean', pos_weight=torch.tensor(0.20).float())
        
        # backward pass
        opt.zero_grad()
        loss.backward()
        
        # update step
        opt.step()
        
    lossi.append(loss.item())
    
    with torch.no_grad():
        targs = targets.detach().numpy()
        preds = F.sigmoid(outputs).detach().numpy()
        acc = accuracy_score(targs, (preds > 0.5).astype(int))
        accuracies.append(acc)
    
    # For tqdm
    t.set_description(f"Loss: {loss:.2f} Accuracy: {acc:.2f}")

Loss: 0.06 Accuracy: 0.93: 100%|█████████████████████████████████████████████████████| 500/500 [00:14<00:00, 33.83it/s]


In [18]:
test_losses = []
test_accuracies = []
test_predictions = []
test_targets = []

with torch.no_grad():
    model.eval()
    
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = F.binary_cross_entropy_with_logits(outputs, targets, reduction='mean', pos_weight=torch.tensor(0.20).float())
        
        t_targs = targets.detach().numpy()
        t_preds = F.sigmoid(outputs).detach().numpy()
        test_targets.append(t_targs)
        test_predictions.append(t_preds)
        
        acc = accuracy_score(targs, (preds > 0.5).astype(int))
        test_losses.append(loss.item())
        test_accuracies.append(acc)
        
    test_predictions = np.vstack(test_predictions)
    test_targets = np.vstack(test_targets)
    
    accuracy = accuracy_score(test_targets, (test_predictions > 0.5).astype(int))
    print("Accuracy: ", accuracy)

Accuracy:  0.8205128205128205
