# Parkinson's Disease detection with energy audio features using K-Nearest Neighbors coded without libraries.

# 1. Import the dependencies.

In [None]:
import numpy as np
from typing import Tuple
import os
import random
import librosa
import math

# 2. Load the audios with Librosa, and preprocess them using Peak Amplitude Normalization.
- load_audio will return all the audios with their sampling rate.
- to_positive will turn all values into positive numbers, in place (audio is 1D).
- get_max will return the maximum value in each audio array.
- peak_amplitude_normalize will preprocess audios with this formula 10 ** (peak / 20) / maximum_value.
- hamming_window builds a window of frame_length samples using the formula 0.54 - 0.46 * math.cos((2 * math.pi * n) / (frame_length - 1)) for n = 0, ..., frame_length - 1, and returns that window.
- frame_audio_signal uses hamming_window on frames of audios and returns framed audios.

In [None]:
def load_audio(file_path):
    """Load one audio file through librosa, requesting a 44100 Hz sampling rate.

    Returns the ``(audio, sampling_rate)`` pair produced by ``librosa.load``.
    """
    return librosa.load(file_path, sr=44100)

def to_positive(n_array):
    """Replace every negative entry of ``n_array`` with its magnitude.

    The sequence is modified in place and the very same object is returned.
    """
    for position, value in enumerate(n_array):
        if value < 0:
            n_array[position] = -1 * value
    return n_array

def get_max(n_array):
    """Return the largest element of a non-empty sequence using a linear scan.

    The first element is the initial candidate; a later element replaces it
    only when it is strictly greater.
    """
    largest = n_array[0]
    for candidate in n_array[1:]:
        if candidate > largest:
            largest = candidate
    return largest

def peak_amplitude_normalize(audio_data, peak=-3.0):
    """Scale a signal so that its largest absolute sample sits at ``peak`` dB.

    The scaling factor is ``10 ** (peak / 20) / max(|audio_data|)`` and the
    whole signal is multiplied by it.

    Args:
        audio_data: 1-D sequence of audio samples.
        peak: Target peak level in decibels (default -3 dB).

    Returns:
        A new array holding the scaled samples. The input is left untouched
        and the sign of every sample is preserved.
    """
    samples = np.asarray(audio_data)
    # Nothing to scale: an empty signal has no peak.
    if samples.size == 0:
        return samples.copy()
    # Take magnitudes on a temporary so the caller's waveform is not rectified.
    maximum_value = np.max(np.abs(samples))
    # A silent (all-zero) signal cannot be normalized; return it unchanged
    # instead of dividing by zero.
    if maximum_value == 0:
        return samples.copy()
    scaling = 10 ** (peak / 20) / maximum_value
    return samples * scaling

def hamming_window(frame_length):
    """Build a Hamming window of ``frame_length`` samples.

    Each coefficient is ``0.54 - 0.46 * cos(2 * pi * n / (frame_length - 1))``
    for ``n = 0 .. frame_length - 1``.

    Args:
        frame_length: Number of samples in the window.

    Returns:
        A 1-D float array of length ``frame_length``.
    """
    # With a single sample the denominator (frame_length - 1) would be zero;
    # a one-point window is simply [1.0]. A length of 0 gives an empty window.
    if frame_length <= 1:
        return np.ones(frame_length)
    positions = np.arange(frame_length)
    return 0.54 - 0.46 * np.cos((2 * np.pi * positions) / (frame_length - 1))

# Function to frame the signal using a Hamming window
def frame_audio_signal(audio_data, frame_length):
    """Cut a signal into consecutive non-overlapping windowed frames.

    Args:
        audio_data: 1-D array of audio samples.
        frame_length: Number of samples per frame.

    Returns:
        A float array of shape ``(frame_length, number_frames)`` whose column
        ``i`` is frame ``i`` multiplied by the Hamming window. Trailing samples
        that do not fill a whole frame are dropped.
    """
    number_frames = len(audio_data) // frame_length
    # The window depends only on frame_length, so build it once instead of
    # recomputing it for every frame.
    window = hamming_window(frame_length)
    framed_audio = np.zeros((frame_length, number_frames))
    for i in range(number_frames):
        start = i * frame_length
        framed_audio[:, i] = audio_data[start:start + frame_length] * window
    return framed_audio

# 3. Extract energy audio features.
- extract_energy_frames takes sum of squared frames in each audio.
- energy_features uses extract_energy_frames to extract features for all audios.

In [None]:
# Function to extract energy using hamming window and frame the signal
def extract_energy_frames(audio_data, frame_length):
    """Return the energy of each frame of ``audio_data``.

    The signal is framed by ``frame_audio_signal`` and the energy of a frame
    is the sum of its squared samples, giving one value per frame.
    """
    frames = frame_audio_signal(audio_data, frame_length)
    squared = frames ** 2
    return squared.sum(axis=0)

def energy_features(directory='/content/drive/MyDrive/HYP TRAIN DATA/', frame_time=5) -> Tuple:
    """Load every ``.wav`` file in ``directory`` and compute its energy features.

    Each file is loaded, peak-normalized and reduced to one energy value per
    frame. The label is taken from the file name: a name containing ``"P"``
    gets status 1, a name containing ``"C"`` gets status 0.

    Args:
        directory: Folder holding the ``.wav`` recordings.
        frame_time: Duration of each frame in seconds.

    Returns:
        ``(feats, status)`` as arrays, where ``status[i]`` is the label of
        ``feats[i]``.

    Raises:
        ValueError: If a file name contains both or neither of the markers,
            because its label cannot be determined.
    """
    feats = []
    status = []
    # os.listdir gives no ordering guarantee; sorting keeps the sample order
    # (and therefore any seeded train/test split) reproducible.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.wav'):
            continue
        is_patient = "P" in file_name
        is_control = "C" in file_name
        # Exactly one marker must be present. Otherwise the labels would go out
        # of step with the features (zero or two labels for one recording).
        if is_patient == is_control:
            raise ValueError(
                "Cannot derive a single status from file name: " + file_name
            )
        file_path = os.path.join(directory, file_name)
        audio_data, frame_rate = load_audio(file_path)
        frame_size = int(frame_time * frame_rate)  # Number of samples in each frame
        preprocessed_audio = peak_amplitude_normalize(audio_data)
        energy_feat = extract_energy_frames(audio_data=preprocessed_audio, frame_length=frame_size)
        feats.append(energy_feat)
        status.append(1 if is_patient else 0)
    # NOTE(review): np.array(feats) assumes every recording yields the same
    # number of frames (i.e. recordings of comparable length) - confirm.
    feats = np.array(feats)
    status = np.array(status)
    return feats, status

# 4. K-Nearest Neighbor class with data members k_nearest, x_train, and y_train.
- K-Nearest Neighbor computes the distance from each point in the testing data to every data point in the training set, and then holds a majority vote among the k_nearest closest training points.

In [None]:
class KayNearestNeighbour:
    """K-nearest-neighbours classifier for binary labels (0 and 1).

    The model stores the training points and labels. A prediction is the
    majority label among the ``k_nearest`` training points closest to the
    query, using Euclidean distance.
    """

    def __init__(self, k_nearest, x_train_data, y_train_data):  # Constructor
        """Store the neighbour count and the training data."""
        self._k_nearest = k_nearest
        self._x_train = x_train_data
        self._y_train = y_train_data

    def set_k_nearest(self, k_nearest):  # Set number of k near points
        """Change how many neighbours take part in the vote."""
        self._k_nearest = k_nearest

    def get_k_nearest(self) -> int:  # Get the number of k near points
        """Return how many neighbours take part in the vote."""
        return self._k_nearest

    def distance(self, data_point1, data_point2) -> float:  # Compute the distance between two data points
        """Return the Euclidean distance between two data points."""
        distance = np.sqrt(np.sum((data_point1 - data_point2) ** 2))  # Euclidean distance
        # distance = np.sum(np.abs(data_point1 - data_point2))  # Manhattan distance
        # distance = (np.sum(np.abs(data_point1 - data_point2) ** 1)) ** (1 / 1)  # Minkowski distance
        return distance

    def selection_sort(self, the_list):  # To sort the distances according to their increasing distances
        """Sort ``the_list`` in place in ascending order and return it."""
        for n in range(len(the_list)):
            min_index = n
            for m in range(n + 1, len(the_list)):
                if the_list[m] < the_list[min_index]:
                    min_index = m
            the_list[n], the_list[min_index] = the_list[min_index], the_list[n]
        return the_list  # Returning the sorted list in ascending order

    def data_fitting(self, data_points, y_labels):  # Fit the data
        """Replace the stored training points and labels."""
        self._x_train = data_points
        self._y_train = y_labels

    def predict(self, data_points):  # Predict for each given data point a label, using the computed distances
        """Return an array with one estimated label per row of ``data_points``."""
        return np.array([self.estimate(data_point) for data_point in data_points])

    def estimate(self, data_point):
        """Estimate the label of one data point by majority vote.

        Distances to every training point are computed, the training indices
        are ordered by increasing distance, and the labels of the closest
        ``k_nearest`` points are counted. Label 1 wins only with a strict
        majority; ties and all other cases give 0.

        Raises:
            ValueError: If there is no training data.
        """
        labels = list(self._y_train)
        computed_distances = [
            self.distance(data_point, data_point_train)
            for data_point_train in self._x_train
        ]
        if not computed_distances:
            raise ValueError("cannot estimate a label without training data")

        # Order the *indices* by distance so each distance stays paired with
        # the label of the training point it was computed from.
        nearest_first = sorted(
            range(len(computed_distances)), key=computed_distances.__getitem__
        )
        # Never ask for more neighbours than there are training points.
        neighbour_count = min(self.get_k_nearest(), len(nearest_first))
        closest_distance_labels = [
            labels[index] for index in nearest_first[:neighbour_count]
        ]

        one_status = sum(1 for label in closest_distance_labels if label == 1)
        zero_status = sum(1 for label in closest_distance_labels if label == 0)
        return 1 if one_status > zero_status else 0

# 5. Standard scaler and splitting of the data.
- standard_scaler uses means and std deviations to standardize the features.
- split splits the data into training and testing sets.

In [None]:
def standard_scaler(features):
    """Standardize each feature column to zero mean and unit variance.

    Args:
        features: Array with one sample per row and one feature per column.

    Returns:
        An array of the same shape holding ``(features - mean) / std`` computed
        column-wise. A column with zero standard deviation (constant feature)
        is only centred, so it becomes all zeros instead of NaN/inf.
    """
    features = np.asarray(features)
    # Computation of mean and standard deviation
    means = features.mean(axis=0)
    std_deviations = features.std(axis=0)
    # Dividing by 1 for constant columns avoids 0/0 while leaving them centred.
    safe_std_deviations = np.where(std_deviations == 0, 1.0, std_deviations)
    # Standardizing the features
    return np.array((features - means) / safe_std_deviations)

def _random_indexes(array, size, random_state):
    """Draw distinct random indexes in ``range(array)`` for the test set.

    ``array`` is the number of available samples and ``size`` the number of
    indexes wanted. The module-level ``random`` generator is seeded with
    ``random_state``; duplicates are rejected and redrawn. At least one index
    is always drawn.

    Raises:
        ValueError: If ``size`` is larger than ``array``.
    """
    if size > array:
        raise ValueError(f"{size!s} features can't be chosen out of {array!s}")
    random.seed(random_state)
    chosen = [random.randrange(0, array, 1)]
    while len(chosen) < size:
        candidate = random.randrange(0, array, 1)
        if candidate not in chosen:
            chosen.append(candidate)
    return np.array(chosen)

def split(features, targets, test_size, random_state=42):
    """Split features and targets into training and testing arrays.

    The test set holds ``int(test_size * len(targets)) + 1`` samples picked by
    ``_random_indexes`` with the given ``random_state``; every other sample,
    in its original order, goes to the training set.

    Returns:
        ``(x_training, x_testing, y_training, y_testing)`` as arrays.
    """
    number_of_samples = len(targets)
    t_size = int(test_size * number_of_samples) + 1
    random_indexes = _random_indexes(number_of_samples, t_size, random_state)

    rows = list(features)
    labels = list(targets)

    # Test samples follow the order in which their indexes were drawn
    x_testing = [rows[index] for index in random_indexes]
    y_testing = [labels[index] for index in random_indexes]

    # Everything that was not drawn stays in the training set
    kept = [position for position in range(len(rows)) if position not in random_indexes]
    x_training = [rows[position] for position in kept]
    y_training = [labels[position] for position in kept]

    return np.array(x_training), np.array(x_testing), np.array(y_training), np.array(y_testing)

# 6. Metrics
- _confusion_matrix computes true positives, false positives, true negatives, and false negatives.
- accuracy_score, precision_score, recall_score, f1_score and confusion_matrix compute the accuracy, precision, recall, and F1 scores and the confusion matrix.

In [None]:
def _confusion_matrix(y_testing, y_prediction):
    """Count the four outcomes of a binary classification.

    Args:
        y_testing: True labels (0 or 1).
        y_prediction: Predicted labels, aligned with ``y_testing``.

    Returns:
        ``(true_positive, false_positive, true_negative, false_negative)``.
        Samples whose true label is neither 0 nor 1 are not counted.
    """
    true_positive, false_positive, true_negative, false_negative = 0, 0, 0, 0
    for i, actual in enumerate(y_testing):
        predicted = y_prediction[i]
        if actual == 1:
            if actual == predicted:
                true_positive += 1
            else:
                # An actual positive that was missed is a false NEGATIVE.
                false_negative += 1
        if actual == 0:
            if actual == predicted:
                true_negative += 1
            else:
                # An actual negative flagged as positive is a false POSITIVE.
                false_positive += 1
    return true_positive, false_positive, true_negative, false_negative

def accuracy_score(y_testing, y_preds):
    """Return the fraction of counted samples that were classified correctly.

    Computed as ``(tp + tn) / (tp + fp + tn + fn)`` from the counts returned by
    ``_confusion_matrix``. Returns 0.0 when no sample was counted, mirroring
    the zero-denominator handling of ``recall_score``.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    total = tp + fp + tn + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total

def precision_score(y_testing, y_preds):
    """Return ``tp / (tp + fp)`` from the counts of ``_confusion_matrix``.

    Returns 0.0 when ``tp + fp`` is zero instead of raising
    ZeroDivisionError, mirroring the zero-denominator handling of
    ``recall_score``.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    tp_fp = tp + fp
    if tp_fp == 0:
        return 0.0
    return tp / tp_fp

def recall_score(y_testing, y_preds):
    """Return ``tp / (tp + fn)`` from the counts of ``_confusion_matrix``.

    Returns 0.0 when ``tp + fn`` is zero.
    """
    tp, _, _, fn = _confusion_matrix(y_testing, y_preds)
    denominator = tp + fn
    return 0.0 if denominator == 0 else tp / denominator

def f1_score(y_testing, y_preds):
    """Return the harmonic mean of ``precision_score`` and ``recall_score``.

    Returns 0.0 when precision and recall sum to zero.
    """
    precision = precision_score(y_testing, y_preds)
    recall = recall_score(y_testing, y_preds)
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

def confusion_matrix(y_testing, y_preds):
    """Return the counts of ``_confusion_matrix`` as ``[[tp, fp], [fn, tn]]``."""
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    return [[tp, fp], [fn, tn]]

# 7. Main function.

In [None]:
if __name__ == "__main__":
    # Build the dataset: energy features per recording and their status (0 or 1)
    energy_feats, status = energy_features()
    features = standard_scaler(energy_feats)
    x_train, x_valid, y_train, y_valid = split(features, status, test_size=0.20)

    # Store the training data in a 5-nearest-neighbour model and label the validation rows
    knn = KayNearestNeighbour(5, x_train, y_train)
    knn.data_fitting(x_train, y_train)
    predictions = knn.predict(x_valid)

    # Metrics on the validation data (all computed before anything is printed)
    accuracy = accuracy_score(y_valid, predictions)
    precision = precision_score(y_valid, predictions)
    recall = recall_score(y_valid, predictions)
    f1 = f1_score(y_valid, predictions)
    confusion_mat = confusion_matrix(y_valid, predictions)

    report = (
        ('Accuracy score on the validation data: ', accuracy),
        ('Precision score on the validation data: ', precision),
        ('Recall score on the validation data: ', recall),
        ('F1 score on the validation data: ', f1),
        ('The confusion matrix on the validation data: ', confusion_mat),
    )
    for caption, value in report:
        print(caption, value)
    print('\n')

Accuracy score on the validation data:  0.5833333333333334
Precision score on the validation data:  0.0
Recall score on the validation data:  0.0
F1 score on the validation data:  0.0
The confusion matrix on the validation data:  [[0, 3], [2, 7]]


