# Parkinson's Disease detection with spectral centroid and spectral rolloff audio features using Random Forest coded without libraries.

# 1. Import the dependencies.

In [None]:
import numpy as np
import random
%run "/content/drive/MyDrive/decisiontree.py"
import librosa
from librosa import feature
import os
from typing import Tuple

# 2. Load the audios with Librosa, and preprocess them using Peak Amplitude Normalization.
- load_audio will return all the audios with their sampling rate.
- to_positive will turn all values into positive numbers (the audio is a 1D array).
- get_max will return the maximum value in each audio array.
- peak_amplitude_normalize will preprocess each audio by multiplying it by the scaling factor 10 ** (peak / 20) / maximum_value, where peak is the target peak level in dB.

In [None]:
def load_audio(file_path) -> Tuple:
    """Load one audio file through Librosa.

    The file is read with ``librosa.load`` using ``sr=44100``.

    Returns:
        The ``(audio, sampling_rate)`` pair produced by ``librosa.load``.
    """
    return librosa.load(file_path, sr=44100)


def to_positive(n_array):
    """Return the absolute value of every element of a 1D sequence.

    The input is left untouched: a new object is returned. Earlier versions
    overwrote the caller's data in place, which silently rectified the
    waveform for every later use of the same array.

    Args:
        n_array: 1D NumPy array or other sequence of numbers.

    Returns:
        A new NumPy array (for array input) or a new list (for any other
        sequence) holding the absolute values in the original order.
    """
    if isinstance(n_array, np.ndarray):
        # np.abs allocates a fresh array, so the caller's samples are preserved.
        return np.abs(n_array)

    return [-value if value < 0 else value for value in n_array]


def get_max(n_array):
    """Return the largest element of a non-empty sequence.

    The first element is the starting candidate; every later element replaces
    it only when strictly greater. Indexing an empty sequence raises
    ``IndexError``.
    """
    largest = n_array[0]
    for candidate in n_array[1:]:
        if candidate > largest:
            largest = candidate

    return largest


def peak_amplitude_normalize(audio_data, peak=-3.0):
    """Scale an audio signal so that its largest absolute sample hits ``peak`` dB.

    The scaling factor is ``10 ** (peak / 20) / max(|audio_data|)`` and the
    whole signal is multiplied by it. The sign of every sample is preserved
    and the input array is not modified.

    Args:
        audio_data: 1D array of audio samples.
        peak: Target peak level in dB (default -3 dB).

    Returns:
        A new array with the normalized samples. Empty or all-zero (silent)
        input is returned as an unscaled copy, because no finite scaling
        factor exists for it.
    """
    samples = np.asarray(audio_data)
    if samples.size == 0:
        return samples.copy()

    # Take the magnitude on a temporary so the original waveform keeps its sign.
    maximum_value = np.max(np.abs(samples))
    if maximum_value == 0:
        # Silent audio: dividing by the peak would produce inf/NaN.
        return samples.copy()

    scaling = 10 ** (peak / 20) / maximum_value
    normalized_audio = samples * scaling
    return normalized_audio

# 3. Extract spectral centroid and spectral rolloff audio features.
- The spectral_centroid_rolloff function extracts both the spectral centroid and the spectral rolloff features for each audio file using the Librosa library.

In [None]:
def spectral_centroid_rolloff(directory='/content/drive/MyDrive/HYP TRAIN DATA/') -> Tuple:
    """Build the feature matrix and label vector from a directory of ``.wav`` files.

    Every ``.wav`` file is loaded, peak-normalized, and reduced to two numbers:
    the mean spectral centroid and the mean spectral rolloff computed by
    Librosa. The label is taken from the file name: a name containing "P" is
    labelled 1 and a name containing "C" is labelled 0.

    Files whose name contains both letters or neither cannot be labelled
    unambiguously. They are skipped with a warning so that the feature rows
    and labels always stay aligned one-to-one.

    Args:
        directory: Folder holding the audio files. Defaults to the original
            dataset location.

    Returns:
        ``(features, status)`` where ``features`` has one
        ``[spectral_centroid, spectral_rolloff]`` row per accepted file and
        ``status`` holds the matching 0/1 labels.
    """
    import warnings

    features = []
    status = []
    # os.listdir order is arbitrary; sorting makes the sample order reproducible.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.wav'):
            continue

        has_p = "P" in file_name
        has_c = "C" in file_name
        if has_p == has_c:
            # Both or neither marker present: one label per feature row is impossible.
            warnings.warn("Skipping '" + file_name + "': cannot derive a single label from its name")
            continue

        file_path = os.path.join(directory, file_name)
        audio, sampling_rate = load_audio(file_path)
        preprocessed_audio = peak_amplitude_normalize(audio)
        spectral_centroid = librosa.feature.spectral_centroid(y=preprocessed_audio, sr=sampling_rate).mean()
        spectral_roll_off = librosa.feature.spectral_rolloff(y=preprocessed_audio, sr=sampling_rate).mean()

        features.append(np.array([spectral_centroid, spectral_roll_off]))
        status.append(1 if has_p else 0)

    features = np.array(features)
    status = np.array(status)
    return features, status

# 4. Class Random Forest takes number_trees, maximum_depth, minimum_samples_split, and num_features as data members.
- Random Forest uses Decision Tree to build more than one tree; each tree makes a prediction, and the final prediction is decided by a majority vote over the trees.

In [None]:
class RandomForest:
    """Random forest classifier built on the external ``DecisionTree`` class.

    Each tree is trained on a bootstrap sample (drawn with replacement, same
    size as the training set) and the forest predicts by majority vote.

    Args:
        number_trees: Number of trees to train.
        maximum_depth: Passed to ``DecisionTree`` as ``max_depth``.
        minimum_samples_split: Passed to ``DecisionTree`` as ``min_samples_split``.
        num_features: Passed to ``DecisionTree`` as ``n_features``.
        random_state: Optional seed for the bootstrap sampling. When ``None``
            (the default) the module-level ``random`` generator is used, as
            before.
    """

    def __init__(self, number_trees=10, maximum_depth=10, minimum_samples_split=2, num_features=None,
                 random_state=None):
        self._number_trees = number_trees
        self._maximum_depth = maximum_depth
        self._minimum_samples_split = minimum_samples_split
        self._num_features = num_features
        self._trees = []
        # A private generator keeps a seeded forest independent of global random state.
        self._rng = random if random_state is None else random.Random(random_state)

    def fit(self, x, y):
        """Train ``number_trees`` trees, each on its own bootstrap sample of ``(x, y)``."""
        self._trees = []
        for _ in range(self._number_trees):
            tree = DecisionTree(max_depth=self._maximum_depth, min_samples_split=self._minimum_samples_split,
                                n_features=self._num_features)
            x_sample, y_sample = self._random_samples(x, y)
            tree.fit(x_sample, y_sample)
            self._trees.append(tree)

    def _random_sample_indexes(self, a, size):
        """Draw ``size`` indexes from ``0 .. a - 1`` with replacement.

        Raises:
            ValueError: If ``size`` is larger than ``a``.
        """
        if size > a:
            raise ValueError(str(size) + " features can't be chosen out of " + str(a))

        random_sample_indexes = [self._rng.randrange(0, a, 1) for _ in range(size)]
        return np.array(random_sample_indexes)

    def _random_samples(self, x, y):
        """Return a bootstrap sample of ``(x, y)`` with as many rows as ``x``."""
        number_samples = x.shape[0]
        indexes = self._random_sample_indexes(number_samples, number_samples)
        return x[indexes], y[indexes]

    def _unique_labels(self, y):
        """Return the distinct labels in ``y`` as a sorted NumPy array."""
        list_labels = []
        for label in y:
            if label not in list_labels:
                list_labels.append(label)

        list_labels = np.array(list_labels)
        list_labels.sort()
        return list_labels

    def _majority_label(self, y):
        """Return the label that occurs most often in ``y``.

        Ties go to the tied label that appears first in ``y``. For two classes
        this matches the previous behavior (a tie returned ``y[0]``). With
        three or more classes the previous pairwise comparison could return a
        label that was not the most frequent.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot take a majority vote over zero labels")

        # dict keeps first-seen order, which is what makes the tie-break deterministic.
        occurrences = {}
        for label in y:
            occurrences[label] = occurrences.get(label, 0) + 1

        majority_lab = y[0]
        highest = 0
        for label, num_occur in occurrences.items():
            if num_occur > highest:  # strict: the earliest label wins a tie
                majority_lab = label
                highest = num_occur

        return majority_lab

    def predict(self, x):
        """Predict one label per row of ``x`` by majority vote over all trees.

        Raises:
            ValueError: If the forest has not been fitted yet.
        """
        if not self._trees:
            raise ValueError("RandomForest must be fitted before calling predict")

        # Rows: trees, columns: samples. Transposing groups the votes per sample.
        estimations = np.array([tree.predict(x) for tree in self._trees])
        tree_predictions = [self._majority_label(y=tree_estimate) for tree_estimate in estimations.T]
        return np.array(tree_predictions)

# 5. Standard scaler and splitting of the data.
- standard_scaler uses means and std deviations to standardize the features.
- split splits the data into training and testing sets.

In [None]:
def standard_scaler(features):
    """Standardize each feature column to zero mean and unit standard deviation.

    Args:
        features: 2D array-like with one row per sample and one column per feature.

    Returns:
        A float array of the same shape holding ``(features - mean) / std``
        computed per column. A column with zero standard deviation (a constant
        feature) is only centred, so it becomes all zeros instead of NaN/inf.
    """
    features = np.asarray(features, dtype=float)

    # Computation of mean and standard deviation
    means = features.mean(axis=0)
    std_deviations = features.std(axis=0)

    # Dividing by 1 leaves a constant column at exactly 0 after centring.
    safe_std_deviations = np.where(std_deviations == 0, 1.0, std_deviations)

    # Standardizing the features
    standardized_features = (features - means) / safe_std_deviations
    return standardized_features


def _random_indexes(array, size, random_state):  # For selecting the indexes for test features
    if size > array:
        raise ValueError(str(size) + " features can't be chosen out of " + str(array))
    random_indexes = []
    random.seed(random_state)
    random_index = random.randrange(0, array, 1)
    random_indexes.append(random_index)
    for _ in range(1, size, 1):
        random_index = random.randrange(0, array, 1)
        while random_index in random_indexes:
            random_index = random.randrange(0, array, 1)

        random_indexes.append(random_index)
    random_indexes = np.array(random_indexes)

    return random_indexes


def split(features, targets, test_size, random_state=50):
    """Randomly split features and targets into training and testing sets.

    The test set holds ``ceil(test_size * n_samples)`` samples, capped at the
    number of samples. Earlier versions used ``int(test_size * n_samples) + 1``,
    which added one extra test sample whenever the product was an exact integer
    and made ``test_size=1.0`` fail.

    Args:
        features: Array-like with one row per sample.
        targets: Array-like with one label per sample.
        test_size: Fraction of the samples to place in the test set, in [0, 1].
        random_state: Seed forwarded to ``_random_indexes``.

    Returns:
        ``(x_training, x_testing, y_training, y_testing)`` as NumPy arrays.
        Test rows follow the order in which their indexes were drawn; training
        rows keep their original order.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
    """
    import math

    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1, got " + str(test_size))

    features = np.asarray(features)
    targets = np.asarray(targets)

    number_of_samples = len(targets)
    t_size = min(math.ceil(test_size * number_of_samples), number_of_samples)

    random_indexes = np.asarray(_random_indexes(number_of_samples, t_size, random_state), dtype=int)

    # Boolean mask gives O(n) membership instead of scanning the index array per row.
    is_test = np.zeros(number_of_samples, dtype=bool)
    is_test[random_indexes] = True

    x_testing = features[random_indexes]
    y_testing = targets[random_indexes]
    x_training = features[~is_test]
    y_training = targets[~is_test]

    return x_training, x_testing, y_training, y_testing

# 6. Metrics
- _confusion_matrix computes true positives, false positives, true negatives, and false negatives.
- accuracy_score, precision_score, recall_score, f1_score and confusion_matrix compute the accuracy, precision, recall, and F1 scores and the confusion matrix.

In [None]:
def _confusion_matrix(y_testing, y_prediction):
    # Computing confusion matrix
    length_of_labels = len(y_testing)
    true_positive, false_positive, true_negative, false_negative = 0, 0, 0, 0

    for i in range(length_of_labels):
        if y_testing[i] == 1:
            if y_testing[i] == y_prediction[i]:
                true_positive += 1

            else:
                false_positive += 1

        if y_testing[i] == 0:
            if y_testing[i] == y_prediction[i]:
                true_negative += 1

            else:
                false_negative += 1

    return true_positive, false_positive, true_negative, false_negative


def accuracy_score(y_testing, y_preds):
    """Return ``(tp + tn) / (tp + fp + tn + fn)`` from the ``_confusion_matrix`` counts.

    Returns 0.0 when no sample was counted, instead of raising
    ``ZeroDivisionError``. This matches how ``recall_score`` and ``f1_score``
    handle an empty denominator.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    total = tp + fp + tn + fn
    if total == 0:
        return 0.0

    return (tp + tn) / total


def precision_score(y_testing, y_preds):
    """Return ``tp / (tp + fp)`` from the ``_confusion_matrix`` counts.

    Returns 0.0 when ``tp + fp`` is zero, instead of raising
    ``ZeroDivisionError``. This matches how ``recall_score`` and ``f1_score``
    handle an empty denominator.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    tp_fp = tp + fp
    if tp_fp == 0:
        return 0.0

    return tp / tp_fp


def recall_score(y_testing, y_preds):
    """Return ``tp / (tp + fn)`` from the ``_confusion_matrix`` counts.

    Yields 0.0 when ``tp + fn`` is zero.
    """
    tp, _fp, _tn, fn = _confusion_matrix(y_testing, y_preds)
    denominator = tp + fn
    return tp / denominator if denominator != 0 else 0.0


def f1_score(y_testing, y_preds):
    """Return ``2 * precision * recall / (precision + recall)``.

    Precision and recall come from ``precision_score`` and ``recall_score``.
    Yields 0.0 when their sum is zero.
    """
    p = precision_score(y_testing, y_preds)
    r = recall_score(y_testing, y_preds)
    combined = p + r
    if combined == 0:
        return 0.0

    return (2 * p * r) / combined


def confusion_matrix(y_testing, y_preds):
    """Return the ``_confusion_matrix`` counts as the nested list ``[[tp, fp], [fn, tn]]``."""
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    return [[tp, fp], [fn, tn]]

# 7. Main function.

In [None]:
if __name__ == "__main__":
    # End-to-end run: extract features, standardize, split, train, evaluate.
    feats, y_labels = spectral_centroid_rolloff()
    features = standard_scaler(feats)

    # Hold out a validation set (test_size=0.20, default random_state of split).
    X_train, X_valid, y_train, y_valid = split(features, y_labels, test_size=0.20)

    # Default number of trees; each tree is created with a maximum depth of 1.
    rf = RandomForest(minimum_samples_split=2, maximum_depth=1)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_valid)

    # Metrics on the validation data
    accuracy = accuracy_score(y_valid, predictions)
    precision = precision_score(y_valid, predictions)
    recall = recall_score(y_valid, predictions)
    f1 = f1_score(y_valid, predictions)
    confusion_mat = confusion_matrix(y_valid, predictions)

    # Report every metric on its own line.
    print('Accuracy score on the validation data: ', accuracy)
    print('Precision score on the validation data: ', precision)
    print('Recall score on the validation data: ', recall)
    print('F1 score on the validation data: ', f1)
    print('The confusion matrix on the validation: ', confusion_mat)

Accuracy score on the validation data:  0.75
Precision score on the validation data:  0.5
Recall score on the validation data:  1.0
F1 score on the validation data:  0.6666666666666666
The confusion matrix on the validation:  [[3, 3], [0, 6]]
Accuracy score on the validation data:  0.75
Precision score on the validation data:  0.5
Recall score on the validation data:  1.0
F1 score on the validation data:  0.6666666666666666
The confusion matrix on the validation:  [[3, 3], [0, 6]]
