In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

In [2]:
def string_to_idx(string):
    arr = [int(c) * (10 ** (len(string) - i - 1)) for i, c in enumerate(string)]
    return sum(arr)

def load_data(graphs_dir):
    file_names = os.listdir(graphs_dir)
    if ".DS_Store" in file_names:
        file_names.remove(".DS_Store")
    file_names.sort()

    data_idx = [string_to_idx(f.split("_")[0]) for f in file_names]
    labels = [int(f.split("_")[1].split(".")[0]) for f in file_names]

    file_names = [graphs_dir + f for f in file_names]
    data = [np.load(f) for f in file_names]
    N = data[0].shape[0]
    mul = np.eye(N) - np.ones([N, N])
    data = np.abs(np.array([mul * d for d in data]))

    return data, labels, data_idx

def thresholding(data, threshold = None):
    if threshold != None:
        data[data < threshold] = 0
        data[data >= threshold] = 1
    elif threshold == None:
        data = np.abs(data)
        data = np.where(data > 0, 1, 0)
    return data

def get_edge_list(data):
    N = data[0].shape[0]
    edge_list = []
    for i in range(N):
        for j in range(i + 1, N):
            edge_list.append(int(data[i, j]))
    return edge_list

In [3]:
graphs_dir = "./Outputs/Graphs/ADHD/"
data, labels, _ = load_data(graphs_dir)
data = thresholding(data, threshold = None)
data_edge_list = [get_edge_list(d) for d in data]

In [4]:
top_k = 5
trials = 100
num_data = len(data_edge_list)
# test_percent = 0.15
test_size = 5
train_size = num_data - test_size

In [5]:
accuracies = []

for t in range(trials):
    idx = np.random.permutation(num_data)
    train_idx, test_idx = idx[ : train_size].tolist(), idx[train_size : ].tolist()
    
    correct = 0
    for i in test_idx:
        dat = data_edge_list[i]
        f_score_list = []
        for j in train_idx:
            train_dat = data_edge_list[j]
            f_score = f1_score(train_dat, dat)
            f_score_list.append(f_score)
        top_k_idx = np.argsort(f_score_list)[-top_k : ]
        top_k_labels = [labels[train_idx[i]] for i in top_k_idx]
        pred = int(2.0 * sum(top_k_labels) / top_k)
        ground_truth = labels[i]

        if pred == ground_truth:
            correct += 1
    
    acc = 100 * correct / test_size
    print(f"Trial {t + 1} / {trials} - Accuracy: {acc:.2f}%\n")
    accuracies.append(acc)

Trial 1 / 100 - Accuracy: 60.00%

Trial 2 / 100 - Accuracy: 60.00%

Trial 3 / 100 - Accuracy: 40.00%

Trial 4 / 100 - Accuracy: 40.00%

Trial 5 / 100 - Accuracy: 60.00%

Trial 6 / 100 - Accuracy: 60.00%

Trial 7 / 100 - Accuracy: 80.00%

Trial 8 / 100 - Accuracy: 20.00%

Trial 9 / 100 - Accuracy: 20.00%

Trial 10 / 100 - Accuracy: 60.00%

Trial 11 / 100 - Accuracy: 40.00%

Trial 12 / 100 - Accuracy: 20.00%

Trial 13 / 100 - Accuracy: 40.00%

Trial 14 / 100 - Accuracy: 60.00%

Trial 15 / 100 - Accuracy: 60.00%

Trial 16 / 100 - Accuracy: 40.00%

Trial 17 / 100 - Accuracy: 60.00%

Trial 18 / 100 - Accuracy: 60.00%

Trial 19 / 100 - Accuracy: 100.00%

Trial 20 / 100 - Accuracy: 40.00%

Trial 21 / 100 - Accuracy: 60.00%

Trial 22 / 100 - Accuracy: 60.00%

Trial 23 / 100 - Accuracy: 20.00%

Trial 24 / 100 - Accuracy: 20.00%

Trial 25 / 100 - Accuracy: 60.00%

Trial 26 / 100 - Accuracy: 0.00%

Trial 27 / 100 - Accuracy: 20.00%

Trial 28 / 100 - Accuracy: 40.00%

Trial 29 / 100 - Accuracy: 0.

In [6]:
print(f"Average accuracy: {np.mean(accuracies):.2f}%")

Average accuracy: 44.00%
