# Reading Files

In [None]:
import os

import pandas as pd
import networkx as nx
import numpy as np

from fastdtw import fastdtw
from tqdm import tqdm
from node2vec import Node2Vec
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.spatial.distance import euclidean

In [3]:
def batch_read(path: str) -> list[pd.DataFrame]:
    df_list = []
    for file in tqdm(os.listdir(path)):
        df = pd.read_csv(f'{path}/{file}')
        df_list.append(df)
    return df_list

In [4]:
control_data = batch_read('../data/Controls_columns')

100%|██████████| 66/66 [00:03<00:00, 19.13it/s]


In [5]:
parkinson_data = batch_read('../data/PDs_columns')

100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


In [6]:
np.unique([column.split('.')[0] for column in parkinson_data[0].columns])

array(['AAL3', 'CSF', 'Grey Matter', 'Shen_268', 'White Matter', 'atlas',
       'networks'], dtype='<U12')

Os atlas possíveis são AAL3, Shen_268 e atlas. Cada atlas particiona o cérebro de maneiras diferentes. Devemos selecionar as colunas correspondentes para aplicar o processamento

In [7]:
all_columns = parkinson_data[0].columns
AAL3_columns = all_columns[np.where([column.split('.')[0] == 'AAL3' for column in all_columns])[0]]
AAL3_columns[:5]

Index(['AAL3.cluster001', 'AAL3.cluster002', 'AAL3.cluster003',
       'AAL3.cluster004', 'AAL3.cluster005'],
      dtype='object')

In [8]:
control_AAL3_data = [data[AAL3_columns] for data in control_data]
parkinson_AAL3_data = [data[AAL3_columns] for data in parkinson_data]

## Create Graphs

Demora muito para calcular a rede funcional com DTW

In [9]:
def dtw_distance(time_series1: np.array, time_series2: np.array) -> float:
    distance, _ = fastdtw(time_series1.reshape(-1, 1), time_series2.reshape(-1, 1), dist=euclidean)
    return distance

def compute_functional_network(time_series: np.array, distance_function: callable) -> np.array:
    n = time_series.shape[0]
    functional_network = np.zeros((n, n))
    for i in tqdm(range(1, n), leave=True):
        for j in tqdm(range(i+1, n), leave=False):
            distance = distance_function(time_series[i].reshape(-1, 1), time_series[j].reshape(-1, 1))
            functional_network[i, j] = distance
            functional_network[j, i] = distance
    return functional_network

Vamos utilizar Pearson

### Teste com a Triangular Superior

In [12]:
parkinson_AAL3_data[0].shape

(240, 166)

In [13]:
upper_triangular_indices = np.triu_indices(166)

parkinson_correlation_matrix = [time_series.corr().to_numpy()[upper_triangular_indices] for time_series in parkinson_AAL3_data]
control_correlation_matrix = [time_series.corr().to_numpy()[upper_triangular_indices] for time_series in control_AAL3_data]

#### Removendo pacientes com erros

Checando pacientes que possuem séries constantes (resultando em valores nulos na matriz de correlação)

In [14]:
np.where(pd.DataFrame(parkinson_correlation_matrix).isna().any(axis=1) == True)

(array([79]),)

In [15]:
AAL3_columns[np.where(parkinson_AAL3_data[79].std() == 0)]

Index(['AAL3.cluster106'], dtype='object')

O paciente indice 79 do grupo com Parkinson possui a série AAL3.cluster106 com valor constante

In [16]:
parkinson_AAL3_data[79]['AAL3.cluster106']

0      0
1      0
2      0
3      0
4      0
      ..
235    0
236    0
237    0
238    0
239    0
Name: AAL3.cluster106, Length: 240, dtype: int64

In [17]:
X = np.concatenate([
    parkinson_correlation_matrix,
    control_correlation_matrix
], axis=0)

In [18]:
y = np.concatenate([
    [1 for _ in range(len(parkinson_data))],
    [0 for _ in range(len(control_data))]
])

Selecionando séries temporais corretas

In [19]:
notna_indices = np.logical_not(np.isnan(X).any(axis=1))
X = X[notna_indices, :]
y = y[notna_indices]
print(X.shape, y.shape)

(178, 13861) (178,)


#### Train/Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
params = {
    'alpha': np.arange(1e-4, 1e-2, 1e-3),
    'learning_rate': ['constant', 'adaptive'],
}

#### Training

Why are there NaN values?

In [22]:
model = MLPClassifier(early_stopping=True, hidden_layer_sizes=(100, 100))
hyperparam_optimization = RandomizedSearchCV(model, params, random_state=1)
search = hyperparam_optimization.fit(X_train, y_train)
search.best_params_

{'learning_rate': 'adaptive', 'alpha': 0.0031}

In [23]:
optimized_model = MLPClassifier(early_stopping=True, **search.best_params_).fit(X_train, y_train)
optimized_model.score(X_test, y_test)

0.7555555555555555

In [27]:
optimized_model.predict(X_test)

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1])