# Reading Files

In [29]:
import os

import pandas as pd
import networkx as nx
import numpy as np

from fastdtw import fastdtw
from tqdm import tqdm
from node2vec import Node2Vec
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.spatial.distance import euclidean

In [2]:
def batch_read(path: str) -> list[pd.DataFrame]:
    df_list = []
    for file in tqdm(os.listdir(path)):
        df = pd.read_csv(f'{path}/{file}')
        df_list.append(df)
    return df_list

In [3]:
control_data = batch_read('../data/Controls_columns')

  0%|          | 0/66 [00:00<?, ?it/s]

100%|██████████| 66/66 [00:03<00:00, 17.29it/s]


In [4]:
parkinson_data = batch_read('../data/PDs_columns')

100%|██████████| 113/113 [00:06<00:00, 17.64it/s]


In [8]:
np.unique([column.split('.')[0] for column in parkinson_data[0].columns])

array(['AAL3', 'CSF', 'Grey Matter', 'Shen_268', 'White Matter', 'atlas',
       'networks'], dtype='<U12')

Os atlas possíveis são AAL3, Shen_268 e atlas. Cada atlas particiona o cérebro de maneiras diferentes. Devemos selecionar as colunas correspondentes para aplicar o processamento

In [23]:
all_columns = parkinson_data[0].columns
AAL3_columns = all_columns[np.where([column.split('.')[0] == 'AAL3' for column in all_columns])[0]]
AAL3_columns[:5]

Index(['AAL3.cluster001', 'AAL3.cluster002', 'AAL3.cluster003',
       'AAL3.cluster004', 'AAL3.cluster005'],
      dtype='object')

In [25]:
control_AAL3_data = [data[AAL3_columns] for data in control_data]
parkinson_AAL3_data = [data[AAL3_columns] for data in parkinson_data]

## Create Graphs

Demora muito para calcular a rede funcional com DTW

In [30]:
def dtw_distance(time_series1: np.array, time_series2: np.array) -> float:
    distance, _ = fastdtw(time_series1.reshape(-1, 1), time_series2.reshape(-1, 1), dist=euclidean)
    return distance

def compute_functional_network(time_series: np.array, distance_function: callable) -> np.array:
    n = time_series.shape[0]
    functional_network = np.zeros((n, n))
    for i in tqdm(range(1, n), leave=True):
        for j in tqdm(range(i+1, n), leave=False):
            distance = distance_function(time_series[i].reshape(-1, 1), time_series[j].reshape(-1, 1))
            functional_network[i, j] = distance
            functional_network[j, i] = distance
    return functional_network

In [None]:
compute_functional_network(parkinson_AAL3_data[0].to_numpy(), dtw_distance)

Vamos utilizar Pearson

### Teste com a Triangular Superior

In [37]:
parkinson_AAL3_data[0].shape

(240, 166)

In [38]:
upper_triangular_indices = np.triu_indices(166)

parkinson_correlation_matrix = [time_series.corr().to_numpy()[upper_triangular_indices] for time_series in parkinson_AAL3_data]
control_correlation_matrix = [time_series.corr().to_numpy()[upper_triangular_indices] for time_series in control_AAL3_data]

In [53]:
parkinson_correlation_matrix[0].shape

(13861,)

In [54]:
X = np.concatenate([
    parkinson_correlation_matrix,
    control_correlation_matrix
], axis=0)

In [56]:
y = np.concatenate([
    [1 for _ in range(len(parkinson_data))],
    [0 for _ in range(len(control_data))]
])

#### Train/Test Split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [60]:
params = {
    'hidden_layer_sizes': (100, 100, 100),
    'alpha': np.arange(1e-4, 1e-2, 1e-3),
    'learning_rate': ['constant', 'adaptive'],
}

#### Training

Why are there NaN values?

In [61]:
model = MLPClassifier(early_stopping=True)
hyperparam_optimization = RandomizedSearchCV(model, params, random_state=1)
search = hyperparam_optimization.fit(X_train, y_train)
search.best_params_

Traceback (most recent call last):
  File "/home/vitor/miniforge3/envs/parkinson/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/vitor/miniforge3/envs/parkinson/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 472, in __call__
    return estimator.score(*args, **kwargs)
  File "/home/vitor/miniforge3/envs/parkinson/lib/python3.9/site-packages/sklearn/base.py", line 572, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/home/vitor/miniforge3/envs/parkinson/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 1182, in predict
    return self._predict(X)
  File "/home/vitor/miniforge3/envs/parkinson/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 1186, in _predict
    y_pred = self._forward_pass_fast(X, check_input=check_input)
  File "/home/vitor/mini

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13851,13852,13853,13854,13855,13856,13857,13858,13859,13860
0,1.0,0.379491,-0.074240,-0.048305,0.181852,0.073803,0.450929,0.410390,0.368395,0.427828,...,1.0,0.089115,0.124931,0.079752,1.0,-0.101149,0.113513,1.0,0.127300,1.0
1,1.0,0.637468,-0.084717,-0.134049,-0.067713,-0.122563,0.108029,0.176920,0.104330,0.053482,...,1.0,0.051924,0.038661,0.184871,1.0,0.080619,0.070074,1.0,-0.120449,1.0
2,1.0,0.879750,0.016016,-0.128961,-0.199359,-0.294118,0.003476,0.157900,-0.055491,-0.116494,...,1.0,0.034754,-0.351833,0.053151,1.0,0.041619,0.152763,1.0,0.133512,1.0
3,1.0,0.480276,0.355473,0.019761,0.362609,-0.038310,0.514590,-0.111072,0.346668,-0.158441,...,1.0,0.028604,-0.013055,0.053237,1.0,-0.060730,0.104242,1.0,0.091097,1.0
4,1.0,0.164299,0.575804,0.089667,0.568047,0.142096,0.546026,-0.158749,0.435653,-0.199536,...,1.0,0.113104,0.075452,0.043636,1.0,-0.128235,0.220301,1.0,-0.438330,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,1.0,0.694800,0.409443,0.074431,0.314380,0.047375,0.657915,0.338873,0.473940,0.232238,...,1.0,0.096485,0.203637,0.418732,1.0,0.275571,0.246035,1.0,0.212942,1.0
175,1.0,0.304920,-0.016491,-0.410828,0.039766,-0.273673,0.612212,0.189458,0.613211,0.293289,...,1.0,-0.009617,0.034761,0.268509,1.0,0.113632,-0.034542,1.0,0.198781,1.0
176,1.0,0.716324,0.365387,0.215571,0.454519,-0.047180,0.371976,0.206024,0.336097,0.124317,...,1.0,0.097590,0.066128,0.259675,1.0,-0.118689,-0.106592,1.0,0.047757,1.0
177,1.0,0.662847,0.301545,0.279085,0.409955,0.234694,0.310006,-0.096177,0.291628,-0.034034,...,1.0,0.301098,0.008829,0.127315,1.0,0.128821,0.151948,1.0,0.036646,1.0


In [45]:
optimized_model = MLPClassifier(early_stopping=True, **search.best_params_).fit(X_train, y_train)
optimized_model.score(X_test, y_test)

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
optimized_model.predict(X_train)

array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1])

### Node2Vec

Não tá funfando por hora. Talvez a matriz de correlação/Grafo precise ser Positiva Semi Definida (não pode ter valores negativos e umas propriedades a mais ai)

In [216]:
graph = nx.from_numpy_array(
    parkinson_data[0].T.corr().to_numpy()
)

node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)

n2v_model = node2vec.fit(window=10, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 240/240 [00:28<00:00,  8.39it/s]
Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]

ValueError: probabilities are not non-negative

In [224]:
model.wv.vectors # para encontrar o embedding

array([[ 4.10542369e-01, -8.13800544e-02,  1.05252430e-01,
         3.20213556e-01,  2.94262171e-01, -8.20183694e-01,
        -7.89105952e-01, -1.98348071e-02,  6.49442732e-01,
         2.30842590e-01],
       [ 4.11620647e-01,  4.74907979e-02,  1.99726164e-01,
         2.47501627e-01,  2.30991051e-01, -8.57504308e-01,
        -7.93270350e-01, -4.75351214e-02,  6.18805051e-01,
         3.10785681e-01],
       [ 3.62728894e-01, -3.80808376e-02,  1.49629399e-01,
         3.38716775e-01,  2.45367005e-01, -8.47277462e-01,
        -7.47303724e-01,  3.06409388e-03,  6.56970203e-01,
         3.15002412e-01],
       [ 3.93635809e-01, -1.44947495e-03,  1.27001777e-01,
         3.04565161e-01,  3.16155165e-01, -8.29582632e-01,
        -7.38255560e-01, -5.62160928e-03,  6.67009890e-01,
         3.20622265e-01],
       [ 3.99246514e-01,  1.95658579e-02,  1.65241390e-01,
         3.42297286e-01,  3.17477643e-01, -7.76112199e-01,
        -7.18523741e-01, -4.24726196e-02,  7.02263057e-01,
         3.