# Correlation Matrix with Pytorch

In [10]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
import sys
print(os.listdir('../'))

['teste', 'pyproject.toml', 'report', 'weights', 'notebooks', 'requirements.txt', 'data', '.gitignore', 'README.md', 'Makefile', 'parkinson', '.git']


In [12]:
import sys
# sys.path.append('/home/labic/merlin_codes/dl/Parkinson-Diagnosis-Deeplearning')
sys.path.append('../')

import os
import time
    
import parkinson

import torch
from torch import nn

import pandas as pd
import networkx as nx
import numpy as np

import pickle
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [13]:
RDN = 50
N_CLASSES = 2
BATCH_SIZE = 32
N_EPOCHS = 200
PATIENCE = 20
LR = 0.0001

## Data Processing
### Reading

In [14]:
parkinson_data = parkinson.utils.data.batch_read('../data/PDs_columns')
control_data = parkinson.utils.data.batch_read('../data/Controls_columns')

control_atlas_data = parkinson.utils.data.select_atlas_columns(control_data, 'AAL3')
parkinson_atlas_data = parkinson.utils.data.select_atlas_columns(parkinson_data, 'AAL3')

100%|██████████| 153/153 [00:03<00:00, 43.06it/s]
100%|██████████| 66/66 [00:01<00:00, 43.56it/s]


In [15]:
def save_cache(path, data):
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def load_cache(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def compute_all_dtw_matrices(time_series_list, cache_path, final_cache_path=None, n_jobs=4):
    if final_cache_path is None:
        final_cache_path = cache_path.replace('.pkl', '_final.pkl')

    # Carrega cache parcial existente, se houver
    if os.path.exists(cache_path):
        print(f"Carregando cache existente de {cache_path}...")
        completed_results = load_cache(cache_path)
    else:
        completed_results = {}

    total = len(time_series_list)
    indices_to_process = [i for i in range(total) if i not in completed_results]

    print(f"Total: {total} pacientes")
    print(f"Já computados: {len(completed_results)}")
    print(f"Restantes: {len(indices_to_process)}")

    def process_and_save(idx):
        print(f"Iniciando paciente {idx}...")
        start_time = time.time()
        ts = time_series_list[idx]
        result = parkinson.utils.graph.compute_dtw_matrix(ts)
        elapsed = time.time() - start_time
        print(f"Paciente {idx} finalizado em {elapsed:.2f} segundos.")
        return idx, result

    # Processa os pacientes em batches de n_jobs
    for batch_start in range(0, len(indices_to_process), n_jobs):
        batch_indices = indices_to_process[batch_start:batch_start + n_jobs]

        print(f"\nProcessando batch {batch_start} a {batch_start + len(batch_indices) - 1}...")

        new_results = Parallel(n_jobs=n_jobs)(
            delayed(process_and_save)(i) for i in batch_indices
        )

        # Atualiza o cache com os resultados do batch
        for idx, res in new_results:
            completed_results[idx] = res

        # Salva o cache parcial após cada batch
        save_cache(cache_path, completed_results)
        print(f"Cache parcial salvo após batch {batch_start}.")

    # Reordena os resultados finais
    ordered_results = [completed_results[i] for i in range(total)]

    # Salva o resultado final completo
    save_cache(final_cache_path, completed_results)
    print(f"\nResultado final salvo em {final_cache_path}")

    return ordered_results

In [16]:
# com DTW
parkinson_cache_path = "cache_dtw_parkinson.pkl"
parkinson_correlation_matrix = compute_all_dtw_matrices(parkinson_atlas_data, parkinson_cache_path, n_jobs=12)

Carregando cache existente de cache_dtw_parkinson.pkl...
Total: 153 pacientes
Já computados: 153
Restantes: 0

Resultado final salvo em cache_dtw_parkinson_final.pkl


In [17]:
# com DTW
control_cache_path = "cache_dtw_control.pkl"
control_correlation_matrix = compute_all_dtw_matrices(control_atlas_data, control_cache_path, n_jobs=6)

Carregando cache existente de cache_dtw_control.pkl...
Total: 66 pacientes
Já computados: 66
Restantes: 0

Resultado final salvo em cache_dtw_control_final.pkl


In [None]:
# com pearson (descomentar se quiser usar pearson ao invés de DTW)
# parkinson_correlation_matrix = [parkinson.utils.graph.compute_correlation_matrix(time_series) for time_series in parkinson_atlas_data]
# control_correlation_matrix = [parkinson.utils.graph.compute_correlation_matrix(time_series) for time_series in control_atlas_data]

In [None]:
X = parkinson.utils.data.concatenate_data(parkinson_correlation_matrix, control_correlation_matrix)
y = parkinson.utils.data.concatenate_data([1 for _ in range(len(parkinson_data))], [0 for _ in range(len(control_data))])

X, y = parkinson.utils.data.filter_data(X, y)

### Split  
- 60% treino
- 20% validação
- 20% teste

In [20]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=RDN, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=RDN, stratify=y_trainval, shuffle=True)

ros = RandomOverSampler(random_state=RDN)
X_train, y_train = ros.fit_resample(X_train, y_train)

### Dataloaders

In [21]:
train_loader = parkinson.utils.data.get_torch_dataloader(X_train, y_train, batch_size=BATCH_SIZE)
val_loader = parkinson.utils.data.get_torch_dataloader(X_val, y_val, batch_size=BATCH_SIZE)
test_loader = parkinson.utils.data.get_torch_dataloader(X_test, y_test, batch_size=BATCH_SIZE)

## Training

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = parkinson.NetworkModels.SimpleMLP(input_dim=X_train.shape[1], hidden_dim=16, output_dim=2)
class_weights = parkinson.utils.data.get_torch_class_weights(y_train)

In [23]:
model.train()
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

In [24]:
%%time
out = parkinson.utils.train.train(model, train_loader, val_loader, class_weights, device, N_EPOCHS,  PATIENCE,LR)

train-loss: 0.6991  train-acc: 0.5000 | val-loss: 0.6559  val-acc: 0.7045:  18%|█▊        | 35/200 [00:11<00:54,  3.05it/s]

Early stopping at epoch 36
CPU times: user 8.75 s, sys: 9.36 s, total: 18.1 s
Wall time: 12.6 s





In [25]:
metrics = parkinson.utils.train.evaluate(model, test_loader, device)
print('Metrics:', metrics)
print('Done.')

Preds: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Metrics: {'acc': 0.7045454545454546, 'f1': 0.5824242424242424, 'recall': 0.7045454545454546, 'precision': 0.4963842975206612}
Done.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
import matplotlib.pyplot as plt

val_loss =  s['val_loss']
plt.plot(np.arange(len(val_loss)), val_loss)

NameError: name 's' is not defined