In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch_geometric as tg

from sklearn.model_selection import train_test_split

# Define paths
Load the timeseries and find the ID of the subject whose timeseries has an unexpected length (not 150 timepoints).

In [5]:
# Root data folder, relative to the notebook's working directory.
data_path = os.path.join("..", "data", "cobre_difumo512")

# All inputs used below sit under the "difumo" sub-folder of the root.
ts_path, conn_path, pheno_path = (
    os.path.join(data_path, "difumo", leaf)
    for leaf in ("timeseries", "connectomes", "phenotypic_data.tsv")
)

In [6]:
# List the directory ONCE and in sorted order: os.listdir() returns entries in
# arbitrary order, so listing it separately for the data and for the ids gives
# no guarantee that the two lists line up, nor that the order is reproducible.
ts_files = sorted(os.listdir(ts_path))
timeseries = [np.load(os.path.join(ts_path, p)) for p in ts_files]
# The subject id is the second "_"-separated field of each file name.
ids = [int(p.split('_')[1]) for p in ts_files]

# Flag every subject whose timeseries does not have 150 timepoints.
# They are only reported here; nothing is removed from `timeseries` / `ids`.
# dtype=bool keeps the mask usable for indexing even when the folder is empty.
not_150 = np.array([t.shape[0] != 150 for t in timeseries], dtype=bool)
for bad_id in np.array(ids)[not_150]:
    print('Bad sub ID: {}'.format(bad_id))

Bad sub ID: 40075


# Make Graph
- Load connectomes
- Compute the group-average connectome
- Build an 8-nearest-neighbour (k-NN, k = 8) graph from the average connectome

In [7]:
def make_undirected(mat):
    """Return a symmetric copy of the adjacency matrix ``mat``.

    Wherever ``mat[i, j] != mat[j, i]`` both entries are replaced by their
    sum ``mat[i, j] + mat[j, i]``; entries that already agree are kept as-is.
    For an edge present in one direction only (weight ``w`` one way, ``0``
    the other) this copies ``w`` to both directions. ``mat`` is not modified.
    """
    m = mat.copy()
    mask = mat != mat.transpose()
    m[mask] = mat[mask] + mat.transpose()[mask]
    return m

def knn_graph(mat,k=8):
    """Return a symmetric k-Nearest-Neighbour weighted adjacency matrix.

    mat: square 2-D array of pairwise weights (absolute values are used)
    k: number of strongest neighbours kept per node (self-connections excluded)

    For every node the ``k`` largest absolute weights are kept and the rest
    are set to 0. The result is then always symmetrised with
    ``make_undirected``: neighbourhoods are not mutual in general (j can be
    among i's top-k without i being among j's), so the k-NN matrix can be
    asymmetric even when ``mat`` itself is symmetric.

    Raises ValueError if ``mat`` is not a square 2-D array or ``k`` is negative.
    ``k=0`` yields a graph without edges. ``mat`` is not modified.
    """
    if k < 0:
        raise ValueError('k must be non-negative, got {}'.format(k))
    m = np.abs(np.asarray(mat))  # np.abs allocates a new array, so `mat` stays untouched
    if m.ndim != 2 or m.shape[0] != m.shape[1]:
        raise ValueError('mat must be a square 2-D array, got shape {}'.format(m.shape))
    np.fill_diagonal(m,0)  # a node is never its own neighbour

    # Number of weakest entries to discard per column. Computed explicitly
    # because the slice [:-k] is empty for k == 0 and would keep every edge.
    n_drop = max(m.shape[0] - k, 0)
    for i in range(m.shape[0]):
        col = m[:,i]  # view into m: the assignment below edits m in place
        col[col.argsort()[:n_drop]] = 0

    # Row i of m.T holds the neighbours selected for node i.
    return make_undirected(m.T)
    
def make_group_graph(connectomes,k=8):
    """Build the group-level k-NN graph in torch_geometric format.

    connectomes: collection of equally-shaped connectivity matrices that can
        be stacked into one array (presumably one matrix per subject)
    k: number of neighbours kept per node, forwarded to ``knn_graph``

    Returns the object produced by ``torch_geometric.utils.from_networkx``.
    """
    # Element-wise mean over the first (stacking) axis
    group_mean = np.mean(np.array(connectomes), axis=0)

    # Sparsify the average connectome into a k-NN adjacency matrix
    adjacency = knn_graph(group_mean, k=k)

    # matrix -> networkx graph -> torch_geometric
    nx_graph = nx.convert_matrix.from_numpy_array(adjacency)
    return tg.utils.from_networkx(nx_graph)

# Get train/test/validation data
- Load timeseries and ids
- Split timeseries of 150 volumes into time windows
- Split data into train/test/validation
  - All data from a given subject goes in the same bin

In [8]:
def split_timeseries(ts,n_timepoints=50):
    """Split a timeseries along its first axis into windows of ``n_timepoints`` rows.

    ts: array whose first axis is time
    n_timepoints: window length; must divide ``ts.shape[0]`` exactly

    Returns a list of arrays, each with ``n_timepoints`` rows, in temporal order
    (an empty list when ``ts`` has no rows).
    Raises ValueError if ``n_timepoints`` is not positive or does not divide
    the length of ``ts`` evenly.
    """
    if n_timepoints < 1:
        raise ValueError('n_timepoints must be a positive integer, got {}'.format(n_timepoints))
    n_splits, remainder = divmod(ts.shape[0], n_timepoints)
    if remainder != 0:
        raise ValueError('Yikes choose a divisor for now')
    # np.split wants an integer section count; divmod returns floats when
    # n_timepoints is passed as a float (e.g. 50.0), hence the explicit int().
    n_splits = int(n_splits)
    if n_splits == 0:
        return []
    return np.split(ts,n_splits)

def split_ts_labels(timeseries,labels,n_timepoints=50):
    """Cut every timeseries into windows and expand the labels to match.

    timeseries: list of timeseries
    labels: list of lists (of accompanying labels); each inner list must hold
        exactly one entry per timeseries
    n_timepoints: n_timepoints of split (must be an even split)

    Returns ``(split_ts, split_labels)``:
      - ``split_ts``: flat list of windows, timeseries by timeseries
      - ``split_labels``: one array per entry of ``labels`` with every label
        repeated once per window of its timeseries, followed by a final list
        holding the position (0, 1, ...) of each window inside its timeseries.
    """
    # Split the timeseries
    windows_per_ts = [split_timeseries(t,n_timepoints=n_timepoints) for t in timeseries]
    split_ts = [w for windows in windows_per_ts for w in windows]

    # Number of windows produced by EACH timeseries. Using per-timeseries
    # counts (rather than the count of the first one) keeps labels aligned
    # with the windows when lengths differ, and works for an empty input.
    counts = np.array([len(windows) for windows in windows_per_ts], dtype=int)

    # keep track of the corresponding labels
    split_labels = [np.repeat(l,counts) for l in labels]

    # add a label for each split
    split_labels.append([i for c in counts for i in range(c)])
    return split_ts, split_labels

def train_test_val_splits(split_ids,test_size=0.20,val_size=0.10,random_state=111):
    """Assign every window to train, test or validation by its subject id.

    split_ids: subject id of each window (one entry per window)
    test_size, val_size: fractions of the unique ids held out for test / validation
    random_state: seed forwarded to both ``train_test_split`` calls

    All windows sharing an id land in the same group.
    Returns three lists of positions into ``split_ids``: (train, test, val).
    """
    unique_ids = np.unique(split_ids)
    held_out_fraction = test_size + val_size

    # Stage 1: training ids versus everything that is held out.
    train_ids, held_out_ids = train_test_split(
        unique_ids, test_size=held_out_fraction, random_state=random_state)
    # Stage 2: divide the held-out ids between test and validation.
    test_ids, val_ids = train_test_split(
        held_out_ids, test_size=val_size / held_out_fraction, random_state=random_state)

    train_idx, test_idx, val_idx = [], [], []
    for position in range(len(split_ids)):
        subject = split_ids[position]
        if subject in train_ids:
            train_idx.append(position)
        elif subject in test_ids:
            test_idx.append(position)
        elif subject in val_ids:
            val_idx.append(position)

    return train_idx, test_idx, val_idx

# Prep data
 1. Get cobre timeseries
 2. Get cobre connectomes
 3. Filter out subject with odd timeseries length
 4. Get group average connectome
 5. Build 8 k-NN graph from avg connectome
 6. Split data: 70% training, 10% validation, 20% test
 7. All data from the same subject are assigned to the same split
 8. Cut each timeseries into non-overlapping windows of `n_timepoints` volumes

In [9]:
class cobreTimeWindows(Dataset):
    """Dataset of fixed-length timeseries windows with one label per window.

    On construction it builds the group graph from the connectomes, cuts every
    timeseries into windows of ``n_timepoints`` rows, and computes train / test /
    validation index lists such that all windows of a subject share one group.
    It works on in-memory arrays only: it does not read files and does not
    filter subjects.
    """

    def __init__(self,timeseries,connectomes,sub_ids,labels,test_size=0.20,val_size=0.10,random_state=111,n_timepoints=50,k=8):
        """
        timeseries: list of arrays
        connectomes: list of arrays
        sub_ids: array of subject ids
        labels: array of subject labels
        test_size, val_size, random_state: forwarded to train_test_val_splits
        n_timepoints: window length (rows per window)
        k: number of neighbours for the group graph

        Raises ValueError if timeseries, sub_ids and labels differ in length.
        """
        # One id and one label are needed per timeseries; fail early instead of
        # silently pairing windows with the wrong subject.
        if not (len(timeseries) == len(sub_ids) == len(labels)):
            raise ValueError(
                'timeseries, sub_ids and labels must have the same length, got {}, {} and {}'.format(
                    len(timeseries), len(sub_ids), len(labels)))

        self.timeseries = timeseries
        self.connectomes = connectomes
        self.sub_ids = sub_ids
        self.labels = labels

        #make group connectomes
        self.graph = make_group_graph(self.connectomes,k=k)

        #split timeseries
        self.split_timeseries,split_labs = split_ts_labels(self.timeseries,[self.sub_ids,self.labels],n_timepoints=n_timepoints)
        self.split_sub_ids = split_labs[0]   # subject id of each window
        self.split_labels = split_labs[1]    # label of each window
        self.split_ids = split_labs[-1]      # position of each window within its timeseries

        #train test val split the data (each sub's splits in one category only)
        self.train_idx,self.test_idx,self.val_idx = train_test_val_splits(self.split_sub_ids,
                                                                            test_size=test_size,
                                                                            val_size=val_size,
                                                                            random_state=random_state)

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.split_timeseries)

    def __getitem__(self,idx):
        """Return ``(window, label)`` for window ``idx``.

        The window is transposed so that its first two axes are swapped (time
        becomes the last axis) and cast to float32, the default dtype of
        torch module parameters, so it can be fed to the model directly even
        when the stored arrays are float64.
        """
        window = torch.from_numpy(self.split_timeseries[idx]).transpose(0,1).float()
        return window,self.split_labels[idx]

# Model
 - C input channels (n time points of timeseries)
 - 6 GCN layers
 - 32 graph filters at each layer
 - Global average pooling layer
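 - 2 fully connected hidden layers
   - 256 and 128 units, followed by a 2-unit output layer
 - ReLU activation (applied after the graph convolution layers)
 - Softmax last layer (applied implicitly by the cross-entropy loss; the model itself outputs raw logits)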

In [10]:
class GCN(torch.nn.Module):
    """Graph convolutional classifier operating on one fixed graph.

    Six ChebConv layers (K=2, 32 output channels each) are followed by a
    pooling call, a flatten, and three linear layers (256 -> 128 -> n_classes)
    with dropout (p=0.5) after the first two. ``forward`` returns the output
    of the last linear layer; no softmax is applied inside the model.

    edge_index: edge index tensor passed to every ChebConv call
    edge_weight: edge weight tensor passed to every ChebConv call
    n_timepoints: number of input channels of the first convolution
    n_roi: number of graph nodes per sample; sizes the flattened input of the
        first linear layer (default 512, the value that used to be hard-coded)
    n_classes: number of output units (default 2, the value that used to be
        hard-coded)
    """

    def __init__(self,edge_index,edge_weight,n_timepoints = 50,n_roi = 512,n_classes = 2):
        super().__init__()
        self.edge_index = edge_index
        self.edge_weight = edge_weight
        # Size of one sample after flattening (nodes x 32 channels).
        self.n_flat = n_roi*32
        self.conv1 = tg.nn.ChebConv(in_channels=n_timepoints,out_channels=32,K=2,bias=True)
        self.conv2 = tg.nn.ChebConv(in_channels=32,out_channels=32,K=2,bias=True)
        self.conv3 = tg.nn.ChebConv(in_channels=32,out_channels=32,K=2,bias=True)
        self.conv4 = tg.nn.ChebConv(in_channels=32,out_channels=32,K=2,bias=True)
        self.conv5 = tg.nn.ChebConv(in_channels=32,out_channels=32,K=2,bias=True)
        self.conv6 = tg.nn.ChebConv(in_channels=32,out_channels=32,K=2,bias=True)
        self.fc1 = nn.Linear(self.n_flat, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, n_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self,x):
        """Run the network on ``x`` and return one row of ``n_classes`` outputs per sample.

        Assumes x is (batch, n_roi, n_timepoints) — TODO confirm against the data loader.
        """
        x = self.conv1(x,self.edge_index,self.edge_weight)
        x = F.relu(x)
        x = self.conv2(x,self.edge_index,self.edge_weight)
        x = F.relu(x)
        x = self.conv3(x,self.edge_index,self.edge_weight)
        x = F.relu(x)
        x = self.conv4(x,self.edge_index,self.edge_weight)
        x = F.relu(x)
        x = self.conv5(x,self.edge_index,self.edge_weight)
        x = F.relu(x)
        x = self.conv6(x,self.edge_index,self.edge_weight)

        # One pooling group per element of the first dimension of x. The index
        # is created directly as an int64 tensor on x's device instead of going
        # through numpy (CPU-only, platform-dependent integer width).
        # NOTE(review): with a distinct group per sample this call does not
        # look like it averages across nodes — confirm the intended pooling
        # against the installed torch_geometric version.
        x = tg.nn.global_mean_pool(x,torch.arange(x.size(0),device=x.device))

        x = x.view(-1, self.n_flat)
        # NOTE(review): no non-linearity is applied between the linear layers
        # although the notebook text lists ReLU — confirm whether that is intended.
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x
        


# Model Training
 - Adam optimizer
 - Learning rate: 0.001
 - Weight decay: 0.0005

In [11]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    dataloader: yields ``(X, y)`` batches
    model: module being optimised; it is switched to training mode first
    loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor
    optimizer: optimizer holding the model's parameters

    After every batch one line is printed with the batch loss and the number
    of samples processed before that batch, out of the sampler's total.
    """
    size = len(dataloader.sampler)
    # Make sure dropout & co. are in training mode even if an evaluation
    # routine left the model in eval mode.
    model.train()

    seen = 0  # samples consumed by previous batches
    for X, y in dataloader:
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        # Running count instead of batch_index * len(X): the latter is wrong
        # for a final batch that is smaller than the others.
        seen += len(X)


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.sampler)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model.forward(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [12]:
# Set-up cell: window length / batch size, group graph, dataset, loaders, model.
# NOTE(review): this cell does not run as written (see the AxisError below it);
# the two notes that follow explain why.
n_timepoints = 50
batch_size = 128

# Create group graph
# NOTE(review): make_group_graph() computes np.array(connectomes).mean(axis=0),
# i.e. it expects a collection of connectome arrays. Here it receives the
# directory path string `conn_path`, which becomes a 0-dimensional array and
# raises the AxisError. The connectome arrays have to be loaded first.
graph = make_group_graph(conn_path)

# Create dataset
# NOTE(review): cobreTimeWindows.__init__ takes (timeseries, connectomes,
# sub_ids, labels) as in-memory arrays; it does not read files and does not
# filter out the subject with the odd-length timeseries. This call passes two
# path strings and omits sub_ids / labels, so it would fail as well. The
# timeseries, connectomes, ids and labels need to be loaded and filtered
# before this point — TODO confirm the layout of the phenotypic file.
data = cobreTimeWindows(ts_path,pheno_path,n_timepoints=n_timepoints)

# Create PT data samplers and loaders:
# Each sampler draws, in random order, only from the window indices of its split.
train_sampler = SubsetRandomSampler(data.train_idx)
test_sampler = SubsetRandomSampler(data.test_idx)
val_sampler = SubsetRandomSampler(data.val_idx)

# All three loaders wrap the same dataset; the sampler decides which windows are served.
train_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, sampler=test_sampler)
val_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, sampler=val_sampler)

# Create model
# The edge weights are read from the `weight` attribute of the converted graph.
gcn = GCN(graph.edge_index,graph.weight,n_timepoints=n_timepoints)

AxisError: axis 0 is out of bounds for array of dimension 0

In [14]:
# NOTE(review): leftover debugging cell. `connectomes` is not defined by any
# cell shown in this notebook, hence the NameError below; remove this cell or
# load the connectomes before it.
np.array(connectomes)

NameError: name 'connectomes' is not defined

In [13]:
# Train and evaluate model
# Adam with the learning rate and weight decay listed in the section header.
epochs = 15
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gcn.parameters(), lr=1e-3, weight_decay=5e-4)

# One training pass followed by one evaluation pass per epoch.
# NOTE(review): the evaluation uses test_loader every epoch while val_loader is
# never used anywhere — confirm whether per-epoch monitoring was meant to run
# on the validation split.
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train_loop(train_loader, gcn, loss_fn, optimizer)
    test_loop(test_loader, gcn, loss_fn)
print("Done!")

NameError: name 'gcn' is not defined