In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse

%matplotlib inline
sns.set(rc={'image.cmap': 'coolwarm'})

#from numba import jit,prange

import time
import os

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [2]:
def correlation_matrix(size, corr):
    corr_matrix = np.zeros((size,size))
    cont = 0

    for i in range(size):
        for j in range(i, size):
            if i == j:
                corr_matrix[i,j] = float('nan')


            else:
                corr_matrix[i,j] = corr[cont]
                corr_matrix[j,i] = corr[cont]
                cont += 1 
    return corr_matrix

In [3]:
def import_data(fisher):
    if fisher == True:
        df = pd.read_csv(r'/Users/rodrigo/Post-Grad/CC400/corr_matrices_fisher200.csv',index_col=['Institution','Subject'])
        phenotypic = pd.read_csv(r'/Users/rodrigo/Post-Grad/CC400/phenotypic200.csv',index_col=['Institution','Subject'])
    else:
        df = pd.read_csv(r'/Users/rodrigo/Post-Grad/CC400/corr_matrices200.csv',index_col=['Institution','Subject','Run'])
        phenotypic = pd.read_csv(r'/Users/rodrigo/Post-Grad/CC400/phenotypic200.csv',index_col=['Institution','Subject'])
    return df,phenotypic

In [4]:
df, phenotypic = import_data(fisher=True)
df = df.join(pd.DataFrame(phenotypic.Age), how='left')
df = df.join(pd.DataFrame(phenotypic['ADHD Measure']), how='left')
df = df.join(pd.DataFrame(phenotypic['Gender']), how='left')

#df = ((df.reset_index()).drop(columns=['Institution', 'Subject','Run'])).dropna(subset=[str(x) for x in range(0,61425)])
df = ((df.reset_index()).drop(columns=['Institution', 'Subject'])).dropna(subset=[str(x) for x in range(0,17954)])
y = df.Gender
X = df.iloc[:,:-3]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15, shuffle=True)

In [7]:
train_data = []

for i in range((X_train.shape[0])):
    adj = torch.from_numpy(correlation_matrix(190,X.iloc[i,:].values)).float()
    edge_index, edge_attr = dense_to_sparse(adj)
    train_data.append(Data(x=adj, edge_index=edge_index, edge_attr=edge_attr, y=y.iloc[i]))

In [8]:
test_data = []

for i in range((X_test.shape[0])):
    adj = torch.from_numpy(correlation_matrix(190,X.iloc[i,:].values)).float()
    edge_index, edge_attr = dense_to_sparse(adj)
    test_data.append(Data(x=adj, edge_index=edge_index, edge_attr=edge_attr, y=y.iloc[i]))

In [9]:
from torch_geometric.loader import DataLoader

batch_size = 32

train_loader = DataLoader(train_data, batch_size, shuffle=True)

test_loader = DataLoader(test_data, batch_size, shuffle=True)


In [23]:
for data in train_loader:
    print('num_nodes',data.num_nodes)
    print(data.x.shape)
    #print(data.edge_index.shape)
    #print(data.batch)
    #print(data.y)
    print('num_graphs', data.num_graphs)
    # data = data.to(device)
    # optimizer.zero_grad()
    # out = model(data.x, data.edge_index, data.batch)
    # loss = F.cross_entropy(out, data.y)
    # loss.backward()
    # optimizer.step()
    # total_loss += float(loss) * data.num_graphs


num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_graphs 32
num_nodes 6080
torch.Size([6080, 190])
num_gra

In [11]:
import torch
import torch.nn.functional as func
from torch_geometric.nn import ChebConv, global_mean_pool


class GCN(torch.nn.Module):
    """GCN model(network architecture can be modified)"""

    def __init__(self,
                 num_features,
                 num_classes,
                 k_order,
                 dropout=.3):
        super(GCN, self).__init__()

        self.p = dropout

        self.conv1 = ChebConv(int(num_features), 128, K=k_order)
        self.conv2 = ChebConv(128, 64, K=k_order)
        self.conv3 = ChebConv(64, 32, K=k_order)

        self.lin1 = torch.nn.Linear(32, int(num_classes))

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        batch = data.batch
        
        x = func.relu(self.conv1(x, edge_index, edge_attr))
        x = func.dropout(x, p=self.p, training=self.training)
        x = func.relu(self.conv2(x, edge_index, edge_attr))
        x = func.dropout(x, p=self.p, training=self.training)
        x = func.relu(self.conv3(x, edge_index, edge_attr))

        x = global_mean_pool(x, batch)
        x = self.lin1(x)
        return x

In [13]:

def GCN_train(loader):
    model.train()

    loss_all = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = func.cross_entropy(output, data.y)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)


def GCN_test(loader):
    model.eval()

    pred = []
    label = []
    loss_all = 0
    for data in loader:
        data = data.to(device)
        output = model(data)
        loss = func.cross_entropy(output, data.y)
        loss_all += data.num_graphs * loss.item()
        pred.append(func.softmax(output, dim=1).max(dim=1)[1])
        label.append(data.y)

    y_pred = torch.cat(pred, dim=0).cpu().detach().numpy()
    y_true = torch.cat(label, dim=0).cpu().detach().numpy()
    tn, fp, fn, tp = confusion_matrix(y_pred, y_true).ravel()
    epoch_sen = tp / (tp + fn)
    epoch_spe = tn / (tn + fp)
    epoch_acc = (tn + tp) / (tn + tp + fn + fp)
    return epoch_sen, epoch_spe, epoch_acc, loss_all / len(val_dataset)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(36100, 2, 3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
   
min_v_loss = np.inf    
for epoch in range(50):
    t_loss = GCN_train(train_data)
    test_sen, test_spe, test_acc, _ = GCN_test(test_loader)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (190x190 and 36100x128)