In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
import networkx as nx
import pandas as pd
from karateclub import NetMF
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [16]:
# Load the saved array of matrices; later cells index it by subject along axis 0.
subject_fc_matrices = np.load(file='../source_data/fc/fc_matrices.npy')

In [17]:
# Number of entries along the first axis of the loaded array.
subject_fc_matrices.shape[0]

71

In [18]:
# NetMF embedding logic using karate club module
def netmf_embedding(graph, dimensions=16, order=2, seed=21):
    """Fit a karateclub ``NetMF`` model on ``graph`` and return its embedding.

    Args:
        graph: Graph object handed unchanged to ``NetMF.fit``.
        dimensions: Forwarded to ``NetMF(dimensions=...)``.
        order: Forwarded to ``NetMF(order=...)``.
        seed: Forwarded to ``NetMF(seed=...)``. Defaults to 21, the value that
            used to be hard-coded, so existing calls behave exactly as before.

    Returns:
        The result of ``get_embedding()`` on the fitted model.
    """
    model = NetMF(dimensions=dimensions, order=order, seed=seed)
    model.fit(graph)
    return model.get_embedding()

In [27]:
def create_data_object(sub_conn_matrix, label, dimensions, order, threshold=0.5):
    """Build a PyTorch Geometric ``Data`` graph from one connectivity matrix.

    The matrix is binarised (entries strictly greater than ``threshold`` become
    edges), NetMF embeddings of the resulting graph are used as node features,
    and the non-zero positions of the binary matrix become ``edge_index``.

    Args:
        sub_conn_matrix: Square 2-D array-like for a single subject.
        label: Class label stored on the result as ``y``.
        dimensions: Embedding size forwarded to ``netmf_embedding``.
        order: NetMF order forwarded to ``netmf_embedding``.
        threshold: Cut-off above which an entry is kept as an edge.

    Returns:
        A ``Data`` object with ``x`` (float32 node features), ``edge_index``
        (int64, two rows: source and target indices) and ``y``.

    Raises:
        ValueError: If ``sub_conn_matrix`` is not a square 2-D matrix.
    """
    sub_conn_matrix = np.asarray(sub_conn_matrix)
    if sub_conn_matrix.ndim != 2 or sub_conn_matrix.shape[0] != sub_conn_matrix.shape[1]:
        raise ValueError(
            f"sub_conn_matrix must be a square 2-D matrix, got shape {sub_conn_matrix.shape}"
        )

    adjacency_matrix = (sub_conn_matrix > threshold).astype(float)

    # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is
    # available in both 2.x and 3.x and accepts a plain ndarray.
    graph = nx.from_numpy_array(adjacency_matrix)
    netmf_embeddings = netmf_embedding(graph, dimensions, order)

    # Extract node features from the embeddings
    node_features = torch.tensor(netmf_embeddings, dtype=torch.float32)

    # nonzero() returns a tuple of index arrays; stack them into one (2, E)
    # array first so torch does not go through its slow sequence-of-ndarrays
    # conversion path.
    edge_index = torch.from_numpy(np.vstack(adjacency_matrix.nonzero())).to(torch.long)

    # Create PyTorch Geometric Data object (label stored as an integer tensor)
    data = Data(x=node_features, edge_index=edge_index, y=torch.tensor(label, dtype=torch.long))

    return data

In [20]:
# Load the participants table (tab-separated) into a DataFrame
df = pd.read_csv('../Full-Dataset/participants.tsv', delimiter='\t')

# Tally how many rows fall into each value of the 'group' column
group_counts = df['group'].value_counts()

# Report each group, printing 0 when a group is absent from the table
_group_headings = {
    'HC': "HC (Healthy Controls):",
    'AVH-': "AVH- (Auditory Verbal Hallucinations Negative):",
    'AVH+': "AVH+ (Auditory Verbal Hallucinations Positive):",
}
for _group_code, _heading in _group_headings.items():
    print(_heading, group_counts.get(_group_code, 0), "subjects")


HC (Healthy Controls): 25 subjects
AVH- (Auditory Verbal Hallucinations Negative): 23 subjects
AVH+ (Auditory Verbal Hallucinations Positive): 23 subjects


In [28]:
data_list = []

# Subject-index boundaries used to assign class labels: indices below the
# first boundary get 0, indices below the second get 1, everything else 2.
# NOTE(review): this relies on the matrices being stored grouped by class in
# that order. Presumably 0 = HC, 1 = AVH-, 2 = AVH+ (the resulting sizes
# 25/23/23 match the participants.tsv group counts) -- confirm that the
# ordering of fc_matrices.npy really follows this grouping.
first_group_end = 25
second_group_end = 48

label = [
    0 if subject_idx < first_group_end
    else 1 if subject_idx < second_group_end
    else 2
    for subject_idx in range(len(subject_fc_matrices))
]

all_labels = np.array(label)
# Find unique labels and their counts
unique_labels, counts = np.unique(all_labels, return_counts=True)
# Create a dictionary to store the counts for each unique label
label_counts = dict(zip(unique_labels, counts))
# Print the results. The loop variables are deliberately NOT called `label`,
# so the list built above is not overwritten by the last printed value.
for label_value, n_subjects in label_counts.items():
    print(f"Label {label_value}: {n_subjects} subjects")



Label 0: 25 subjects
Label 1: 23 subjects
Label 2: 23 subjects


In [29]:
# Build one Data object per subject. The list is assigned in a single
# statement rather than appended to, so re-running this cell does not
# duplicate entries.
data_list = [
    create_data_object(sub_conn_matrix, label=subject_label, dimensions=32, order=2, threshold=0.5)
    for sub_conn_matrix, subject_label in zip(subject_fc_matrices, all_labels)
]

In [24]:
# Split the data into train, validation, and test sets
test_size = 0.2
validation_size = 0.2
dimensions = 32
order = 2
threshold = 0.5
split_seed = 42  # shared by both splits so the partition is reproducible

# First, split off the test set. Stratify on the labels so that each class is
# represented proportionally; with a small multi-class dataset an unstratified
# split can leave a class badly under-represented in one of the subsets.
subject_fc_trainval, subject_fc_test, labels_trainval, labels_test = train_test_split(
    subject_fc_matrices, all_labels, test_size=test_size, random_state=split_seed,
    stratify=all_labels)
# Then, split the remainder into training and validation sets. The fraction is
# rescaled so the validation set is `validation_size` of the FULL dataset.
subject_fc_train, subject_fc_val, labels_train, labels_val = train_test_split(
    subject_fc_trainval, labels_trainval, test_size=validation_size / (1 - test_size),
    random_state=split_seed, stratify=labels_trainval)


def _build_graph_dataset(conn_matrices, subject_labels, dimensions, order, threshold):
    """Return one ``create_data_object`` result per (matrix, label) pair."""
    return [
        create_data_object(conn_matrix, subject_label, dimensions, order, threshold)
        for conn_matrix, subject_label in zip(conn_matrices, subject_labels)
    ]


# Create PyTorch Geometric Data objects for each set
data_train = _build_graph_dataset(subject_fc_train, labels_train, dimensions, order, threshold)
data_val = _build_graph_dataset(subject_fc_val, labels_val, dimensions, order, threshold)
data_test = _build_graph_dataset(subject_fc_test, labels_test, dimensions, order, threshold)

In [25]:
# Wrap each split in a DataLoader to handle batching
batch_size = 4  # Adjust according to your needs
loader_seed = 42

# Only the training loader shuffles. Give it an explicitly seeded generator so
# the batch order is reproducible after a kernel restart instead of depending
# on the global torch RNG state.
data_loader_train = DataLoader(
    data_train,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(loader_seed),
)
data_loader_val = DataLoader(data_val, batch_size=batch_size, shuffle=False)
data_loader_test = DataLoader(data_test, batch_size=batch_size, shuffle=False)


In [26]:
# Pull the first batch from the training loader and show its summary.
_train_batches = iter(data_loader_train)
batch_train = next(_train_batches)
print(batch_train)


DataBatch(x=[156, 32], edge_index=[2, 1282], y=[4], batch=[156], ptr=[5])
