# Data Processing

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.transforms import RandomNodeSplit
from torch_geometric.utils import negative_sampling
import matplotlib.pyplot as plt

# Load the dataset
file_path = '/home/arifai/hussain/Race_pathways_signatures_meta_cleaned.csv'
data = pd.read_csv(file_path)

# Remove columns with repeated names
data = data.loc[:, ~data.columns.duplicated()]

# Assign binary labels based on race
data['label'] = data['Race'].apply(lambda x: 0 if x == "White (Non-Hispanic)" else 1 if x == "African American" else np.nan)

# Drop rows with NaN labels (if any)
data = data.dropna(subset=['label'])

# Columns to remove
columns_to_remove = [
    'pathgs', 'pathgs_p', 'pathgs_s', 'Race', 'svi', 'sm', 'lni', 'epe', 'avg.risk', 'Decipher', 'APF', 'age',
    'hallmark_adipogenesis', 'hallmark_allograft_rejection', 'hallmark_androgen_response', 'hallmark_angiogenesis',
    'hallmark_angiogenesis_Brauer2013', 'hallmark_angiogenesis_KeggVEGF', 'hallmark_angiogenesis_Liberzon2015',
    'hallmark_angiogenesis_Masiero2013', 'hallmark_angiogenesis_Nolan2013', 'hallmark_angiogenesis_Uhlik2016',
    'hallmark_apical_junction', 'hallmark_apical_surface', 'hallmark_apoptosis', 'hallmark_bile_acid_metabolism',
    'hallmark_cholesterol_homeostasis', 'hallmark_coagulation', 'hallmark_complement', 'hallmark_dna_repair',
    'hallmark_e2f_targets', 'hallmark_epithelial_mesenchymal_transition', 'hallmark_estrogen_response_early',
    'hallmark_estrogen_response_late', 'hallmark_fatty_acid_metabolism', 'hallmark_g2m_checkpoint', 'hallmark_glycolysis',
    'hallmark_hedgehog_signaling', 'hallmark_heme_metabolism', 'hallmark_hypoxia', 'hallmark_il2_stat5_signaling',
    'hallmark_il6_jak_stat3_signaling', 'hallmark_inflammatory_response', 'hallmark_interferon_alpha_response',
    'hallmark_interferon_gamma_response', 'hallmark_kras_signaling_dn', 'hallmark_kras_signaling_up', 'hallmark_mitotic_spindle',
    'hallmark_mtorc1_signaling', 'hallmark_myc_targets_v1', 'hallmark_myc_targets_v2', 'hallmark_myogenesis',
    'hallmark_notch_signaling', 'hallmark_oxidative_phosphorylation', 'hallmark_p53_pathway', 'hallmark_pancreas_beta_cells',
    'hallmark_peroxisome', 'hallmark_pi3k_akt_mtor_signaling', 'hallmark_protein_secretion', 'hallmark_reactive_oxigen_species_pathway',
    'hallmark_spermatogenesis', 'hallmark_tgf_beta_signaling', 'hallmark_tnfa_signaling_via_nfkb', 'hallmark_unfolded_protein_response',
    'hallmark_uv_response_dn', 'hallmark_uv_response_up', 'hallmark_wnt_beta_catenin_signaling', 'hallmark_xenobiotic_metabolism',
    'agell2012_1', 'cheville2008_1', 'cuzick2011_1', 'decipher_1', 'glinsky2005_1', 'klein2014_1', 'lapointe2004_1', 'larkin2012_1',
    'long2014_1', 'nakagawa2008_1', 'penney2011_1', 'ramaswamy2003_1', 'saal2007_1', 'singh2002_1', 'stephenson2005_1', 'talantov2010_1',
    'varambally2005_1', 'wu2013_1', 'yu2007_1'
]

# Remove specified columns
data = data.drop(columns=columns_to_remove)

# Drop non-numeric columns (assuming the first 3 columns are metadata)
gene_expression = data.iloc[:, 3:-1]  # Exclude the label column

# Handle missing values (e.g., fill with the mean of the column)
gene_expression = gene_expression.apply(pd.to_numeric, errors='coerce')
gene_expression = gene_expression.fillna(gene_expression.mean())

# Convert to numpy array
gene_expression = gene_expression.values

# Compute correlation matrix
correlation_matrix = np.corrcoef(gene_expression, rowvar=False)

# Define a threshold for creating edges
threshold = 0.7
edges = np.where(np.abs(correlation_matrix) > threshold)
edge_index = torch.tensor(edges, dtype=torch.long)

# Create node features (gene expression levels)
x = torch.tensor(gene_expression, dtype=torch.float)

# Use actual labels from the dataset
y = torch.tensor(data['label'].values, dtype=torch.long)

# Create PyG data object
data = Data(x=x, edge_index=edge_index, y=y)

# Manually split the data into training and test sets
num_train = 1000
num_test = 152

# Section 1 - Node Classification via GAT

In [3]:
# Create masks for training and test sets
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

train_mask[:num_train] = True
test_mask[num_train:num_train + num_test] = True

data.train_mask = train_mask
data.test_mask = test_mask

# Define GAT model for node classification
class GATNodeClassification(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_heads=8):
        super(GATNodeClassification, self).__init__()
        self.conv1 = GATConv(num_node_features, hidden_channels, heads=num_heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads, dropout=0.6)
        self.conv3 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads, dropout=0.6)
        self.conv4 = GATConv(hidden_channels * num_heads, hidden_channels, heads=num_heads, dropout=0.6)
        self.fc = torch.nn.Linear(hidden_channels * num_heads, 2)
        self.dropout = torch.nn.Dropout(p=0.5)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)
        x = self.conv3(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)
        x = self.conv4(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Initialize model, loss, and optimizer
model = GATNodeClassification(num_node_features=data.num_features, hidden_channels=256)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

# Training loop for node classification
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    scheduler.step()
    return loss.item()

# Evaluation for node classification
def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
        acc = int(correct) / int(data.test_mask.sum())
        return acc

# Store loss and accuracy for plotting
train_losses = []
test_accuracies = []

# Train the model
for epoch in range(200):
    loss = train()
    acc = test()
    train_losses.append(loss)
    test_accuracies.append(acc)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}, Accuracy: {acc}')

# Save the model
torch.save(model.state_dict(), '/home/arifai/hussain/gat_node_classification_model.pth')

# Load the model
model = GATNodeClassification(num_node_features=data.num_features, hidden_channels=256)
model.load_state_dict(torch.load('/home/arifai/hussain/gat_node_classification_model.pth'))
model.eval()

# Print the final test accuracy
final_test_accuracy = test()
print(f'Final Test Accuracy: {final_test_accuracy}')

# Plotting the results
epochs = range(200)

plt.figure(figsize=(12, 5))

# Plot Training Loss
plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.legend()

# Plot Test Accuracy
plt.subplot(1, 2, 2)
plt.plot(epochs, test_accuracies, label='Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Test Accuracy over Epochs')
plt.legend()

plt.tight_layout()
plt.savefig('/home/arifai/hussain/node_classification_loss_accuracy.png')
plt.show()


  edge_index = torch.tensor(edges, dtype=torch.long)


Epoch 0, Loss: 0.889835774898529, Accuracy: 0.0


KeyboardInterrupt: 

# Section 2 - Gene Interaction Network Visualization

In [4]:
import networkx as nx
import matplotlib.pyplot as plt
import torch_geometric
from torch_geometric.utils import to_networkx

# Use the preprocessed data from Section 1
# Compute correlation matrix
correlation_matrix = np.corrcoef(gene_expression, rowvar=False)

print(np.shape(correlation_matrix))

# Define a threshold for creating edges
threshold = 0.7
edges = np.where((np.abs(correlation_matrix) > threshold) & (np.abs(correlation_matrix != 1)))
edge_index = torch.tensor(edges, dtype=torch.long)

print(np.shape(edges))

# Create a NetworkX graph
G = nx.Graph()

# Add nodes (genes)
num_genes = correlation_matrix.shape[0]
G.add_nodes_from(range(num_genes))

# Add edges based on the correlation threshold
for i in range(edges[0].shape[0]):
    G.add_edge(edges[0][i], edges[1][i])

# Plot the graph
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, seed=42)  # Position nodes using the spring layout
nx.draw(G, pos, with_labels=True, node_size=50, node_color='skyblue', edge_color='gray', font_size=8)
plt.title('Gene Interaction Network')
plt.show()


KeyboardInterrupt: 

In [17]:
# Node Loss: measure of predicted and true labels (labels might be gene expressions. the model predicts 
#   the expression of a gene and compares it to the real value?) of nodes (cross-entropy loss, 0 to 1).
# Node Accuracy: % of correctly classified nodes based on race, how well it assigns race to each 
#   node. eval performance of node classification.
# Link Loss: measure of predicted and current existence of edges (edges are interactions between genes)
#   [cross-entropy loss, 0 to 1].
# Link Accuracy: % of correctly predicted edges. eval performance of link prediction.