In [9]:
from notebooks.util_classifier import MultiHomogeneousGraphTripletDataset
from training_classifier import *
from util_classifier import *
from util import plot_losses, save_training_results, save_dict_to_json
from gat_models import *

import os
import random
import numpy as np
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from torch_geometric.loader import DataLoader
from torch.optim import Adam
from torch.nn.modules.loss import TripletMarginLoss
import torch.nn as nn
from src.shared.database_wrapper import DatabaseWrapper
from src.shared.graph_schema import *
from src.shared.graph_sampling import GraphSampling

random.seed(40)
np.random.seed(40)
torch.manual_seed(40)
torch.cuda.manual_seed_all(40)

In [10]:
config = {
    'experiment': 'GAT Classifier Training',
    'max_hops': 2,
    'model_node_feature': 'feature_vec',  # Node feature to use for GAT encoder
    'hidden_channels': 64,
    'out_channels': 16,
    'num_heads': 8,
    'classifier_in_channels': 3 * 16,
    'classifier_hidden_channels': 64,
    'classifier_out_channels': 16,
    'classifier_dropout': 0.2,
    'margin': 1.0,
    'optimizer': 'Adam',
    'learning_rate': 0.005,
    'weight_decay': 5e-4,
    'num_epochs': 10,
    'batch_size': 32,
}

In [11]:
# Graph sampling configurations
node_properties = [
    'id',
    'feature_vec',
]

node_spec = [
    NodeType.PUBLICATION
]

edge_spec = [
    # EdgeType.SIM_TITLE,
    EdgeType.SIM_ABSTRACT,
    EdgeType.SIM_AUTHOR,
    EdgeType.SIM_ORG,
    #EdgeType.SAME_AUTHOR,
]

gat_list = {
    EdgeType.SIM_ABSTRACT: './data/results/homogeneous (abstract) full_emb linear_layer dropout baseline/gat_encoder.pt',
    EdgeType.SIM_AUTHOR: './data/results/homogeneous (similar co-authors) full_emb linear_layer dropout baseline/gat_encoder.pt',
    EdgeType.SIM_ORG: './data/results/homogeneous (org) full_emb linear_layer dropout baseline/gat_encoder.pt',
    #EdgeType.SAME_AUTHOR: './data/results/homogeneous (same author) full_emb linear_layer dropout/gat_encoder.pt'
}


database = 'small-graph'
gs = GraphSampling(
    node_spec=node_spec,
    edge_spec=edge_spec,
    node_properties=node_properties,
    database=database
)

# Model configurations
# Load pre-trained GAT encoders
gat_encoders = {}
for edge_key, gat_path in gat_list.items():
    gat_encoder = HomoGATv2Encoder(config['hidden_channels'], config['out_channels'], num_heads=config['num_heads']).to(device)
    gat_encoder.load_state_dict(torch.load(gat_path))
    gat_list[edge_key] = gat_encoder

loss_fn = TripletMarginLoss(margin=config['margin'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: Adjust result folder name!
result_folder_name = 'classifier full_emb no nn (abstract, org, sim_author edges)'
result_folder_path = f'./data/results/{result_folder_name}'
if not os.path.exists(result_folder_path):
    os.mkdir(result_folder_path)

Using default edge type: SimilarAbstract for homogeneous graph sampling.


## Embedding Network
**This network takes in the stacked GAT node embeddings and outputs a lower-dimensional embedding.**

In [12]:
class EmbeddingNet(nn.Module):
    def __init__(self, input_size, hidden_size = 128, embedding_size = 16, dropout = 0.2):
        super(EmbeddingNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, embedding_size),
        )
    
    def forward(self, x):
        x = self.fc(x)
        x = F.normalize(x, p=2, dim=-1)
        return x

## Triplet Network
**This network takes in three inputs: an anchor, a positive example, and a negative example. It outputs the embeddings of the three inputs. It is used to simplify triplet loss training**

In [13]:
class TripletNet(nn.Module):
    def __init__(self, embedding_net: EmbeddingNet, edge_spec: [EdgeType], gat_encoders: dict[EdgeType, nn.Module]):
        super(TripletNet, self).__init__()
        self.edge_spec = edge_spec
        self.gat_encoders = gat_encoders
        self.embedding_net = embedding_net
        """
        for gat in self.gat_encoders.values():
            gat.eval()
            for param in gat.parameters():
                param.requires_grad = False
        """

    def forward(self, data_dict: dict):
        anchor = []
        positive = []
        negative = []
        
        for edge_type in self.edge_spec:
            # Anchor node embedding for the edge type 
            anchor_graph = data_dict[edge_type][0]
            anchor_gat_emb = self.gat_encoders[edge_type](anchor_graph)
            anchor.append(anchor_gat_emb[anchor_graph.central_node_id])
            
            # Positive node embedding for the edge type
            positive_graph = data_dict[edge_type][1]
            positive_gat_emb = self.gat_encoders[edge_type](positive_graph)
            positive.append(positive_gat_emb[positive_graph.central_node_id])
            
            # Negative node embedding for the edge type
            negative_graph = data_dict[edge_type][2]
            negative_gat_emb = self.gat_encoders[edge_type](negative_graph)
            negative.append(negative_gat_emb[negative_graph.central_node_id])
            
        anchor = torch.cat(anchor, dim=1)
        positive = torch.cat(positive, dim=1)
        negative = torch.cat(negative, dim=1)
        
        #output_anchor = self.embedding_net(anchor)
        #output_positive = self.embedding_net(positive)
        #output_negative = self.embedding_net(negative)
        
        return anchor, positive, negative #output_anchor, output_positive, output_negative

    def get_embedding(self, x):
        return self.embedding_net(x)

## Pair Classifier
**This network will be used for the actual classification task (the AND pipeline).**

In [14]:
class PairClassifier(nn.Module):
    def __init__(self, embedding_net):
        super(PairClassifier, self).__init__()
        self.embedding_net = embedding_net

    def forward(self, embedding_1, embedding_2):
        out_1 = self.embedding_net(embedding_1)
        out_2 = self.embedding_net(embedding_2)
        
        # Compute probability of the pair being similar by computing pairwise distance
        distance = F.pairwise_distance(out_1, out_2)
        similarity_prediction = torch.sigmoid(-distance)
        
        return similarity_prediction

    def get_embedding(self, x):
        return self.embedding_net(x)

## Training

In [15]:
db = DatabaseWrapper(database=database)
data_harvester = ClassifierTripletDataHarvester(db=db, gs=gs, edge_spec=edge_spec, config=config, valid_triplets_save_file='valid_triplets_classifier_small_graph', transformer_model='sentence-transformers/all-MiniLM-L6-v2')


# Split the pairs into train and test

# Harvest the evaluation triplets first, since triplets are ordered by author. This will ensure that the evaluation set has authors not seen in the training set.

config['eval_size'] = len(data_harvester.triplets)

print(f"Eval size: {len(data_harvester.triplets)}")

# Create the datasets from the pairs (distinct pairs for training and testing)
eval_dataset = MultiHomogeneousGraphTripletDataset(data_harvester.triplets, gs, edge_spec=edge_spec, config=config)

# Create the DataLoader
eval_dataloader = DataLoader(eval_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=custom_triplet_collate)

# Create models
metadata = (
    [n.value for n in node_spec],
    [edge_pyg_key_vals[r] for r in edge_spec]
)
config['node_spec'] = metadata[0]
config['edge_spec'] = metadata[1]

# Embedding model
embedding_net = EmbeddingNet(
    input_size=config['classifier_in_channels'], 
    hidden_size=config['classifier_hidden_channels'],
    embedding_size=config['classifier_out_channels'],
    dropout=config['classifier_dropout']
).to(device)

# Triplet training classifier model
triplet_net = TripletNet(
    embedding_net=embedding_net,
    edge_spec=edge_spec,
    gat_encoders=gat_list
).to(device)

# Optimizer
optimizer = torch.optim.Adam(triplet_net.embedding_net.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

2024-12-09 17:17:14,709 - DatabaseWrapper - INFO - Connecting to the database ...
2024-12-09 17:17:14,709 - DatabaseWrapper - INFO - Database ready.


Preparing triplets...
Loading triplets...
Loaded 6284 triplets.
Eval size: 6284


In [16]:
from collections import defaultdict

num_epochs = config['num_epochs']
results = defaultdict(list)
    
test_loss, test_num_correct, test_correct_pos_val, test_correct_neg_val, test_precision, test_recall, test_F1 = test_and_eval(
model=triplet_net,
loss_fn=loss_fn,
dataloader=eval_dataloader,
margin=config['margin']
)
results['test_total_loss'].append(test_loss)
results['test_accuracies'].append(test_num_correct)
results['test_accuracies_correct_pos'].append(test_correct_pos_val)
results['test_accuracies_correct_neg'].append(test_correct_neg_val)
results['test_precision'].append(test_precision)
results['test_recall'].append(test_recall)
results['test_F1'].append(test_F1)

print(f"Eval Loss: {test_loss}")
print(f"Eval Accuracy: {test_num_correct}")
print(f"Eval Precision: {test_precision}")
print(f"Eval Recall: {test_recall}")
print(f"Eval F1: {test_F1}")
    
# Save config and training results
config['results'] = results
save_dict_to_json(config, result_folder_path + '/training_data.json')


        Correct positive: 202 (3.21%), Correct negative: 6134 (97.61%)
        Total correct: 6336 (50.41%)
        Test/Eval Loss: 0.9595, Test/Eval Accuracy: 0.5041
        Precision: 0.5739, Recall: 0.0321, F1: 0.0609
Eval Loss: 0.9595071994108597
Eval Accuracy: 0.5041374920432845
Eval Precision: 0.5738636363636364
Eval Recall: 0.03214513049013367
Eval F1: 0.06088004822182037
