In [1]:
import dataset
import importlib
importlib.reload(dataset)
from dataset import DatasetCreation

In [2]:
from torch_geometric.utils import to_undirected,add_self_loops
from torch_geometric.data import Data
import itertools
import torch
import numpy as np 

In [3]:
# Build the dataset: for every graph size listed below, `num_datasets` instances are
# requested from DatasetCreation.create_dataset (100 per size in this configuration).
# Graph sizes are varied so that the neural network can learn to generalize.
# Ideally there would be many graphs over many different sizes, but that becomes
# computationally expensive when resources are limited or the GNN has many parameters.
# Also, the bigger the graph, the longer concorde needs to solve it (NP-hard problem).

data_list = []

num_datasets = 100

for num_nodes in [5, 10, 12, 15]:
    node_coords, distance_matrices, solution_paths, solution_adjacencies, distances = DatasetCreation.create_dataset(
        num_nodes=num_nodes, num_datasets=num_datasets
    )

    # Positive-class weight used during training: ratio of negative to positive edge
    # labels. With num_nodes positives out of num_nodes**2 entries this equals
    # num_nodes - 1, so larger graphs get a larger weight.
    num_pos = num_nodes
    num_neg = num_nodes**2 - num_nodes
    weight_pos_class = num_neg / num_pos

    # Every ordered node pair (i, j), self-pairs included, laid out as a (2, num_nodes**2)
    # tensor. The same edge_index object is shared by all graphs of this size.
    node_pairs = list(itertools.product(range(num_nodes), repeat=2))
    edge_index = torch.tensor(node_pairs, dtype=torch.long).t().contiguous()

    for idx, coords in enumerate(node_coords):
        # Flattened distance matrix -> one feature per entry of edge_index, shape (E, 1).
        edge_attr = torch.tensor(distance_matrices[idx].flatten()).float().unsqueeze(1)

        node_features = torch.tensor(coords).float()
        # Flattened solution adjacency -> one label per entry of edge_index, shape (E, 1).
        edge_labels = torch.tensor(solution_adjacencies[idx].flatten()).float().unsqueeze(1)

        graph = Data(x=node_features, edge_index=edge_index, y=edge_labels, edge_attr=edge_attr)
        # edge_weight holds the same values as edge_attr, stored as a separate tensor.
        graph.edge_weight = torch.tensor(distance_matrices[idx].flatten()).float().unsqueeze(1)
        # torch.Tensor(...) builds float32 tensors, so the path indices are stored as floats.
        graph.true_path = torch.Tensor(solution_paths[idx])
        graph.true_distance = torch.Tensor([distances[idx]]).unsqueeze(1)
        graph.num_nodes = num_nodes
        graph.pos_class_weight = weight_pos_class
        data_list.append(graph)

In [4]:
# Show one sample graph (index 13) so the stored fields can be inspected by eye.
ejemplo = data_list[13]

# (heading, value) pairs printed in order; each heading is followed by its value.
report_sections = [
    ("Nodos del problema:", ejemplo.x.tolist()),
    ("\nAristas por donde pasa el tour (1 si para , 0 si no pasa):", ejemplo.y.tolist()),
    ("\nCamino más corto", ejemplo.true_path.tolist()),
    ("\nDistancia total del tour:", ejemplo.true_distance.tolist()),
    ("\nPeso para la red neuronal:", ejemplo.pos_class_weight),
]

for heading, value in report_sections:
    print(heading)
    print(value)

Nodos del problema:
[[89.93830871582031, 44.39252471923828], [29.364057540893555, 99.97833251953125], [90.78205871582031, 55.61297607421875], [90.71626281738281, 31.229068756103516], [38.70973205566406, 92.18704223632812]]

Aristas por donde pasa el tour (1 si para , 0 si no pasa):
[[0.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [1.0], [1.0], [0.0], [0.0], [0.0], [0.0], [0.0], [1.0], [0.0], [0.0], [0.0]]

Camino más corto
[0.0, 2.0, 4.0, 1.0, 3.0]

Distancia total del tour:
[[192.38340759277344]]

Peso para la red neuronal:
4.0


In [5]:
import solutions
# Reload the module imported on the line above so that edits to solutions.py are
# picked up without restarting the kernel. (Previously this reloaded `dataset`,
# which left `solutions` stale and fails once `dataset` no longer names a module.)
importlib.reload(solutions)
from solutions import SolutionAnalysys

In [6]:
# Split the graphs into train / validation / test subsets (70% / 10% / remainder).

from torch_geometric.loader import DataLoader
import random

# Fixed seed: for the same data_list, the same graphs end up in the same subset
# on every run, which keeps validation/test results comparable between runs.
SPLIT_SEED = 42

total_length = len(data_list)

# Subset sizes; int() truncates, and everything left after train + val goes to test.
train_length = int(total_length * 0.7)
val_length = int(total_length * 0.1)

# A private Random instance makes the shuffle reproducible without touching the
# global `random` state. sample() returns a new list; data_list keeps its order.
shuffled_list = random.Random(SPLIT_SEED).sample(data_list, len(data_list))

train_list = shuffled_list[:train_length]
val_list = shuffled_list[train_length:train_length + val_length]
test_list = shuffled_list[train_length + val_length:]

In [7]:
# To train the GNN with graphs of different sizes, batches are built so that each
# batch only contains graphs with the same number of nodes.
# (Not sure there is another way for this)

max_batch_size = 256  # Upper bound on the total number of NODES in a batch (not graphs)
train_batched_data = []
train_batch_nodes = 0  # running node total of the batch currently being filled

# Sorting by node count puts same-sized graphs next to each other.
sorted_datasets = sorted(train_list, key=lambda x: x.num_nodes)

# The loop variable is deliberately not named `dataset`: that name is the imported
# `dataset` module, and rebinding it breaks any later importlib.reload(dataset).
for graph in sorted_datasets:
    same_size = bool(train_batched_data) and train_batched_data[-1][-1].num_nodes == graph.num_nodes
    if same_size and train_batch_nodes + graph.num_nodes <= max_batch_size:
        # Same graph size and still within the node budget: extend the open batch.
        train_batched_data[-1].append(graph)
        train_batch_nodes += graph.num_nodes
    else:
        # First graph, a new graph size, or the node budget would be exceeded:
        # open a new batch (a single graph always gets a batch, whatever its size).
        train_batched_data.append([graph])
        train_batch_nodes = graph.num_nodes

print(len(train_batched_data))

14


In [None]:
# Group the validation graphs into batches of same-sized graphs, capped at
# max_batch_size total nodes per batch (max_batch_size is set in the training cell).
val_batched_data = []
val_batch_nodes = 0  # running node total of the batch currently being filled

# Sorting by node count puts same-sized graphs next to each other.
sorted_datasets = sorted(val_list, key=lambda x: x.num_nodes)

# The loop variable is deliberately not named `dataset`: that name is the imported
# `dataset` module, and rebinding it breaks any later importlib.reload(dataset).
for graph in sorted_datasets:
    same_size = bool(val_batched_data) and val_batched_data[-1][-1].num_nodes == graph.num_nodes
    if same_size and val_batch_nodes + graph.num_nodes <= max_batch_size:
        # Same graph size and still within the node budget: extend the open batch.
        val_batched_data[-1].append(graph)
        val_batch_nodes += graph.num_nodes
    else:
        # First graph, a new graph size, or the node budget would be exceeded:
        # open a new batch.
        val_batched_data.append([graph])
        val_batch_nodes = graph.num_nodes

In [9]:
# Group the test graphs into batches of same-sized graphs, capped at
# max_batch_size total nodes per batch (max_batch_size is set in the training cell).
test_batched_data = []
test_batch_nodes = 0  # running node total of the batch currently being filled

# Sorting by node count puts same-sized graphs next to each other.
sorted_datasets = sorted(test_list, key=lambda x: x.num_nodes)

# The loop variable is deliberately not named `dataset`: that name is the imported
# `dataset` module, and rebinding it breaks any later importlib.reload(dataset).
for graph in sorted_datasets:
    same_size = bool(test_batched_data) and test_batched_data[-1][-1].num_nodes == graph.num_nodes
    if same_size and test_batch_nodes + graph.num_nodes <= max_batch_size:
        # Same graph size and still within the node budget: extend the open batch.
        test_batched_data[-1].append(graph)
        test_batch_nodes += graph.num_nodes
    else:
        # First graph, a new graph size, or the node budget would be exceeded:
        # open a new batch.
        test_batched_data.append([graph])
        test_batch_nodes = graph.num_nodes


In [None]:
# Create one loader per split.

from torch_geometric.loader import DataLoader

# batch_size=None turns off the loader's automatic batching: every element of the
# *_batched_data lists is already a list of same-sized graphs.
# NOTE(review): presumably each such list is collated into a single batch object
# by the loader's collate function; confirm against the installed PyG version.
train_loader, val_loader, test_loader = (
    DataLoader(prebuilt_batches, batch_size=None, shuffle=True)
    for prebuilt_batches in (train_batched_data, val_batched_data, test_batched_data)
)