In [85]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn import preprocessing
import random

import itertools
import os
import re
import math
import json

import torch
from torch_geometric.data import HeteroData, Data, Batch
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_networkx, subgraph, degree, from_networkx
from torch.utils.data import Dataset, dataloader
import torch_geometric.transforms as T

from torch_geometric.nn import GCNConv, summary, GraphSAGE
from torch.nn import Sequential, Linear, ReLU, Dropout
from sklearn.model_selection import ParameterGrid
from torch_geometric.nn import global_add_pool, global_mean_pool
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.utils import class_weight

import community.community_louvain as community_louvain  # python-louvain

# visual
import networkx as nx
import matplotlib.pyplot as plt
from tabulate import tabulate

In [86]:
# Prefer CUDA, then the MPS backend when this torch build exposes it, otherwise the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)

#device = torch.device('cpu')
print(device)

cuda


In [87]:
# `os` is already imported in the notebook's import cell, so it is not re-imported here.
# NOTE(review): the printed value is an absolute local path; consider clearing this
# output before sharing the notebook.
cwd = os.getcwd()
print(cwd)

d:\Work\UQAM\Doctorat\Projets\oignion_GNN\cultures_GNN\carotte\Automne_2023\Script\graph_masking


In [88]:
# Input files are read from the Output directory two levels above this notebook.
# Both CSVs use their first column as the index.
plant_distance_filepath = "../../Output/field_distance.txt"
carrot_df, meteo_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../../Output/carrot_no_sensitive_data.csv', '../../Output/combined_daily_meteo.csv')
)

In [89]:
# Read the plant-distance file into memory, one right-stripped string per line.
with open(plant_distance_filepath) as file:
    distance_txt_file = list(map(str.rstrip, file))

In [90]:
# Print every column of the weather table, then the number of columns.
for i in meteo_df.columns:
  print(i)

print(len(meteo_df.columns))

FarmID
Date
Day_avg_Temp_C
Day_max_Temp_C
Day_min_Temp_C
Day_less_5_Temp_C
Day_less_13_Temp_C
Day_more_30_Temp_C
Day_more_35_Temp_C
Day_15_25_Temp_C
Day_18_25_Temp_C
Day_18_30_Temp_C
Day_avg_RH
Day_max_RH
Day_min_RH
Day_more_95_RH
Day_70_85_RH
Day_70_95_RH
Day_sum_Rain
Day_more_0_Rain
Night_avg_Temp_C
Night_max_Temp_C
Night_min_Temp_C
Night_less_5_Temp_C
Night_less_13_Temp_C
Night_more_30_Temp_C
Night_more_35_Temp_C
Night_between_15_25_Temp_C
Night_between_18_25_Temp_C
Night_between_18_30_Temp_C
Night_avg_RH
Night_max_RH
Night_min_RH
Night_more_95_RH
Night_between_70_85_RH
Night_between_70_95_RH
Night_sum_Rain
Night_more_0_Rain
Quot_avg_Temp_C
Quot_max_Temp_C
Quot_min_Temp_C
Quot_less_5_Temp_C
Quot_less_13_Temp_C
Quot_more_30_Temp_C
Quot_more_35_Temp_C
Quot_between_15_25_Temp_C
Quot_between_18_25_Temp_C
Quot_between_18_30_Temp_C
Quot_avg_RH
Quot_max_RH
Quot_min_RH
Quot_more_95_RH
Quot_between_70_85_RH
Quot_between_70_95_RH
Quot_avg_DewPoint_C
Quot_avg_SolarRadiation_W_m2
Quot_sum_Rain


In [91]:
# Align the sampling-date column name with the weather table so both can be joined on
# 'Date'. Reassigning (rather than inplace=True) keeps the cell free of hidden mutation.
carrot_df = carrot_df.rename(columns={'SampleDate': 'Date'})

In [92]:
# Discard the FarmID 0 records with a boolean mask. Dropping by index label would also
# remove unrelated rows that happen to share an index label with a FarmID 0 row.
# `.copy()` gives an independent frame so later `.loc` writes do not warn.
carrot_df = carrot_df[carrot_df['FarmID'] != 0].copy()

In [93]:
# Distribution of the raw disease score before it is binarised.
carrot_df.loc[:, 'cote_c_carotae'].value_counts()

0    276
1    218
2      4
3      1
Name: cote_c_carotae, dtype: int64

In [94]:
# Collapse every score of 1 or more into the single positive class 1.
carrot_df['cote_c_carotae'] = carrot_df['cote_c_carotae'].clip(upper=1)
carrot_df['cote_c_carotae'].value_counts()

0    276
1    223
Name: cote_c_carotae, dtype: int64

In [95]:
# Balanced class weights computed from the binarised `cote_c_carotae` labels.
unique_classes = np.unique(carrot_df['cote_c_carotae'])
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=unique_classes, y=carrot_df['cote_c_carotae'])

# Build the float64 tensor directly: going through float32 first and then calling
# .double() would throw away precision. The tensor lives on the selected `device`.
class_weights = torch.tensor(class_weights, dtype=torch.float64).to(device)

print(class_weights)

tensor([0.9040, 1.1188], device='cuda:0', dtype=torch.float64)


In [96]:
# Select the weather rows whose date matches a day on which samples were taken.
# NOTE(review): `unique_sample_date` first holds the array of dates and is then rebound
# to the filtered weather frame; `meteo_df` itself is left unchanged by this cell.
unique_sample_date = carrot_df['Date'].unique()
print(unique_sample_date)
unique_sample_date = meteo_df.loc[meteo_df['Date'].isin(unique_sample_date)]

unique_sample_date.head()

[202 204 207 209 214 216 222 225 237 243 244 251 253 259 260 265 266 271
 272]


Unnamed: 0,FarmID,Date,Day_avg_Temp_C,Day_max_Temp_C,Day_min_Temp_C,Day_less_5_Temp_C,Day_less_13_Temp_C,Day_more_30_Temp_C,Day_more_35_Temp_C,Day_15_25_Temp_C,...,Rolling_Quot_RH_max_14D,Rolling_Quot_DewPoint_sum_14D,Rolling_Quot_SolarRadiation_sum_14D,Rolling_Quot_SolarRadiation_mean_14D,Rolling_Quot_Rain_sum_14D,Rolling_Quot_Rain_mean_14D,Rolling_Quot_more_0_Rain_14D,Rolling_Quot_WindSpeed_mean_14D,Rolling_Quot_GustSpeed_mean_14D,Rolling_DegresJours_sum_14D
37,0,202,19.633077,23.93,15.27,0,0,0,0,13,...,99.8,241.599583,2473.5,176.678571,47.2,3.371429,2.071429,0.449405,2.42381,232.5
39,0,204,22.406923,26.11,16.18,0,0,0,0,9,...,99.8,239.610833,2842.041667,203.002976,28.2,2.014286,1.285714,0.552083,2.609821,239.375
42,0,207,24.604615,28.1,16.75,0,0,0,0,6,...,99.8,243.713333,2762.166667,197.297619,39.6,2.828571,1.642857,0.703869,3.064286,244.495
44,0,209,21.417692,25.26,15.08,0,0,0,0,12,...,99.8,234.027917,2749.791667,196.41369,39.4,2.814286,1.642857,0.574405,2.685714,235.605
49,0,214,20.373077,23.62,14.96,0,0,0,0,12,...,99.8,206.191667,2762.833333,197.345238,45.2,3.228571,1.857143,0.741071,3.085119,212.38


In [97]:
# Attach the weather record of the matching farm and date to every plant observation.
combined_df = pd.merge(carrot_df, meteo_df, on=['FarmID', 'Date'])

# Keep a handle on the target column (`.get` yields None if the column is absent).
df_squamosa = combined_df.get('cote_c_carotae')

# The two incidence columns are removed from the combined table.
combined_df = combined_df.drop(columns=['incidence_a_dauci', 'incidence_s_sclerotiorum'])

combined_df

Unnamed: 0,FarmID,GreenLeavesNum_carrots,Plant_ID,cote_c_carotae,Date,carrot_stage,Day_avg_Temp_C,Day_max_Temp_C,Day_min_Temp_C,Day_less_5_Temp_C,...,Rolling_Quot_RH_max_14D,Rolling_Quot_DewPoint_sum_14D,Rolling_Quot_SolarRadiation_sum_14D,Rolling_Quot_SolarRadiation_mean_14D,Rolling_Quot_Rain_sum_14D,Rolling_Quot_Rain_mean_14D,Rolling_Quot_more_0_Rain_14D,Rolling_Quot_WindSpeed_mean_14D,Rolling_Quot_GustSpeed_mean_14D,Rolling_DegresJours_sum_14D
0,2,4,1,0,202,0,18.976154,23.16,15.84,0,...,99.9,244.769167,2526.625,180.473214,59.6,4.257143,2.357143,0.747024,3.088095,229.010
1,2,4,2,0,202,0,18.976154,23.16,15.84,0,...,99.9,244.769167,2526.625,180.473214,59.6,4.257143,2.357143,0.747024,3.088095,229.010
2,2,4,3,0,202,0,18.976154,23.16,15.84,0,...,99.9,244.769167,2526.625,180.473214,59.6,4.257143,2.357143,0.747024,3.088095,229.010
3,2,4,4,0,202,0,18.976154,23.16,15.84,0,...,99.9,244.769167,2526.625,180.473214,59.6,4.257143,2.357143,0.747024,3.088095,229.010
4,2,4,5,0,202,0,18.976154,23.16,15.84,0,...,99.9,244.769167,2526.625,180.473214,59.6,4.257143,2.357143,0.747024,3.088095,229.010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,2,6,21,1,260,2,22.380000,25.36,15.34,0,...,99.1,200.061250,2111.625,150.830357,26.2,1.871429,1.500000,0.962798,3.867560,186.985
395,2,6,22,1,260,2,22.380000,25.36,15.34,0,...,99.1,200.061250,2111.625,150.830357,26.2,1.871429,1.500000,0.962798,3.867560,186.985
396,2,6,23,1,260,2,22.380000,25.36,15.34,0,...,99.1,200.061250,2111.625,150.830357,26.2,1.871429,1.500000,0.962798,3.867560,186.985
397,2,6,24,1,260,2,22.380000,25.36,15.34,0,...,99.1,200.061250,2111.625,150.830357,26.2,1.871429,1.500000,0.962798,3.867560,186.985


In [98]:
# explore the directory containing the datasets vars and collect the dataset names in a list

# assign directory
directory = '../datasets_vars/'
datasets = []

# os.listdir returns entries in arbitrary order, so sort them (case-insensitively) to
# make the order of `datasets` reproducible across platforms and runs.
for filename in sorted(os.listdir(directory), key=str.lower):
    file_path = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(file_path):
        # the dataset name is the file name without its extension
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        print(file_name)
        datasets.append(file_name)

print(f'{len(datasets)} datasets appended')

base
Botcast
Clarisse_dataset_1
Clarisse_dataset_2
Clarisse_dataset_3
Clarisse_dataset_4
f_classif_10
f_classif_20
f_classif_5
lasso_alpha0.005
lasso_alpha0.01
lasso_alpha0.02
MultiSURF10
MultiSURF20
MultiSURF5
mutual_info_class_10
mutual_info_class_20
mutual_info_class_5
ReliefF10
ReliefF20
ReliefF5
Steentjes
22 datasets appended


In [99]:
def extract_sub_df(combined_df, vars):
    """Return all rows of ``combined_df`` restricted to the column selection ``vars``.

    ``vars`` is handed to ``.loc`` as the column indexer, so it may be a single label,
    a list of labels, or any other selector ``.loc`` accepts.
    """
    every_row = slice(None)
    return combined_df.loc[every_row, vars]

In [100]:
def normalize(sub_df, exception_list):
    """Min-max scale every column of ``sub_df`` that is not named in ``exception_list``.

    Args:
        sub_df: DataFrame whose columns should be rescaled.
        exception_list: Column names to leave untouched.

    Returns:
        A new DataFrame with the selected columns replaced by the output of a freshly
        fitted ``MinMaxScaler``. The input frame is not modified.
    """
    # Work on a copy: the input is typically a selection from a larger frame, and
    # writing into it mutates the caller's data and triggers SettingWithCopyWarning.
    normalized_df = sub_df.copy()
    columns_to_normalize = [col for col in normalized_df.columns if col not in exception_list]

    # With no column or no row to scale there is nothing to fit; return the copy as-is
    # instead of letting the scaler raise on an empty selection.
    if not columns_to_normalize or len(normalized_df) == 0:
        return normalized_df

    # Normalize the selected columns
    min_max_scaler = preprocessing.MinMaxScaler()
    normalized_df[columns_to_normalize] = min_max_scaler.fit_transform(normalized_df[columns_to_normalize])
    return normalized_df

In [101]:
def get_plant_distances(fieldID, threshold, distance_txt_file, valid_ids):
    """Collect the plant pairs of one field that lie within ``threshold`` of each other.

    Args:
        fieldID: Farm/field identifier to select; compared with the first CSV field.
        threshold: Maximum distance (inclusive) for a pair to be kept.
        distance_txt_file: Iterable of text lines shaped ``farm,plant1,plant2,distance``.
        valid_ids: Plant IDs observed for the day being processed.

    Returns:
        Two parallel lists ``(plant1_ids, plant2_ids)`` describing the retained pairs.
        Both are empty when ``valid_ids`` is empty.
    """
    array1 = []
    array2 = []

    # Loop-invariant work hoisted out of the line loop: a set gives O(1) membership
    # tests and the maximum valid ID is computed once instead of once per line.
    valid_id_set = set(valid_ids)
    if not valid_id_set:
        return array1, array2
    max_valid_id = max(valid_id_set)

    for line in distance_txt_file:
        # Blank lines (e.g. a trailing newline in the file) carry no record.
        if not line.strip():
            continue

        farmID, plant1, plant2, dist = line.split(',', 3)

        # conditions: find the right field, plants are close enough together
        if int(farmID) != fieldID:
            continue
        if not float(dist) <= threshold:
            continue

        first_plant, second_plant = int(plant1), int(plant2)

        # Both plants must have been observed (avoids errors on partial observation
        # days), and pairs touching the highest observed ID are dropped, as in the
        # original rule that removes the last overflowing node.
        if (first_plant in valid_id_set and
                second_plant in valid_id_set and
                max(first_plant, second_plant) < max_valid_id):
            array1.append(first_plant)
            array2.append(second_plant)

    return array1, array2

In [102]:
def extract_MetaData(carrot_df):
    """Return ``(farm_ids, dates)``: the distinct 'FarmID' and 'Date' values of the frame."""
    return tuple(carrot_df[column].unique() for column in ('FarmID', 'Date'))

In [103]:
def clean_ID_sparse(day_farm_data_subset, ID_column, donor, receiver):
    """Re-number sparse node IDs so they fill the range ``0 .. n_rows - 1``.

    Every ID that is ``>= n_rows`` is moved onto an unused ID below ``n_rows``. Gaps and
    out-of-range IDs are paired in order of appearance, and the same renaming is applied
    to the edge endpoint lists.

    Args:
        day_farm_data_subset: DataFrame holding one row per node.
        ID_column: Name of the column containing the node IDs.
        donor: Edge source IDs.
        receiver: Edge target IDs.

    Returns:
        ``(frame, donor, receiver)``. When a renaming is needed the frame is a modified
        copy and the lists are new lists; the inputs are never mutated. When no ID is
        missing, the inputs are returned unchanged.
    """
    node_id_list = day_farm_data_subset[ID_column].tolist()
    max_index = len(node_id_list)

    # Set membership keeps the gap search linear in the number of nodes.
    present_ids = set(node_id_list)
    gaps = [i for i in range(max_index) if i not in present_ids]

    if not gaps:
        return day_farm_data_subset, donor, receiver

    id_to_replace = [i for i in node_id_list if i >= max_index]

    # Pair each out-of-range ID with a free slot. zip stops at the shorter list, so a
    # frame with repeated IDs (more gaps than IDs to move) no longer raises IndexError;
    # setdefault keeps the first mapping when an out-of-range ID is repeated.
    remap = {}
    for old_id, new_id in zip(id_to_replace, gaps):
        remap.setdefault(old_id, new_id)

    # Write into a copy so the caller's frame (often a slice of a larger one) is left
    # untouched and pandas does not emit SettingWithCopyWarning.
    day_farm_data_subset = day_farm_data_subset.copy()
    day_farm_data_subset[ID_column] = [remap.get(i, i) for i in node_id_list]

    donor = [remap.get(x, x) for x in donor]
    receiver = [remap.get(x, x) for x in receiver]

    return day_farm_data_subset, donor, receiver

# Approche par seuil coulissant
-----

In [104]:
def create_graph(donor, receiver, day_farm_data_subset, squamosa_rate_label, train_masks, valid_masks, test_masks, plant_tags):
    """Assemble a PyG ``Data`` graph from an edge list, node features, labels and masks.

    Args:
        donor: Source node index of every edge.
        receiver: Target node index of every edge (same length as ``donor``).
        day_farm_data_subset: DataFrame with one row per node. The columns 'FarmID',
            'Plant_ID', 'Date' and 'cote_c_carotae' are dropped; the rest become features.
        squamosa_rate_label: Per-node class labels, converted to an integer tensor.
        train_masks, valid_masks, test_masks: Per-node mask values.
        plant_tags: Per-node tag values.

    Returns:
        ``(graph, n_edges)``. ``n_edges`` is the number of input edges, counted before
        the graph is made undirected and self-loops are added.
    """
    from_plant1 = torch.tensor(np.array(donor), dtype = int)
    to_plant2 = torch.tensor(np.array(receiver), dtype = int)

    # Row 0 holds the sources and row 1 the targets. torch.stack yields a well-formed
    # (2, 0) tensor for an empty edge list, where concat + reshape(-1, 0) would raise.
    plant_edges = torch.stack((from_plant1, to_plant2)).long()

    #remove vars we dont want in the graph (the metadata)
    day_farm_data_clean = day_farm_data_subset.drop(['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae'], axis=1)

    #convert class values to tensor
    squamosa_rate_label = torch.tensor(squamosa_rate_label, dtype = int)

    # create plant node as tensor
    plants_tensor = torch.tensor(np.array(day_farm_data_clean), dtype = float)

    graph = Data(x=plants_tensor, edge_index=plant_edges, y=squamosa_rate_label)

    # edges are mirrored so they go both ways, then every node receives a self-loop
    transform = T.Compose([T.ToUndirected(), T.AddSelfLoops()])
    graph = transform(graph)

    # masks and tags are stored as float tensors (torch.Tensor default dtype)
    graph.train_mask = torch.Tensor(train_masks)
    graph.val_mask = torch.Tensor(valid_masks)
    graph.test_mask = torch.Tensor(test_masks)
    graph.plant_tags = torch.Tensor(plant_tags)

    return graph, plant_edges.size(1)

In [105]:
def build_HGNN_aug_sliding_threshold(sub_df_norm, masking_data, distance_txt_file, minimum_distance, maximum_distance, step):
    """Build graphs for every (farm, date) pair while sliding the edge-distance threshold.

    For each farm and date with data, a reference graph is built at the median of
    ``minimum_distance`` and ``maximum_distance``. Further graphs are then built for
    thresholds below and above the median and kept as augmentations when their edge
    count stays within 50%-150% of the reference graph's edge count.

    Args:
        sub_df_norm: Frame with 'FarmID', 'Date', 'Plant_ID', 'cote_c_carotae' and the
            feature columns.
        masking_data: Passed to ``create_masks`` together with the farm ID.
        distance_txt_file: Lines of ``farm,plant1,plant2,distance`` records.
        minimum_distance, maximum_distance, step: Integer bounds and stride of the
            threshold sweep.

    Returns:
        ``(graphs, unique_farms_idx, aug_proof_concept)``: the list of kept graphs, the
        first graph built for each farm, and for the first processed farm/date a dict
        mapping each threshold to the graph built at that threshold (visualisation only).
    """
    farm_ids, dates = extract_MetaData(sub_df_norm)

    # list to store the graph for each farm and date
    graphs = []
    unique_farms_idx = {}
    aug_proof_concept = {}
    aug_proof_concept_iter = -1

    median_distance_threshold = int(round((minimum_distance + maximum_distance) / 2))

    # loop through each farm and date
    for (farm_id, date) in itertools.product(farm_ids, dates):

        train_masks, valid_masks, test_masks, plant_tags = create_masks(masking_data, farm_id)

        day_farm_data = sub_df_norm[(sub_df_norm['FarmID'] == farm_id) & (sub_df_norm['Date'] == date)]
        if day_farm_data.size == 0:
            continue
        aug_proof_concept_iter += 1 # only for visualisation purposes

        # Plant IDs index the mask lists with a 1-based offset.
        plant_ids = day_farm_data['Plant_ID'].tolist()
        train_masks_subgraph = [train_masks[i - 1] for i in plant_ids]
        valid_masks_subgraph = [valid_masks[i - 1] for i in plant_ids]
        test_masks_subgraph = [test_masks[i - 1] for i in plant_ids]
        plant_tags_subgraph = [plant_tags[i - 1] for i in plant_ids]

        ids_are_sparse = len(plant_ids) != max(plant_ids)

        # round the score to the nearest integer; shared by every graph of this day
        squamosa_rate_label = np.rint(day_farm_data['cote_c_carotae'].to_numpy())

        def edges_at(threshold):
            """Edge lists for ``threshold``, re-indexed when the day's plant IDs are sparse."""
            # Always filter against the ORIGINAL plant IDs and remap afterwards, so every
            # threshold goes through the same procedure as the median graph. (Previously
            # the augmented thresholds were filtered against already-remapped IDs and
            # their edges were never remapped.)
            donor, receiver = get_plant_distances(farm_id, threshold, distance_txt_file, plant_ids)
            if ids_are_sparse:
                # A fresh copy is passed so the day's frame keeps its original IDs.
                _, donor, receiver = clean_ID_sparse(day_farm_data.copy(), "Plant_ID", donor, receiver)
            return donor, receiver

        def graph_from(donor, receiver):
            """Build the PyG graph of this day for the given edge lists."""
            return create_graph(donor, receiver, day_farm_data, squamosa_rate_label,
                                train_masks_subgraph, valid_masks_subgraph,
                                test_masks_subgraph, plant_tags_subgraph)

        def remember(threshold, built_graph):
            """Store graphs that are kept for visualisation purposes only."""
            if farm_id not in unique_farms_idx:
                unique_farms_idx[farm_id] = built_graph
            if aug_proof_concept_iter == 0:
                aug_proof_concept[threshold] = built_graph

        # The median-threshold graph is not an augmentation: it is always kept.
        donor, receiver = edges_at(median_distance_threshold)
        graph, median_graph_edges = graph_from(donor, receiver)
        graphs.append(graph)
        remember(median_distance_threshold, graph)

        # Thresholds below the median: keep graphs retaining at least half the edges.
        for dist_threshold in range(minimum_distance, median_distance_threshold, step):
            donor, receiver = edges_at(dist_threshold)
            if len(donor) == 0:
                # No graph exists for this threshold; skip it instead of re-recording
                # the graph built for a previous threshold.
                continue

            graph, nbr_edges = graph_from(donor, receiver)
            if nbr_edges >= 0.5 * median_graph_edges:
                graphs.append(graph)
            remember(dist_threshold, graph)

        # Thresholds above the median: keep graphs with at most 1.5x the edges.
        for dist_threshold in range(median_distance_threshold + 1, maximum_distance + 1, step):
            donor, receiver = edges_at(dist_threshold)

            graph, nbr_edges = graph_from(donor, receiver)
            if nbr_edges <= 1.5 * median_graph_edges:
                graphs.append(graph)
            remember(dist_threshold, graph)

    return (graphs, unique_farms_idx, aug_proof_concept)

# Approche Louvain
-----

In [106]:
def find_communities(graph, min_threshold=0.1):
    """Partition ``graph`` with Louvain and fold undersized communities into others.

    A community holding fewer than ``min_threshold * number_of_nodes`` nodes is merged
    into the community of one of its randomly chosen outside neighbours, or into a
    randomly chosen other community when it has no outside neighbour.

    Args:
        graph: Graph exposing ``nodes`` and ``neighbors(node)`` (a networkx graph).
        min_threshold: Minimum community size as a fraction of the node count.

    Returns:
        A list of node lists, one per remaining community.
    """
    total_nodes = len(graph.nodes)
    # Apply Louvain community detection
    partition = community_louvain.best_partition(graph)

    # Group the nodes by community id
    communities = {}
    for node, community_id in partition.items():
        communities.setdefault(community_id, []).append(node)

    # Merge small communities or randomly assign nodes to a neighboring community.
    # Iterate over a snapshot because communities are deleted along the way; the node
    # lists are shared with `communities`, so sizes reflect earlier merges.
    for community_id, nodes in list(communities.items()):
        if len(nodes) >= min_threshold * total_nodes:
            continue

        neighbors = set()
        for node in nodes:
            neighbors.update(set(graph.neighbors(node)))

        # Filter out nodes already in the community
        neighbors = neighbors - set(nodes)

        if len(neighbors) > 0:
            target_community_id = partition[random.choice(list(neighbors))]
        else:
            candidates = [cid for cid in communities.keys() if cid != community_id]
            if not candidates:
                # Nothing to merge into: leave the community as it is.
                continue
            target_community_id = random.choice(candidates)

        communities[target_community_id].extend(nodes)
        del communities[community_id]

        # Keep node -> community lookups current. Without this, a later community whose
        # neighbour had already been moved would look up a deleted id and raise KeyError.
        for node in nodes:
            partition[node] = target_community_id

    return list(communities.values())

In [107]:
def split_graph_into_communities(graph, communities):
    """Map each community's position in ``communities`` to ``graph.subgraph(members)``."""
    return {
        position: graph.subgraph(members)
        for position, members in enumerate(communities)
    }

In [108]:
def plot_initial_graph_with_partitions(graph, communities):
    """Draw ``graph`` in light gray, then overlay each community's nodes in its own colour.

    Communities are coloured with matplotlib's cycle colours 'C0', 'C1', ... in the
    order they appear in ``communities``. The figure is shown immediately.
    """
    layout = nx.spring_layout(graph)  # You can use a different layout if desired
    marker_size = {'node_size': 300}

    # Whole graph first, as a neutral background
    nx.draw(graph, layout, with_labels=True, node_color='lightgray', **marker_size)

    # Then one coloured layer per community
    for position, members in enumerate(communities):
        nx.draw_networkx_nodes(graph, layout, nodelist=members, node_color='C' + str(position), **marker_size)

    plt.show()

In [109]:
def plot_community_subgraphs(community_subgraphs):
    """Draw every community subgraph in its own panel of a single-row figure.

    Args:
        community_subgraphs: Dict mapping an integer community id to its subgraph.

    Nothing is drawn when the dict is empty.
    """
    num_communities = len(community_subgraphs)
    if num_communities == 0:
        # plt.subplots cannot create a grid with zero columns.
        return

    # One colour per community id; ids beyond the palette wrap around instead of
    # raising KeyError.
    palette = ['blue', 'orange', 'green', 'red', 'purple',
               'brown', 'pink', 'gray', 'olive', 'cyan']

    # squeeze=False always returns a 2-D array of Axes, so indexing also works when
    # there is a single community (otherwise a bare Axes object is returned).
    fig, axs = plt.subplots(1, num_communities, figsize=(15, 5), squeeze=False)

    for ax, (community_id, subgraph) in zip(axs[0], community_subgraphs.items()):
        ax.set_title(f'Community {community_id}')
        pos = nx.spring_layout(subgraph)  # You can use a different layout if desired

        color_map = [palette[community_id % len(palette)]] * len(subgraph)

        nx.draw(subgraph, pos, node_color=color_map, with_labels=True, ax=ax)
        #ax.axis('off')

    plt.tight_layout()
    plt.show()

In [110]:
def build_HGNN_louvain(sub_df_norm, masking_data, distance_txt_file, dist_threshold_choice = [20, 25, 30, 35, 40]):
    """Build one graph per Louvain community of every (farm, date) proximity graph.

    For each farm and date with data, a distance threshold is drawn at random from
    ``dist_threshold_choice``, the proximity graph is built, split into communities with
    ``find_communities``, and every community that has at least one edge becomes its
    own PyG graph.

    Args:
        sub_df_norm: Frame with 'FarmID', 'Date', 'Plant_ID', 'cote_c_carotae' and the
            feature columns.
        masking_data: Passed to ``create_masks`` together with the farm ID.
        distance_txt_file: Lines of ``farm,plant1,plant2,distance`` records.
        dist_threshold_choice: Candidate distance thresholds (read only, never mutated).

    Returns:
        ``(graphs, unique_farms_idx, aug_proof_concept)``: all community graphs, the
        first graph built for each farm, and an empty dict kept for interface parity
        with the other builders.
    """
    farm_ids, dates = extract_MetaData(sub_df_norm)

    # list to store the graph for each farm and date
    graphs = []
    unique_farms_idx = {}
    aug_proof_concept = {}

    # columns that should not be features in the graph
    exclude_columns = ['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae']

    # loop through each farm and date
    for (farm_id, date) in itertools.product(farm_ids, dates):

        train_masks, valid_masks, test_masks, plant_tags = create_masks(masking_data, farm_id)
        day_farm_data = sub_df_norm[(sub_df_norm['FarmID'] == farm_id) & (sub_df_norm['Date'] == date)]
        if day_farm_data.size == 0:
            continue

        #get edges
        dist_threshold = random.choice(dist_threshold_choice)
        donor, receiver = get_plant_distances(farm_id, dist_threshold, distance_txt_file, day_farm_data['Plant_ID'].tolist())

        # networkx graph with one node per plant, carrying the feature columns
        node_features = day_farm_data.drop(exclude_columns, axis=1).to_dict('records')
        graph_nx = nx.Graph()
        for node_id, features in zip(day_farm_data['Plant_ID'], node_features):
            graph_nx.add_node(node_id, **features)
        graph_nx.add_edges_from(zip(donor, receiver))

        communities = find_communities(graph_nx)
        community_subgraphs = split_graph_into_communities(graph_nx, communities)

        #plot_initial_graph_with_partitions(graph_nx, communities)
        #plot_community_subgraphs(community_subgraphs)

        for com_id, community_graph in community_subgraphs.items():
            edges_list = list(community_graph.edges)
            if len(edges_list) == 0:
                continue # we skip graphs with no edges at all

            subgraph_df = day_farm_data.loc[day_farm_data['Plant_ID'].isin(list(community_graph.nodes))]

            # Take the node order from the frame itself: features and labels are read
            # from it row by row, so masks and edge indices must follow that order.
            # (The iteration order of `community_graph.nodes` can differ from it.)
            node_id_list = subgraph_df['Plant_ID'].tolist()

            if len(node_id_list) != len(community_graph.nodes):
                print('error: training mask not right size')

            # extract appropriate masks (plant IDs index the mask lists 1-based)
            train_masks_subgraph = [train_masks[i - 1] for i in node_id_list]
            valid_masks_subgraph = [valid_masks[i - 1] for i in node_id_list]
            test_masks_subgraph = [test_masks[i - 1] for i in node_id_list]
            plant_tags_subgraph = [plant_tags[i - 1] for i in node_id_list]

            # resetting node IDs to row positions 0 .. n-1
            row_of = {old_id: row for row, old_id in enumerate(node_id_list)}
            sub_donor = [row_of[old_id_1] for old_id_1, _ in edges_list]
            sub_receiver = [row_of[old_id_2] for _, old_id_2 in edges_list]

            # round the score to the nearest integer
            squamosa_rate_label = np.rint(subgraph_df['cote_c_carotae'].to_numpy())

            # shared graph construction (features, labels, undirected edges, self-loops, masks)
            graph, _ = create_graph(sub_donor, sub_receiver, subgraph_df, squamosa_rate_label,
                                    train_masks_subgraph, valid_masks_subgraph,
                                    test_masks_subgraph, plant_tags_subgraph)

            graphs.append(graph)

            if farm_id not in unique_farms_idx:
                unique_farms_idx[farm_id] = graph

    return (graphs, unique_farms_idx, aug_proof_concept)


# Approche random walk

In [111]:
def random_walk(graph, max_depth, samples):
    """Sample up to ``samples`` node neighbourhoods grown from random start nodes.

    Each sample starts from a randomly chosen, not yet used node and is expanded for at
    most ``max_depth`` rounds; every round adds all unvisited neighbours of the nodes
    collected so far. A sample is kept only if it contains more than 3 nodes; otherwise
    another unused start node is tried until none are left.

    Args:
        graph: Graph exposing ``nodes()`` and ``neighbors(node)``.
        max_depth: Maximum number of expansion rounds.
        samples: Maximum number of node lists to return.

    Returns:
        A list of node lists. Every node list starts with its start node and contains
        each node at most once.
    """
    result = []
    unused_nodes = list(graph.nodes())
    for _ in range(samples):
        attempts = len(unused_nodes)
        while attempts > 0:
            start_node = random.choice(unused_nodes)
            unused_nodes.remove(start_node)
            walk = [start_node]
            # The start node counts as visited, and new nodes are marked as soon as
            # they are found. Otherwise the start node and nodes reachable through
            # several paths are appended repeatedly and inflate the size check below.
            visited = {start_node}
            for _depth in range(max_depth):
                frontier = []
                for node in walk:
                    for neighbor in graph.neighbors(node):
                        if neighbor not in visited:
                            visited.add(neighbor)
                            frontier.append(neighbor)
                if not frontier:
                    break
                walk.extend(frontier)
            if len(walk) > 3:
                result.append(walk)
                break
            attempts -= 1
    return result

In [112]:
def build_HGNN_random_walk(sub_df_norm, masking_data, distance_txt_file, dist_threshold_choice = [20, 25, 30, 35, 40], max_depth = 2, samples = 5):
    """Build graphs from sampled neighbourhoods of every (farm, date) proximity graph.

    For each farm and date with data, a distance threshold is drawn at random from
    ``dist_threshold_choice``, the proximity graph is built, ``random_walk`` samples
    node sets from it, and the subgraph induced by every node set that has at least one
    edge becomes its own PyG graph.

    Args:
        sub_df_norm: Frame with 'FarmID', 'Date', 'Plant_ID', 'cote_c_carotae' and the
            feature columns.
        masking_data: Passed to ``create_masks`` together with the farm ID.
        distance_txt_file: Lines of ``farm,plant1,plant2,distance`` records.
        dist_threshold_choice: Candidate distance thresholds (read only, never mutated).
        max_depth: Forwarded to ``random_walk``.
        samples: Forwarded to ``random_walk``.

    Returns:
        ``(graphs, unique_farms_idx, aug_proof_concept)``: all sampled graphs, the first
        graph built for each farm, and the graphs of the first processed farm/date keyed
        by walk index (visualisation only).
    """
    farm_ids, dates = extract_MetaData(sub_df_norm)

    # list to store the graph for each farm and date
    graphs = []
    unique_farms_idx = {}
    aug_proof_concept = {}
    aug_proof_concept_iter = -1

    # columns that should not be features in the graph
    exclude_columns = ['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae']

    # loop through each farm and date
    for (farm_id, date) in itertools.product(farm_ids, dates):

        train_masks, valid_masks, test_masks, plant_tags = create_masks(masking_data, farm_id)
        day_farm_data = sub_df_norm[(sub_df_norm['FarmID'] == farm_id) & (sub_df_norm['Date'] == date)]
        if day_farm_data.size == 0:
            continue
        aug_proof_concept_iter += 1 # only for visualisation purposes

        #get edges
        dist_threshold = random.choice(dist_threshold_choice)
        donor, receiver = get_plant_distances(farm_id, dist_threshold, distance_txt_file, day_farm_data['Plant_ID'].tolist())

        # networkx graph with one node per plant, carrying the feature columns
        node_features = day_farm_data.drop(exclude_columns, axis=1).to_dict('records')
        graph_nx = nx.Graph()
        for node_id, features in zip(day_farm_data['Plant_ID'], node_features):
            graph_nx.add_node(node_id, **features)
        graph_nx.add_edges_from(zip(donor, receiver))

        walks = random_walk(graph_nx, max_depth, samples)
        if len(walks) == 0:
            continue

        walks_subgraphs = split_graph_into_communities(graph_nx, walks)

        #plot_initial_graph_with_partitions(graph_nx, walks)
        #plot_community_subgraphs(walks_subgraphs)

        for walk_id, walk_graph in walks_subgraphs.items():
            edges_list = list(walk_graph.edges)
            if len(edges_list) == 0:
                continue # we skip graphs with no edges at all

            subgraph_df = day_farm_data.loc[day_farm_data['Plant_ID'].isin(list(walk_graph.nodes))]

            # Take the node order from the frame itself: features and labels are read
            # from it row by row, so masks and edge indices must follow that order.
            # (The iteration order of `walk_graph.nodes` can differ from it.)
            node_id_list = subgraph_df['Plant_ID'].tolist()

            # extract appropriate masks (plant IDs index the mask lists 1-based)
            train_masks_subgraph = [train_masks[i - 1] for i in node_id_list]
            valid_masks_subgraph = [valid_masks[i - 1] for i in node_id_list]
            test_masks_subgraph = [test_masks[i - 1] for i in node_id_list]
            plant_tags_subgraph = [plant_tags[i - 1] for i in node_id_list]

            # resetting node IDs to row positions 0 .. n-1
            row_of = {old_id: row for row, old_id in enumerate(node_id_list)}
            sub_donor = [row_of[old_id_1] for old_id_1, _ in edges_list]
            sub_receiver = [row_of[old_id_2] for _, old_id_2 in edges_list]

            # round the score to the nearest integer
            squamosa_rate_label = np.rint(subgraph_df['cote_c_carotae'].to_numpy())

            # shared graph construction (features, labels, undirected edges, self-loops, masks)
            graph, _ = create_graph(sub_donor, sub_receiver, subgraph_df, squamosa_rate_label,
                                    train_masks_subgraph, valid_masks_subgraph,
                                    test_masks_subgraph, plant_tags_subgraph)

            graphs.append(graph)

            if farm_id not in unique_farms_idx:
                unique_farms_idx[farm_id] = graph

            if aug_proof_concept_iter == 0:
                aug_proof_concept[walk_id] = graph

    return (graphs, unique_farms_idx, aug_proof_concept)

# Approche par manipulation des graphes

In [113]:
def build_HGNN_aug(sub_df_norm, masking_data, distance_txt_file, dist_threshold_choice = (20, 25, 30, 35, 40), aug_multiplier = 3):
    """Build one base graph plus randomly perturbed variants per (farm, date) pair.

    For every pair that has rows in ``sub_df_norm``, ``aug_multiplier`` graphs
    are produced. The first one (index 0) keeps every plant and every edge
    returned by ``get_plant_distances``. The following ones randomly drop
    nodes (0/10/20 %), drop edges (0/10/20 %) and add up to 5 random edges.
    Every graph is made undirected and receives self-loops.

    Args:
        sub_df_norm: DataFrame holding at least the 'FarmID', 'Date',
            'Plant_ID' and 'cote_c_carotae' columns; every other column
            becomes a node feature.
        masking_data: forwarded to ``create_masks`` for each farm.
        distance_txt_file: forwarded to ``get_plant_distances``.
        dist_threshold_choice: candidate distance thresholds; one is drawn
            at random for each generated graph.
        aug_multiplier: number of graphs generated per (farm, date) pair,
            the un-augmented one included.

    Returns:
        Tuple ``(graphs, unique_farms_idx, aug_proof_concept)``:
        the list of all graphs, the first graph built for each farm, and the
        graphs of the first non-empty (farm, date) pair keyed by their
        augmentation index. The two dicts are kept for visualisation.
    """
    farm_ids, dates = extract_MetaData(sub_df_norm)

    graphs = []
    unique_farms_idx = {}
    aug_proof_concept = {}
    aug_proof_concept_iter = -1
    # The transform is stateless with respect to the graphs, build it once.
    transform = T.Compose([T.ToUndirected(), T.AddSelfLoops()])

    # loop through each farm and date
    for (farm_id, date) in itertools.product(farm_ids, dates):

        train_masks, valid_masks, test_masks, plant_tags = create_masks(masking_data, farm_id)
        day_farm_data = sub_df_norm[(sub_df_norm['FarmID'] == farm_id) & (sub_df_norm['Date'] == date)]
        if day_farm_data.size == 0:
            continue
        aug_proof_concept_iter += 1 # only for visualisation purposes

        # Plant_IDs of the most connected plants of the base graph; filled in
        # once the un-augmented graph (aug == 0) has been built.
        conserved_node_ids = []

        for aug in range(aug_multiplier):
            is_augmented = aug > 0

            # we select a distance threshold and extract the relevant plant combinations for the edges
            dist_threshold = random.choice(dist_threshold_choice)
            donor, receiver = get_plant_distances(farm_id, dist_threshold, distance_txt_file, day_farm_data['Plant_ID'].tolist())

            if is_augmented:
                # work on a copy so that each augmentation starts from the original rows
                day_farm_data_subset = day_farm_data.copy(deep=True)

                node_drop_prob = random.choice([0, 0.1, 0.2])
                edge_drop_prob = random.choice([0, 0.1, 0.2])
                edge_add_nbr = random.choice([0, 1, 2, 3, 4, 5])

                # node drop: remove n rows, never touching the conserved (most connected) plants
                nbr_nodes_drop = int(node_drop_prob * len(day_farm_data_subset))
                if nbr_nodes_drop > 0:
                    droppable = day_farm_data_subset[~day_farm_data_subset['Plant_ID'].isin(conserved_node_ids)]
                    removed_rows = droppable.sample(min(nbr_nodes_drop, len(droppable)))
                    removed_plant_ids = set(removed_rows['Plant_ID'].tolist())
                    day_farm_data_subset = day_farm_data_subset.drop(removed_rows.index)

                    # remove the donor-receiver pairs that include the removed nodes
                    # (built as lists so that an empty result does not break unpacking)
                    kept_edges = [(don, rec) for don, rec in zip(donor, receiver)
                                  if don not in removed_plant_ids and rec not in removed_plant_ids]
                    donor = [don for don, _ in kept_edges]
                    receiver = [rec for _, rec in kept_edges]
            else:
                # first iteration is not an augmentation task, we preserve as much data as possible
                day_farm_data_subset = day_farm_data

            # Labels and masks are read from the ORIGINAL Plant_IDs, i.e. before
            # clean_ID_sparse gets a chance to renumber them, exactly as the
            # un-augmented path does. The masks are indexed with Plant_ID - 1.
            original_ids = day_farm_data_subset['Plant_ID'].tolist()
            squamosa_rate_label = np.rint(day_farm_data_subset['cote_c_carotae'].to_numpy()) # nearest integer

            train_masks_subgraph = [train_masks[i - 1] for i in original_ids]
            valid_masks_subgraph = [valid_masks[i - 1] for i in original_ids]
            test_masks_subgraph = [test_masks[i - 1] for i in original_ids]
            plant_tags_subgraph = [plant_tags[i - 1] for i in original_ids]

            # re-ID the plants so that there are no indexing errors down the line
            # NOTE(review): assumes clean_ID_sparse keeps the row order of the frame - confirm.
            if is_augmented or len(original_ids) != max(original_ids):
                day_farm_data_subset, donor, receiver = clean_ID_sparse(day_farm_data_subset, "Plant_ID", donor, receiver)

            if is_augmented:
                # edge drop
                nbr_edges_drop = int(edge_drop_prob * len(donor))
                plant_edges = list(zip(donor, receiver))

                if nbr_edges_drop > 0:
                    random.shuffle(plant_edges)
                    plant_edges = plant_edges[:len(plant_edges) - nbr_edges_drop] # we remove n random edge pairs

                # edge add: give up after 10 consecutive unusable draws
                if edge_add_nbr > 0:
                    patience = 10
                    remaining_edges = edge_add_nbr
                    node_id_list = day_farm_data_subset["Plant_ID"].tolist()
                    while patience > 0 and remaining_edges > 0:
                        new_donor = random.choice(node_id_list)
                        new_receiver = random.choice(node_id_list)
                        temp_edge = (new_donor, new_receiver)
                        if temp_edge in plant_edges or new_donor == new_receiver:
                            patience -= 1
                        else:
                            plant_edges.append(temp_edge)
                            remaining_edges -= 1
                            patience = 10

                donor = [don for don, _ in plant_edges]
                receiver = [rec for _, rec in plant_edges]

            # shared operations between augmented and non-augmented graphs

            from_plant1 = torch.tensor(np.array(donor), dtype = int)
            to_plant2 = torch.tensor(np.array(receiver), dtype = int)

            # 2 x n_edges index tensor; stack also yields a valid 2 x 0 tensor when no edge is left
            plant_edges = torch.stack((from_plant1, to_plant2)).long()

            #remove vars we dont want in the graph (the metadata)
            day_farm_data_clean = day_farm_data_subset.drop(['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae'], axis=1)

            #convert class values to tensor
            squamosa_rate_label = torch.tensor(squamosa_rate_label, dtype = int)

            # create plant node as tensor
            plants_tensor = torch.tensor(np.array(day_farm_data_clean), dtype = float)

            graph = Data(x=plants_tensor, edge_index=plant_edges, y=squamosa_rate_label)
            graph = transform(graph)

            graph.train_mask = torch.Tensor(train_masks_subgraph)
            graph.val_mask = torch.Tensor(valid_masks_subgraph)
            graph.test_mask = torch.Tensor(test_masks_subgraph)
            graph.plant_tags = torch.Tensor(plant_tags_subgraph)

            graphs.append(graph)

            if not is_augmented:
                # Rank the nodes of the base graph by degree (ties keep row order)
                # and protect the top 20 % from node drop in the augmentations.
                node_degrees = degree(graph.edge_index[0], num_nodes=graph.num_nodes, dtype=torch.float).tolist()
                positions_by_degree = sorted(range(len(node_degrees)), key=node_degrees.__getitem__, reverse=True)
                top_positions = positions_by_degree[:round(0.2 * len(positions_by_degree))]
                # degree() works on node positions (row order of x) whereas the node-drop
                # filter compares Plant_ID values, so translate positions to Plant_IDs.
                conserved_node_ids = [original_ids[position] for position in top_positions]

            # used to save a graph of each farm for visualisation purposes only
            if farm_id not in unique_farms_idx:
                unique_farms_idx[farm_id] = graph

            if aug_proof_concept_iter == 0:
                aug_proof_concept[aug] = graph

    return (graphs, unique_farms_idx, aug_proof_concept)

# Méthode sans augmentation

In [114]:
def create_masks(masking_data, farm_id):
    """Build the train/validation/test masks and the plant tags of one farm.

    Args:
        masking_data: sequence of at least three collections of
            ``(farm_id, plant_id)`` pairs, in the order train, validation,
            test.
        farm_id: farm whose plants are selected.

    Returns:
        Tuple ``(train_masks, valid_masks, test_masks, plant_tags)`` of four
        lists. Their length is the number of pairs found for the farm across
        all collections. Position ``k`` describes plant id ``k + 1``: the
        masks hold 1 when that id belongs to the split and 0 otherwise, and
        the tag is ``farm_id * 100 + k + 1``.
    """
    filtered_lists = [[y for x, y in sublist if x == farm_id] for sublist in masking_data]
    total_len = sum(len(inner_list) for inner_list in filtered_lists)

    # Sets turn the per-position membership test into a constant-time lookup.
    train_ids = set(filtered_lists[0])
    valid_ids = set(filtered_lists[1])
    test_ids = set(filtered_lists[2])

    plant_numbers = range(1, total_len + 1)
    train_masks = [1 if number in train_ids else 0 for number in plant_numbers]
    valid_masks = [1 if number in valid_ids else 0 for number in plant_numbers]
    test_masks = [1 if number in test_ids else 0 for number in plant_numbers]
    plant_tags = [farm_id*100 + number for number in plant_numbers]

    return train_masks, valid_masks, test_masks, plant_tags

In [115]:
def build_HGNN(sub_df_norm, masking_data, distance_txt_file, dist_threshold, class_level):
    """Build one un-augmented graph per (farm, date) pair that has data.

    Args:
        sub_df_norm: DataFrame holding at least the 'FarmID', 'Date',
            'Plant_ID' and 'cote_c_carotae' columns; every other column
            becomes a node feature.
        masking_data: forwarded to ``create_masks`` for each farm.
        distance_txt_file: forwarded to ``get_plant_distances``.
        dist_threshold: distance threshold forwarded to
            ``get_plant_distances`` to select the plant-to-plant edges.
        class_level: 'node' labels every plant with its rounded
            'cote_c_carotae'; 'graph' labels the whole graph with the rounded
            maximum of that column.

    Returns:
        Tuple ``(graphs, unique_farms_idx)``: the list of graphs and a dict
        holding the first graph built for each farm.

    Raises:
        ValueError: if ``class_level`` is neither 'graph' nor 'node'.
    """
    # Fail early: an unknown level used to print a message and then crash
    # later on (or silently reuse the label of the previous graph).
    if class_level not in ('graph', 'node'):
        raise ValueError(f'class_level must be "graph" or "node", not {class_level!r}.')

    farm_ids, dates = extract_MetaData(sub_df_norm)

    # list to store the graph for each farm and date
    graphs = []
    unique_farms_idx = {}
    masks_by_farm = {}  # create_masks only depends on the farm, compute it once per farm
    transform = T.Compose([T.ToUndirected(), T.AddSelfLoops()])

    # loop through each farm and date
    for (farm_id, date) in itertools.product(farm_ids, dates):

        day_farm_data = sub_df_norm[(sub_df_norm['FarmID'] == farm_id) & (sub_df_norm['Date'] == date)]
        if day_farm_data.size == 0:
            continue

        if farm_id not in masks_by_farm:
            masks_by_farm[farm_id] = create_masks(masking_data, farm_id)
        train_masks, valid_masks, test_masks, plant_tags = masks_by_farm[farm_id]

        # the masks are indexed with Plant_ID - 1
        plant_ids = day_farm_data['Plant_ID'].tolist()
        train_masks_subgraph = [train_masks[i - 1] for i in plant_ids]
        valid_masks_subgraph = [valid_masks[i - 1] for i in plant_ids]
        test_masks_subgraph = [test_masks[i - 1] for i in plant_ids]
        plant_tags_subgraph = [plant_tags[i - 1] for i in plant_ids]

        # extract class values, rounded to the nearest integer
        squamosa_rate_array = day_farm_data['cote_c_carotae'].to_numpy()
        if class_level == 'graph':
            squamosa_rate_label = np.rint(max(squamosa_rate_array))
        else:
            squamosa_rate_label = np.rint(squamosa_rate_array)

        # create edge: edge plant to plant
        donor, receiver = get_plant_distances(farm_id, dist_threshold, distance_txt_file, day_farm_data['Plant_ID'].tolist())

        # re-ID the plants when the count of plants differs from the largest Plant_ID
        if len(plant_ids) != max(plant_ids):
            day_farm_data, donor, receiver = clean_ID_sparse(day_farm_data, "Plant_ID", donor, receiver)

        from_plant1 = torch.tensor(np.array(donor), dtype = int)
        to_plant2 = torch.tensor(np.array(receiver), dtype = int)

        # 2 x n_edges index tensor; stack also yields a valid 2 x 0 tensor when there is no edge
        plant_edges = torch.stack((from_plant1, to_plant2)).long()

        #remove vars we dont want in the graph (the metadata)
        day_farm_data = day_farm_data.drop(['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae'], axis=1)

        #convert class values to tensor
        squamosa_rate_label = torch.tensor(squamosa_rate_label, dtype = int)

        # create plant node as tensor
        plants_tensor = torch.tensor(np.array(day_farm_data), dtype = float)

        graph = Data(x=plants_tensor, edge_index=plant_edges, y=squamosa_rate_label)
        graph = transform(graph)

        graph.train_mask = torch.Tensor(train_masks_subgraph)
        graph.val_mask = torch.Tensor(valid_masks_subgraph)
        graph.test_mask = torch.Tensor(test_masks_subgraph)
        graph.plant_tags = torch.Tensor(plant_tags_subgraph)
        graph.graph_date = torch.full((len(plant_tags_subgraph),), date)

        graphs.append(graph)

        if (len(train_masks_subgraph)) != len(squamosa_rate_array):
            print('error: training mask not right size')

        if farm_id not in unique_farms_idx:
            unique_farms_idx[farm_id] = graph

    return (graphs, unique_farms_idx)






# Modèle + hypertuner

In [116]:
def get_dataloaders(graphs, train_ratio = 0.75, val_ratio = 0.15, batch_size = 4):
    """Randomly split the graphs into train/validation/test loaders.

    Args:
        graphs: list of graphs to split. The list itself is left untouched;
            a shuffled copy is split instead.
        train_ratio: fraction of the graphs assigned to the training set.
        val_ratio: fraction of the graphs assigned to the validation set.
            Whatever remains after both splits goes to the test set.
        batch_size: number of graphs per batch in each of the three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # Shuffle a copy so that the caller's list keeps its order.
    shuffled_graphs = list(graphs)
    random.shuffle(shuffled_graphs)

    train_end = int(len(shuffled_graphs) * train_ratio)
    val_end = train_end + int(len(shuffled_graphs) * val_ratio)

    train_data = shuffled_graphs[:train_end]
    val_data = shuffled_graphs[train_end:val_end]
    test_data = shuffled_graphs[val_end:]

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    return (train_loader, val_loader, test_loader)


In [117]:
def count_class_samples(loader, nbr_classes):
    """Count how many labels of each class the batches of a loader contain.

    Args:
        loader: iterable of batches exposing their integer labels as ``.y``.
        nbr_classes: number of classes, i.e. the length of the result.

    Returns:
        Long tensor of length ``nbr_classes`` whose entry ``c`` is the number
        of labels equal to ``c`` over all batches.
    """
    totals = torch.zeros(nbr_classes, dtype=torch.long)
    for batch in loader:
        per_batch = torch.bincount(batch.y, minlength=nbr_classes)
        totals.add_(per_batch)  # in-place accumulation
    return totals

In [118]:
def check_class_distribution(distribution, percentage_threshold=0.1):
    """Tell whether the class counts are balanced within a relative margin.

    Args:
        distribution: iterable of per-class sample counts (must not be empty).
        percentage_threshold: tolerated relative excess over the smallest
            class, e.g. 0.1 allows counts up to 110 % of the smallest one.

    Returns:
        True when no class count exceeds
        ``(1 + percentage_threshold) * smallest count``, False otherwise.
    """
    counts = list(distribution)

    # Largest count still accepted, derived from the rarest class.
    ceiling = (1 + percentage_threshold) * min(counts)

    for count in counts:
        if not count <= ceiling:
            return False
    return True

# Logique d'exécution

In [119]:
# Make sure the folder receiving the trained models exists. exist_ok=True
# makes the call a no-op when the folder is already there, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("../../Output/Trained_models", exist_ok=True)

In [120]:
def create_next_dataset_folder(directory_path):
    """Create the next numbered ``created_models_<n>`` folder in a directory.

    The sub-folders of ``directory_path`` named ``created_models_<number>``
    are scanned and a new folder numbered one above the highest number found
    (0 when there is none) is created inside ``directory_path``.

    Args:
        directory_path: directory that holds the ``created_models_*`` folders.

    Returns:
        Path of the newly created folder (``directory_path`` joined with the
        new folder name).

    Raises:
        FileExistsError: if the target folder already exists (raised by
            ``os.makedirs``).
    """
    max_number = -1

    for entry in os.listdir(directory_path):
        if not entry.startswith('created_models_'):
            continue
        if not os.path.isdir(os.path.join(directory_path, entry)):
            continue
        try:
            max_number = max(max_number, int(entry.split('_')[-1]))
        except ValueError:
            # suffix is not a number (e.g. "created_models_old"): ignore the folder
            continue

    # Create the folder where the scan took place, not in the current working
    # directory, so that the numbering and the location always agree.
    new_folder = os.path.join(directory_path, f"created_models_{max_number + 1}")
    os.makedirs(new_folder)
    print(f"save location: {new_folder}")
    return new_folder

In [121]:
# Close the progress bar left over from a previous run of the main loop, if any.
try:
    pbar
except NameError:
    # No progress bar has been created in this kernel yet: nothing to close.
    pass
else:
    pbar.close()

In [122]:
# Display the current working directory: the relative paths used by the
# surrounding cells (e.g. "../../Output/Trained_models") are resolved against it.
os.getcwd()

'd:\\Work\\UQAM\\Doctorat\\Projets\\oignion_GNN\\cultures_GNN\\carotte\\Automne_2023\\Script\\graph_masking'

In [123]:
def load_masking_data(json_file):
    """Read the masking data stored in a JSON file.

    Args:
        json_file: path of the JSON file to read.

    Returns:
        The deserialised content of the file, as produced by ``json.load``.
    """
    # A distinct name for the handle keeps the path argument readable, and an
    # explicit encoding makes the read independent of the platform default.
    with open(json_file, 'r', encoding='utf-8') as handle:
        loaded_masking_data = json.load(handle)

    return loaded_masking_data

In [None]:
############################
### setup run parameters ###
############################
param_dict = dict()
nbr_classes = 2

datasets_to_run = ['base', 'Botcast']
augmentation = ['drop_add_3','drop_add_5', 
                'no_aug', 
                'louvain', 
                'sliding_window_dist_20_40_step_1','sliding_window_dist_20_40_step_2', 
                'random_walk_depth_1_sample_10', 'random_walk_depth_2_sample_10',]

dataset_store_path = create_next_dataset_folder('../graph_masking/')
masking_data = load_masking_data('graph_masking_carrot.json')

#######################
### start main loop ###
#######################

# For every selected dataset (one variable-list file per dataset) and every
# augmentation strategy: build the graphs, retry until the class counts of the
# train/val/test masks are balanced within `perc`, then save them as a loader.
with tqdm(total=len(datasets_to_run)*len(augmentation)) as pbar:

    for filename in (os.listdir(directory)):
        # NOTE(review): `vars` shadows the builtin of the same name; the name is
        # kept because cells outside this one may read it - confirm and rename.
        vars = ['FarmID', 'Plant_ID', 'Date','cote_c_carotae']
        file_path = os.path.join(directory, filename)
        # only regular files describe a dataset
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r') as f:
            for line in f:
                # skip empty lines and the lines starting with '->' (df code)
                if line.strip() and not line.startswith('->'):
                    vars.append(line.strip())
        row_name = os.path.splitext(os.path.basename(file_path))[0]

        if row_name not in datasets_to_run:
            continue

        sub_df = extract_sub_df(combined_df, vars)
        sub_df_norm = normalize(sub_df, exception_list=['FarmID', 'Plant_ID', 'Date', 'cote_c_carotae'])

        for aug in augmentation:
            balanced = False
            skip_aug = False
            attempts = 0  # builds tried at the current threshold
            perc = 0.05

            # NOTE(review): check_class_distribution compares every count with
            # (1 + perc) * smallest count, so a split whose smallest class count
            # is 0 (and another is not) can never pass and this loop would not
            # end. build_HGNN draws no random number itself, so unless its
            # helpers do, the 'no_aug' retries rebuild identical graphs and only
            # the growing `perc` can end the loop.
            while not balanced:
                pbar.set_description(f"Processing {row_name} - {aug} @ percentage {perc*100}%")
                # The arguments of each call match the values encoded in the
                # augmentation name and in the `parameters` label.
                if aug == 'no_aug':
                    graphs, unique_farms_idx = build_HGNN(sub_df_norm, masking_data, distance_txt_file, 25, 'node')
                    parameters = f'dist-{25}'
                elif aug == 'drop_add_3':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_aug(sub_df_norm, masking_data, distance_txt_file, aug_multiplier=3)
                    parameters = f'aug_mult-{3}'
                elif aug == 'drop_add_5':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_aug(sub_df_norm, masking_data, distance_txt_file, aug_multiplier=5)
                    parameters = f'aug_mult-{5}'
                elif aug == 'louvain':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_louvain(sub_df_norm, masking_data, distance_txt_file)
                    parameters = ''
                elif aug == 'sliding_window_dist_20_40_step_1':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_aug_sliding_threshold(sub_df_norm, masking_data, distance_txt_file, minimum_distance=20, maximum_distance=40, step=1)
                    parameters = f'min_dist-{20}_max_dist-{40}_step-{1}'
                elif aug == 'sliding_window_dist_20_40_step_2':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_aug_sliding_threshold(sub_df_norm, masking_data, distance_txt_file, minimum_distance=20, maximum_distance=40, step=2)
                    parameters = f'min_dist-{20}_max_dist-{40}_step-{2}'
                elif aug == 'random_walk_depth_1_sample_10':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_random_walk(sub_df_norm, masking_data, distance_txt_file, max_depth=1, samples=10)
                    parameters = f'max_depth-{1}_samples-{10}'
                elif aug == 'random_walk_depth_2_sample_10':
                    graphs, unique_farms_idx, aug_proof_concept = build_HGNN_random_walk(sub_df_norm, masking_data, distance_txt_file, max_depth=2, samples=10)
                    parameters = f'max_depth-{2}_samples-{10}'
                else:
                    print(f'{aug} parameters not found in the if/elif selection block. skipping.')
                    # really skip: do not evaluate or save graphs left over from another strategy
                    skip_aug = True
                    break

                # per-class node counts of each split, summed over all graphs
                graphs_train_class_counts = [0] * nbr_classes
                graphs_val_class_counts = [0] * nbr_classes
                graphs_test_class_counts = [0] * nbr_classes
                num_nodes = 0
                for idx, graph in enumerate(graphs):

                    # Access node labels
                    node_labels = graph.y
                    num_nodes += len(node_labels)
                    # Convert masks to the appropriate data type
                    train_mask = graph.train_mask.bool()
                    val_mask = graph.val_mask.bool()
                    test_mask = graph.test_mask.bool()

                    if len(train_mask) != len(node_labels):
                        print(aug)
                        print(graph)

                    # minlength pads the missing classes with zeros; the slice
                    # ignores labels outside the first nbr_classes classes
                    for split_totals, split_mask in ((graphs_train_class_counts, train_mask),
                                                     (graphs_val_class_counts, val_mask),
                                                     (graphs_test_class_counts, test_mask)):
                        counts = torch.bincount(node_labels[split_mask], minlength=nbr_classes)[:nbr_classes].tolist()
                        for class_idx, class_count in enumerate(counts):
                            split_totals[class_idx] += class_count

                if (check_class_distribution(graphs_train_class_counts, percentage_threshold= perc) and 
                    check_class_distribution(graphs_val_class_counts, percentage_threshold= perc) and 
                    check_class_distribution(graphs_test_class_counts, percentage_threshold= perc)):
                    balanced = True
                else:
                    # relax the threshold after 50 unsuccessful builds; `perc` is left
                    # untouched on success so that the reported value is the one met
                    attempts += 1
                    if attempts == 50:
                        perc += 0.02
                        attempts = 0

            if skip_aug:
                pbar.update(1)
                continue

            # batch size grows with the number of graphs
            # NOTE(review): the original chain tested `150 <= len(graphs) < 500`
            # twice, so its batch size of 32 could never be selected; the
            # reachable behaviour is kept here - confirm the intended ranges.
            if len(graphs) < 10:
                batch_size = 2
            elif len(graphs) < 50:
                batch_size = 4
            elif len(graphs) < 150:
                batch_size = 8
            elif len(graphs) < 500:
                batch_size = 16
            elif len(graphs) < 5000:
                batch_size = 64
            else:
                batch_size = 128

            random.shuffle(graphs)
            graphs_loader = DataLoader(graphs, batch_size=batch_size, shuffle=True)

            save_folder = f'{dataset_store_path}/{row_name}_{aug}_{parameters}'
            os.makedirs(save_folder, exist_ok=True)
            torch.save(graphs_loader, f'{save_folder}/graphs_dataset.pth')

            print(f'saved {row_name} - {aug} with train: {graphs_train_class_counts}, valid:{graphs_val_class_counts}, test:{graphs_test_class_counts}, perc: {perc}, num_graphs: {len(graphs)}, num nodes: {num_nodes}')
            pbar.update(1)
                


save location: created_models_2


Processing base - no_aug @ percentage 5.0%:   0%|          | 0/1 [00:00<?, ?it/s]

Processing base - no_aug @ percentage 20.999999999999996%: 100%|██████████| 1/1 [00:18<00:00, 18.36s/it]

saved base - no_aug with train: [151, 129], valid:[27, 28], test:[35, 29], perc: 0.20999999999999996, num_graphs: 16, num nodes: 399





In [125]:
# Sanity check of the last loader built by the main loop: print the first
# graph of every batch.
for data in graphs_loader:
    print(data[0])

Data(x=[25, 131], edge_index=[2, 71], y=[25], train_mask=[25], val_mask=[25], test_mask=[25], plant_tags=[25], graph_date=[25])
Data(x=[25, 131], edge_index=[2, 85], y=[25], train_mask=[25], val_mask=[25], test_mask=[25], plant_tags=[25], graph_date=[25])
Data(x=[25, 131], edge_index=[2, 71], y=[25], train_mask=[25], val_mask=[25], test_mask=[25], plant_tags=[25], graph_date=[25])
Data(x=[24, 131], edge_index=[2, 66], y=[24], train_mask=[24], val_mask=[24], test_mask=[24], plant_tags=[24], graph_date=[24])
