## Build the triplets over the whole training dataset, then split the dataset into training and validation sets

In [2]:
import json
import math
import os

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import (
    cosine_similarity,
    euclidean_distances,
    haversine_distances,
    manhattan_distances,
    pairwise_distances,
)
from sklearn.model_selection import GroupKFold, KFold, train_test_split

from datasets3 import TripletDataset3
from test_set import normalization_and_ohe

In [1]:
# Output directory: the fire/no-fire CSVs and the similarity JSON files are written here.
my_path = 'siamese_datasets/n_dataset_7'

In [4]:
# Columns removed right after loading the CSV: the numbered `corine_<n>`
# columns (listed by their numeric suffix) plus 'band' and 'firedate'.
columns_to_delete = [
    f'corine_{class_id}'
    for class_id in (
        13, 4, 30, 38, 36, 10, 20, 18, 16,
        43, 27, 5, 31, 17, 12, 26, 37, 24,
        28, 29, 19, 21, 6, 23, 22, 1, 8,
        33, 2, 40, 32, 41, 3, 25, 35, 14,
        42, 11, 44, 15, 7, 9,
    )
] + ['band', 'firedate']


In [5]:
# Load the training table and discard the columns listed in
# `columns_to_delete` in one chained expression.
df = pd.read_csv(
    '/mnt/nvme2tb/ffp/datasets/siamese/training/train_new_95_5.csv'
).drop(columns=columns_to_delete)

In [5]:
# Display the columns that remain after the drop.
df.columns

Index(['y', 'x', 'dom_dir', 'dom_vel', 'res_max', 'dir_max', 'max_dew_temp',
       'min_dew_temp', 'mean_dew_temp', 'max_temp', 'min_temp', 'mean_temp',
       'rain_7_days', 'ndvi', 'evi', 'lst_day', 'lst_night', 'fire', 'weekday',
       'month', 'slope', 'road_dens', 'dem', 'pop', 'aspect', 'f81',
       'frequency', 'corine_gr1', 'corine_gr4', 'corine_gr5', 'corine_gr21',
       'corine_gr22', 'corine_gr23', 'corine_gr24', 'corine_gr31',
       'corine_gr32', 'corine_gr33'],
      dtype='object')

In [6]:
# Run the project helper with normalization on and one-hot encoding off.
# Comparing the recorded `df.columns` outputs before and after this call shows
# that 'dom_dir', 'dir_max', 'weekday' and 'month' are no longer present.
df = normalization_and_ohe(df,normalization = True, ohe = False)

In [8]:
# Display the columns again to see what the normalization step kept.
df.columns

Index(['y', 'x', 'dom_vel', 'res_max', 'max_dew_temp', 'min_dew_temp',
       'mean_dew_temp', 'max_temp', 'min_temp', 'mean_temp', 'rain_7_days',
       'ndvi', 'evi', 'lst_day', 'lst_night', 'fire', 'slope', 'road_dens',
       'dem', 'pop', 'aspect', 'f81', 'frequency', 'corine_gr1', 'corine_gr4',
       'corine_gr5', 'corine_gr21', 'corine_gr22', 'corine_gr23',
       'corine_gr24', 'corine_gr31', 'corine_gr32', 'corine_gr33'],
      dtype='object')

In [9]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [10]:
# Split the table by the binary `fire` label; each part is re-indexed from 0.
fires = df.loc[df['fire'] == 1].reset_index(drop=True)
no_fires = df.loc[df['fire'] == 0].reset_index(drop=True)

In [11]:
# Persist both parts under `my_path` (without the index column) so later cells
# can reload them with `pd.read_csv`.
fires.to_csv(os.path.join(my_path, 'fires.csv'), index = False)
no_fires.to_csv(os.path.join(my_path, 'nofires.csv'), index = False)

### Similarities 

In [6]:
def similarities_f_no_f(data_fire,data_no_fire,json_file_path,bins_dict_metrics,metrics_list,norm,name = 'f_n_f'):
    """Bin, for every anchor row, the rows of the other table by similarity and save the bins as JSON.

    For each metric in ``metrics_list`` one file is written under
    ``json_file_path``: ``<name>_norm_<metric>.json`` when ``norm`` is truthy,
    ``<name>_unorm_<metric>.json`` otherwise. The file maps the positional
    index of each anchor row to ``{bin_name: [positional indices of compared rows]}``.

    Args:
        data_fire: DataFrame whose rows are used as anchors.
        data_no_fire: DataFrame whose rows are compared against each anchor.
        json_file_path: Directory that receives the JSON files.
        bins_dict_metrics: Maps metric name to a sequence of ``(lower, upper)``
            intervals; a value belongs to a bin when ``lower <= value < upper``.
        metrics_list: Metric names to process; each must be one of
            'cosine', 'euclidean', 'manhattan', 'haversine'.
        norm: Only selects the ``_norm_`` / ``_unorm_`` part of the file name.
        name: 'f_n_f' keeps at most 5000 indices per bin, 'n_f_f' at most 100.

    Raises:
        ValueError: If a metric in ``metrics_list`` or ``name`` is not supported.

    Note:
        Bin names are read from the module-level ``bins_names_dict``, which
        must be defined before this function is called.
    """
    supported_metrics = ('cosine', 'euclidean', 'manhattan', 'haversine')
    # Maximum number of indices stored per bin for each comparison direction.
    max_per_bin = {'f_n_f': 5000, 'n_f_f': 100}

    # Fail before any computation instead of printing a warning and then
    # crashing on (or silently reusing) an undefined `similarities` array.
    unsupported = [m for m in metrics_list if m not in supported_metrics]
    if unsupported:
        raise ValueError(
            f"{unsupported}: Not such metric. Metrics should be one of {', '.join(supported_metrics)}"
        )
    if name not in max_per_bin:
        # Any other name used to produce a file in which every bin was empty.
        raise ValueError(f"name must be one of {sorted(max_per_bin)}, got {name!r}")

    fire_array = data_fire.to_numpy()
    no_fire_array = data_no_fire.to_numpy()
    limit = max_per_bin[name]
    norm_tag = '_norm_' if norm else '_unorm_'

    for metric in metrics_list:
        print(metric)
        bins = bins_dict_metrics[metric]
        bins_names = bins_names_dict[metric]

        # Pick the pairwise function once per metric rather than once per row.
        if metric == 'cosine':
            pairwise = cosine_similarity
        elif metric == 'euclidean':
            pairwise = euclidean_distances
        elif metric == 'manhattan':
            pairwise = manhattan_distances
        else:
            # NOTE(review): scikit-learn documents haversine_distances for
            # two-column [latitude, longitude] input in radians -- confirm the
            # frames are reduced accordingly before using this metric.
            pairwise = haversine_distances

        json_file_name = os.path.join(json_file_path, name + norm_tag + metric + '.json')
        print(json_file_name)

        similar_indices = {}
        for fire_index, fire in enumerate(fire_array):
            # One anchor against the whole other table keeps memory bounded.
            similarities = pairwise(fire.reshape(1, -1), no_fire_array)[0]
            per_bin = {bin_name: [] for bin_name in bins_names}
            for bin_name, (lower, upper) in zip(bins_names, bins):
                bin_indices = np.where((similarities >= lower) & (similarities < upper))[0]
                # The first `limit` matches in row order are kept, not the closest ones.
                per_bin[bin_name] = bin_indices[:limit].tolist()
            similar_indices[fire_index] = per_bin

        # Save the similar_indices dictionary to a JSON file
        # (json.dump turns the integer anchor indices into string keys).
        with open(json_file_name, 'w') as json_file:
            json.dump(similar_indices, json_file)

In [6]:
def similarities_f_f(data_fire,json_file_path,bins_dict_metrics,metrics_list,norm):
    """Bin, for every fire row, all fire rows by similarity and save the bins as JSON.

    For each metric in ``metrics_list`` one file is written under
    ``json_file_path``: ``f_f_norm_<metric>.json`` when ``norm`` is truthy,
    ``f_f_unorm_<metric>.json`` otherwise. The file maps the positional index
    of each anchor row to ``{bin_name: [positional indices of fire rows]}``,
    with at most 2000 indices per bin.

    Args:
        data_fire: DataFrame whose rows are both the anchors and the candidates.
        json_file_path: Directory that receives the JSON files.
        bins_dict_metrics: Maps metric name to a sequence of ``(lower, upper)``
            intervals; a value belongs to a bin when ``lower <= value < upper``.
        metrics_list: Metric names to process; each must be one of
            'cosine', 'euclidean', 'manhattan', 'haversine'.
        norm: Only selects the ``_norm_`` / ``_unorm_`` part of the file name.

    Raises:
        ValueError: If a metric in ``metrics_list`` is not supported.

    Note:
        Bin names are read from the module-level ``bins_names_dict``, which
        must be defined before this function is called.
    """
    supported_metrics = ('cosine', 'euclidean', 'manhattan', 'haversine')
    max_per_bin = 2000

    # Fail before any computation instead of printing a warning and then
    # crashing on (or silently reusing) an undefined `similarities` array.
    unsupported = [m for m in metrics_list if m not in supported_metrics]
    if unsupported:
        raise ValueError(
            f"{unsupported}: Not such metric. Metrics should be one of {', '.join(supported_metrics)}"
        )

    fire_array = data_fire.to_numpy()
    # Both branches used to produce 'f_f_norm_', so an un-normalised run
    # overwrote the normalised file; '_unorm_' matches similarities_f_no_f.
    norm_tag = 'f_f_norm_' if norm else 'f_f_unorm_'

    for metric in metrics_list:
        bins = bins_dict_metrics[metric]
        bins_names = bins_names_dict[metric]

        # Pick the pairwise function once per metric rather than once per row.
        if metric == 'cosine':
            pairwise = cosine_similarity
        elif metric == 'euclidean':
            pairwise = euclidean_distances
        elif metric == 'manhattan':
            pairwise = manhattan_distances
        else:
            # NOTE(review): scikit-learn documents haversine_distances for
            # two-column [latitude, longitude] input in radians -- confirm the
            # frame is reduced accordingly before using this metric.
            pairwise = haversine_distances

        json_file_name = os.path.join(json_file_path, norm_tag + metric + '.json')
        print(json_file_name)

        similar_indices_fire = {}
        for fire_index, fire in enumerate(fire_array):
            # NOTE(review): the anchor is compared against the full array, so
            # its own index is not filtered out of the bins -- confirm the
            # consumer of this file excludes self-matches.
            similarities = pairwise(fire.reshape(1, -1), fire_array)[0]
            per_bin = {bin_name: [] for bin_name in bins_names}
            for bin_name, (lower, upper) in zip(bins_names, bins):
                bin_indices = np.where((similarities >= lower) & (similarities < upper))[0]
                # The first `max_per_bin` matches in row order are kept.
                per_bin[bin_name] = bin_indices[:max_per_bin].tolist()
            similar_indices_fire[fire_index] = per_bin

        # Save the similar_indices dictionary to a JSON file
        # (json.dump turns the integer anchor indices into string keys).
        with open(json_file_name, 'w') as json_file:
            json.dump(similar_indices_fire, json_file)

In [7]:
# Bin labels in the same order as the (ascending) bin intervals of each metric.
# Cosine bins are labelled bin1..bin5; the two distance metrics are labelled
# bin5..bin1, so the lowest interval of a distance metric is called 'bin5'.
bins_names_dict = {
    'cosine': [f'bin{k}' for k in range(1, 6)],
    'euclidean': [f'bin{k}' for k in range(5, 0, -1)],
    'manhattan': [f'bin{k}' for k in range(5, 0, -1)],
}

In [8]:
# Each metric gets five equal-width (lower, upper) intervals built from six
# evenly spaced edges. The similarity functions test `lower <= value < upper`,
# so a value equal to (or beyond) the last edge lands in no bin.

# Euclidean distance, normalised features.
edges = np.linspace(0, 4, 6)
euclidean_bins_norm = list(zip(edges[:-1], edges[1:]))

# Euclidean distance, un-normalised features.
edges = np.linspace(1, 2850000, 6)
euclidean_bins_unnorm = list(zip(edges[:-1], edges[1:]))

# Manhattan distance, normalised features.
edges = np.linspace(0, 9, 6)
manhattan_bins_norm = list(zip(edges[:-1], edges[1:]))

# Manhattan distance, un-normalised features.
edges = np.linspace(4, 2860000, 6)
manhattan_bins_unnorm = list(zip(edges[:-1], edges[1:]))

# Cosine similarity, normalised features.
edges = np.linspace(0, 0.4, 6)
cosine_bins_norm = list(zip(edges[:-1], edges[1:]))

# Cosine similarity, un-normalised features (note the `unorm` spelling of this name).
edges = np.linspace(0, 1, 6)
cosine_bins_unorm = list(zip(edges[:-1], edges[1:]))

In [9]:
# Active binning configuration: the *_norm intervals for all three metrics.
bins_dict_metrics = dict(
    cosine=cosine_bins_norm,
    euclidean=euclidean_bins_norm,
    manhattan=manhattan_bins_norm,
)

In [11]:
# Reload the two tables saved earlier under `my_path`, so this part of the
# notebook can be run without repeating the preprocessing cells.
fires = pd.read_csv(os.path.join(my_path, 'fires.csv'))
no_fires = pd.read_csv(os.path.join(my_path, 'nofires.csv'))

In [12]:
# Remove the label column so it does not take part in the distance
# computations performed by the similarity functions.
fires = fires.drop('fire', axis=1)
no_fires = no_fires.drop('fire', axis=1)

In [20]:
# Fire rows are the anchors, compared against all no-fire rows with Manhattan
# distance; name='f_n_f' keeps up to 5000 indices per bin.
similarities_f_no_f(fires, no_fires, my_path, bins_dict_metrics,metrics_list=['manhattan'],norm=True, name = 'f_n_f')

manhattan
/home/sgirtsou/Documents/siamese_datasets/n_dataset_7/f_n_f_norm_manhattan.json


In [13]:
# Same function with the tables swapped: no-fire rows are the anchors and fire
# rows are the candidates; name='n_f_f' keeps up to 100 indices per bin.
similarities_f_no_f(no_fires, fires, my_path, bins_dict_metrics,metrics_list=['manhattan'],norm=True, name = 'n_f_f')

manhattan
/home/sgirtsou/Documents/siamese_datasets/n_dataset_7/n_f_f_norm_manhattan.json


In [31]:
# Fire rows compared against all fire rows with Manhattan distance.
similarities_f_f(fires,my_path,bins_dict_metrics,metrics_list=['manhattan'],norm=True)

/home/sgirtsou/Documents/siamese_datasets/n_dataset_7/f_f_norm_manhattan.json


## To build the complete dataset without a train/validation split, start from here

In [3]:
def datasets(fires,nofires,par_path,metric,norm_sim_mode,pn_ratio,np_ratio,n_triplets_per_fire_anchor,n_triplets_per_no_fire_anchor,ratio_fnf,pp_ratio=(0,0,1)):
    """Build a ``TripletDataset3`` from the fire/no-fire tables and the saved similarity bins.

    Args:
        fires: DataFrame with a 'fire' column; only rows with ``fire == 1`` are used.
        nofires: DataFrame with a 'fire' column; only rows with ``fire == 0`` are used.
        par_path: Directory containing ``f_n_f_<mode>_<metric>.json``,
            ``f_f_<mode>_<metric>.json`` and ``n_f_f_<mode>_<metric>.json``.
        metric: 'cosine', 'euclidean' or 'manhattan'.
        norm_sim_mode: The ``<mode>`` part of the JSON file names (e.g. 'norm').
        pn_ratio, np_ratio: Forwarded unchanged to ``TripletDataset3``.
        n_triplets_per_fire_anchor, n_triplets_per_no_fire_anchor, ratio_fnf:
            Forwarded unchanged to ``TripletDataset3``.
        pp_ratio: Forwarded unchanged to ``TripletDataset3``; defaults to the
            previously hard-coded ``(0, 0, 1)``.

    Returns:
        The constructed ``TripletDataset3`` instance.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Validate first: an unknown metric would otherwise leave the three bin
    # dictionaries undefined and only fail after the JSON files were loaded.
    if metric == 'cosine':
        pos_neg_bins_dict = {'very_hard_negatives':'bin5', 'hard_negatives':'bin4','semi_hard_negatives':'bin3','easy_negatives':'bin2','very_easy_negatives':'bin1'}
        pos_pos_bins_dict = {'very_easy_positives':'bin5', 'easy_positives':'bin4','semi_hard_positives':'bin3','hard_positives':'bin2','very_hard_positives':'bin1'}
        neg_pos_bins_dict = {'very_hard_positives':'bin5', 'hard_positives':'bin4','semi_hard_positives':'bin3','easy_positives':'bin2','very_easy_positives':'bin1'}
    elif metric in ('euclidean', 'manhattan'):
        # NOTE(review): `bins_names_dict` in this file already names the
        # distance bins in reverse ('bin1' = largest distances). With that
        # naming this mapping labels the farthest rows 'very_hard_negatives',
        # the opposite of the cosine branch -- confirm this is intended.
        pos_neg_bins_dict = {'very_hard_negatives':'bin1', 'hard_negatives':'bin2','semi_hard_negatives':'bin3','easy_negatives':'bin4','very_easy_negatives':'bin5'}
        pos_pos_bins_dict = {'very_easy_positives':'bin1', 'easy_positives':'bin2','semi_hard_positives':'bin3','hard_positives':'bin4','very_hard_positives':'bin5'}
        neg_pos_bins_dict = {'very_hard_positives':'bin1', 'hard_positives':'bin2','semi_hard_positives':'bin3','easy_positives':'bin4','very_easy_positives':'bin5'}
    else:
        raise ValueError(f"Unsupported metric {metric!r}; expected 'cosine', 'euclidean' or 'manhattan'")

    # Separate the feature columns from the 'fire' label column.
    data_fire = fires.loc[fires['fire'] == 1, ~fires.columns.isin(['fire'])].to_numpy()
    labels_data_fire = fires.loc[fires['fire'] == 1, fires.columns.isin(['fire'])].to_numpy()

    data_no_fire = nofires.loc[nofires['fire'] == 0, ~nofires.columns.isin(['fire'])].to_numpy()
    labels_data_no_fire = nofires.loc[nofires['fire'] == 0, nofires.columns.isin(['fire'])].to_numpy()

    # Load the three similarity-bin files: fire->no-fire, fire->fire, no-fire->fire.
    similarity_tables = []
    for prefix in ('f_n_f', 'f_f', 'n_f_f'):
        similarity_path = os.path.join(par_path, f'{prefix}_{norm_sim_mode}_{metric}.json')
        with open(similarity_path, 'r') as f:
            similarity_tables.append(json.load(f))
    fn_dist_matrix, ff_dist_matrix, nf_dist_matrix = similarity_tables

    train_batch_sampler = TripletDataset3(data_fire, data_no_fire,labels_data_fire,labels_data_no_fire, fn_dist_matrix
                                          ,ff_dist_matrix, nf_dist_matrix, pos_neg_bins_dict,pos_pos_bins_dict, neg_pos_bins_dict, pn_ratio= pn_ratio,pp_ratio=pp_ratio
                                          ,np_ratio = np_ratio, n_triplets_per_fire_anchor=n_triplets_per_fire_anchor,n_triplets_per_no_fire_anchor=n_triplets_per_no_fire_anchor
                                          ,ratio_fnf=ratio_fnf)
    print('data_batch_sampler run ok')

    return train_batch_sampler

In [4]:
# Settings for a single triplet-dataset build. Everything except `n_epochs`
# and `log_interval` is passed to `datasets(...)` in the next cell.
par_path = 'siamese_datasets/n_dataset_7/'
n_triplets_per_fire_anchor = 10
n_triplets_per_no_fire_anchor = 1
ratio_fnf = 0.5
metric = 'manhattan'
norm_sim_mode = 'norm'
n_epochs = 20  # not used in the visible part of this file
log_interval = 150  # not used in the visible part of this file
pn_ratio = (0.2,0.2,0.6)
np_ratio = (0.2,0.2,0.6)
# These tables still contain the 'fire' column, which `datasets` reads.
fires = pd.read_csv(os.path.join(par_path,'fires.csv'))
no_fires = pd.read_csv(os.path.join(par_path,'nofires.csv'))

In [10]:
# Build one triplet dataset with the settings defined in the previous cell.
data_batch_sampler = datasets(fires,no_fires,par_path,metric,norm_sim_mode,pn_ratio,np_ratio,n_triplets_per_fire_anchor,n_triplets_per_no_fire_anchor,ratio_fnf)

139570 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok


## Datasets with many combinations - manhattan distance

In [7]:
# Every (a, b, c) in steps of 0.1 whose tenths add up to 10, i.e. whose sum is
# 1.0. For a given first and second value the third is fully determined, so
# only two loop variables are needed; the order is ascending in both.
ratio_combinations = [
    (first / 10, second / 10, (10 - first - second) / 10)
    for first in range(11)
    for second in range(11 - first)
]

In [8]:
# Overrides the full grid built in the previous cell with three hand-picked
# ratio triples; only these are used by the sweep below.
ratio_combinations = [(0.1, 0.1, 0.8),(0.2, 0.1, 0.7),(0.7, 0.1, 0.2)]

In [9]:
# Value lists iterated by the sweep cell below.
n_triplets_fires = [5, 10, 20]
# NOTE: the sweep cell reassigns `n_triplets_no_fires` for every fire-anchor
# count, so this initial list is not the one actually iterated.
n_triplets_no_fires = [1, 2, 3]
ratios = [0.5, 0.7, 0.9]
metrics_list=['manhattan']

In [19]:
# Build and save one triplet dataset for every combination of ratios, triplet
# counts and metrics. Each dataset goes to its own sub-directory of `par_path`;
# combinations whose output file already exists are skipped, so the cell can
# be re-run after an interruption.
par_path = 'siamese_datasets/n_dataset_7/'
norm_sim_mode = 'norm'
fires = pd.read_csv(os.path.join(par_path,'fires.csv'))
no_fires = pd.read_csv(os.path.join(par_path,'nofires.csv'))
for pn_ratio in ratio_combinations:
    # Ratios are encoded in tenths; round() guards against products such as
    # 0.7 * 10 == 7.000000000000001.
    pn_tag = '_'.join(str(round(r * 10)) for r in pn_ratio)
    for np_ratio in ratio_combinations:
        np_tag = '_'.join(str(round(r * 10)) for r in np_ratio)
        for n_triplets_per_fire_anchor in n_triplets_fires:
            # No-fire triplet counts derived from the fire count: 1, half, and all.
            # Integer arithmetic is used on purpose: n / n would give the float
            # 1.0, which leaks into the directory name as `nnf1.0`.
            n_triplets_no_fires = [1, n_triplets_per_fire_anchor // 2, n_triplets_per_fire_anchor]
            for ratio_fnf in ratios:
                for n_triplets_per_no_fire_anchor in n_triplets_no_fires:
                    for metric in metrics_list:
                        directory_path = (
                            f'm_{metric}_pn{pn_tag}_np{np_tag}_{norm_sim_mode}'
                            f'_nf{n_triplets_per_fire_anchor}_nnf{n_triplets_per_no_fire_anchor}_r{ratio_fnf}'
                        )
                        output_dir = os.path.join(par_path, directory_path)
                        data_path = os.path.join(output_dir, 'train_batch_sampler.pth')
                        # Test the saved file under `par_path`, not the bare
                        # directory name relative to the working directory:
                        # the latter never matched, so a re-run crashed in
                        # os.makedirs on the already existing directory.
                        if os.path.exists(data_path):
                            continue
                        os.makedirs(output_dir, exist_ok=True)
                        data_batch_sampler = datasets(fires,no_fires,par_path,metric,norm_sim_mode,pn_ratio,np_ratio,n_triplets_per_fire_anchor,n_triplets_per_no_fire_anchor
                                                      ,ratio_fnf)
                        torch.save(data_batch_sampler, data_path)

69785 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
7520 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
7520 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
7520 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
12433 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fir

69785 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
69785 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
153527 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
153527 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
153527 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
153527 triplets where made f

139570 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where mad

139570 triplets where made for fire anchors
Gathering no fire instances.....
12433 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
12433 where made for no fire anchors
data_batch_sampler run ok
139570 triplets where made for fire anchors
Gathering no fire instances.....
12433 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
3836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
7520 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where ma

279140 triplets where made for fire anchors
Gathering no fire instances.....
3487 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
6836 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
279140 triplets where made for fire anchors
Gathering no fire instances.....
11303 where made for no fire anchors
data_batch_sampler run ok
