In [1]:
import pandas as pd 
import geopandas as gpd
import os
import numpy as np
import glob
from pathlib import Path    
from tqdm import tqdm
import shutil
import json

from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from shapely.geometry import Polygon, Point
from math import cos, radians

In [None]:
# Base directory and dataset sub-folder name; the cells below join the two to
# locate the data folders and the per-split hotspot CSV files.
# NOTE: root_dir is reassigned in the "Mapping family/order" cell further down,
# so re-run this cell before re-running any of the cells that follow it.
root_dir = "/home/hagerradi/projects/Ecosystem_embeddings/ebutterfly/Darwin/0177350-230224095556074"
dataset_tag = "ebutterfly_data_v4"

In [None]:
# Folder whose .npy files are copied and backed up by the next cell.
env_data_path = os.path.join(root_dir, dataset_tag, "environmental_data")

In [None]:
# Duplicate every .npy file in the environmental-data folder under a
# "B"-prefixed name, then move the original "L"-prefixed files to the backup.
# NOTE: not idempotent - a second run also copies the "B" files (-> "BB...").
for file_name in glob.glob(env_data_path + '/*.npy'):
    shutil.copy(file_name, env_data_path + '/B' + Path(file_name).name)

# shutil.move only moves a file *into* the destination when it is an existing
# directory; otherwise the file is renamed to that path and every later move
# overwrites the previous one. Create the backup folder up front.
env_backup_path = os.path.join(root_dir, dataset_tag, "backup", "environmental_data")
os.makedirs(env_backup_path, exist_ok=True)
for file_name in glob.glob(env_data_path + '/L*.npy'):
    shutil.move(file_name, env_backup_path)

In [None]:
# Duplicate every .tif in the images folder under a "B"-prefixed name, then
# move the original "L"-prefixed files to the backup folder.
# NOTE: not idempotent - a second run also copies the "B" files (-> "BB...").
img_data_path = os.path.join(root_dir, dataset_tag, "images")
for file_name in glob.glob(img_data_path + '/*.tif'):
    shutil.copy(file_name, img_data_path + '/B' + Path(file_name).name)

# The destination must be an existing directory, otherwise shutil.move renames
# each file to that path (overwriting the previous one) instead of moving it in.
img_backup_path = os.path.join(root_dir, dataset_tag, "backup", "images")
os.makedirs(img_backup_path, exist_ok=True)
for file_name in glob.glob(img_data_path + '/L*.tif'):
    shutil.move(file_name, img_backup_path)

In [None]:
# Duplicate every target .json under a "B"-prefixed name, then move the
# original "L"-prefixed files to the backup folder.
# NOTE: not idempotent - a second run also copies the "B" files (-> "BB...").
targets_path = os.path.join(root_dir, dataset_tag, "butterfly_targets")
for file_name in glob.glob(targets_path + '/*.json'):
    shutil.copy(file_name, targets_path + '/B' + Path(file_name).name)

# The destination must be an existing directory, otherwise shutil.move renames
# each file to that path (overwriting the previous one) instead of moving it in.
targets_backup_path = os.path.join(root_dir, dataset_tag, "backup", "butterfly_targets")
os.makedirs(targets_backup_path, exist_ok=True)
for file_name in glob.glob(targets_path + '/L*.json'):
    shutil.move(file_name, targets_backup_path)

In [None]:
# Duplicate every .tif in the images_visual folder under a "B"-prefixed name,
# then move the original "L"-prefixed files to the backup folder.
# NOTE: not idempotent - a second run also copies the "B" files (-> "BB...").
img_path = os.path.join(root_dir, dataset_tag, "images_visual")
for file_name in glob.glob(img_path + '/*.tif'):
    shutil.copy(file_name, img_path + '/B' + Path(file_name).name)

# The destination must be an existing directory, otherwise shutil.move renames
# each file to that path (overwriting the previous one) instead of moving it in.
img_visual_backup_path = os.path.join(root_dir, dataset_tag, "backup", "images_visual")
os.makedirs(img_visual_backup_path, exist_ok=True)
for file_name in glob.glob(img_path + '/L*.tif'):
    shutil.move(file_name, img_visual_backup_path)

In [None]:
# Prefix every hotspot id in the train/test/valid split files with "B" so the
# ids match the "B"-prefixed data files, and rewrite each CSV in place.
# NOTE: not idempotent - re-running this cell prefixes the ids a second time.
for group_name in ["train", "test", "valid"]:
    csv_path = os.path.join(root_dir, dataset_tag, "butterfly_hotspots_" + str(group_name) + ".csv")
    group_data = pd.read_csv(csv_path)
    print(group_data)

    # Vectorised string concatenation instead of a Python loop over the ids.
    group_data['hotspot_id'] = "B" + group_data['hotspot_id']

    # index=False: do not write the RangeIndex, otherwise every rewrite adds
    # another "Unnamed: N" column that shows up on the next read_csv.
    group_data.to_csv(csv_path, index=False)


In [None]:
# Read each split's hotspot CSV back from disk and print it for inspection.
for group_name in ("train", "test", "valid"):
    csv_name = f"butterfly_hotspots_{group_name}.csv"
    group_data = pd.read_csv(os.path.join(root_dir, dataset_tag, csv_name))
    print(group_data)

In [None]:
import pandas as pd 
import os
import numpy as np
import glob
from pathlib import Path    
import shutil

# Rename (in place) every data file in the three folders below, which are
# resolved relative to the current working directory, by prepending "B" to
# its file name.
env_data_path = "environmental_data"
img_data_path = "images"
targets_data_path = "butterfly_targets"

for folder, pattern in (
    (env_data_path, '/*.npy'),
    (img_data_path, '/*.tif'),
    (targets_data_path, '/*.json'),
):
    for file_name in glob.glob(folder + pattern):
        shutil.move(file_name, folder + '/B' + Path(file_name).name)


In [None]:
import glob
import json
import matplotlib.pyplot as plt
from tqdm import tqdm

# targets_path = os.path.join(root_dir, dataset_tag, "butterfly_targets")
# Collect every non-zero entry of the 'probs' list from all target files and
# plot their distribution as a 5-bin histogram.
targets_path = "corrected_targets"
all_targets = []
for file_name in tqdm(glob.glob(targets_path + '/*.json')):
    # Context manager so the file handle is closed after each read.
    with open(file_name) as f:
        targets = json.load(f)['probs']
    all_targets.extend(value for value in targets if value != 0)

print(len(all_targets))

# plt.hist returns (counts, bin_edges, patches); print them alongside the plot.
ret = plt.hist(all_targets, bins=5)
print(ret)

#### Saving eBird targets restricted to the top 28% most frequent species

In [None]:
import glob
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import os

# N = int(0.25 * len(data))

# # Get the indices of the top N values
# top_indices = np.argsort(data)[-N:]

# # Sort the top_indices in ascending order
# sorted_top_indices = np.sort(top_indices)

# targets_path = os.path.join(root_dir, dataset_tag, "butterfly_targets")
# For every target file, keep only the 'probs' entries at the positions listed
# in the saved index array and write the reduced target to dst_path under the
# same file name. All other keys of the target dict are copied unchanged.
targets_path = "corrected_targets"
dst_path = "corrected_targets_v2"
indices_to_save = np.load('stats/top_28percent_species_indices.npy')

# open(..., 'w') does not create missing parent folders.
os.makedirs(dst_path, exist_ok=True)

for file_name in tqdm(glob.glob(targets_path + '/*.json')):
    with open(file_name) as f:
        targets = json.load(f)
    targets['probs'] = [targets['probs'][i] for i in indices_to_save]
    with open(os.path.join(dst_path, os.path.basename(file_name)), 'w') as f:
        json.dump(targets, f)

## Mapping species to family/order

In [None]:
# Look up the family and order of every species in the species list, using the
# checklist CSV, and keep the results in the same order as the list.
# NOTE: this reassigns root_dir, which earlier cells use for the dataset root.
root_dir = "/home/hagerradi/projects/Ecosystem_embeddings/ebutterfly/ebird_species"
species_df = pd.read_csv(os.path.join(root_dir, "NEW_Clements-Checklist-v2022-October-2022.csv"))

# Drop blank lines instead of blindly discarding the last element, so the last
# species is kept when the file has no trailing newline.
with open(os.path.join(root_dir, "species_list_USA_birds.txt")) as f:
    species_list = [line for line in f.read().splitlines() if line.strip()]
print(len(species_list))
print(species_df.columns)

# Index the checklist by scientific name once (first occurrence wins, matching
# the previous "first matching row" behaviour) instead of scanning the whole
# frame for every species.
taxonomy = species_df.drop_duplicates(subset='scientific name').set_index('scientific name')

missing_species = [sp for sp in species_list if sp not in taxonomy.index]
if missing_species:
    raise KeyError(
        f"{len(missing_species)} species not found in the checklist, e.g. {missing_species[:5]}"
    )

species_family = taxonomy.loc[species_list, 'family'].tolist()
species_order = taxonomy.loc[species_list, 'order'].tolist()


# f = open(os.path.join(root_dir, "species_list_USA_birds_family.txt"),'w')
# f.write('\n'.join(str(i) for i in species_family))
# f.close()

# f = open(os.path.join(root_dir, "species_list_USA_birds_order.txt"),'w')
# f.write('\n'.join(str(i) for i in species_order))
# f.close()

def save_mapping(species_l, file_name, out_dir=None):
    """Group the positions of ``species_l`` by label and save the result as JSON.

    The saved object maps each distinct label (as a string, in sorted order)
    to the list of positions at which that label occurs in ``species_l``.
    After writing, the file is read back and its number of keys and first
    value are printed as a quick check.

    Args:
        species_l: sequence of labels, one per species.
        file_name: name of the JSON file to write.
        out_dir: folder to write into; defaults to the module-level
            ``root_dir`` when omitted.
    """
    if out_dir is None:
        out_dir = root_dir

    species_arr = np.asarray(species_l)
    unique_values = np.unique(species_arr)
    print(len(unique_values))

    # Single pass over the labels; keys are created in np.unique's sorted order.
    all_indices = {str(value): [] for value in unique_values}
    for position, value in enumerate(species_arr):
        all_indices[str(value)].append(position)

    # Every position must land in exactly one group. Checked against the
    # argument itself rather than an unrelated global list.
    assert sum(len(positions) for positions in all_indices.values()) == len(species_arr)

    mapping_path = os.path.join(out_dir, file_name)
    with open(mapping_path, 'w') as f:
        json.dump(all_indices, f)

    # Read the file back to confirm it round-trips.
    with open(mapping_path, 'r') as f:
        dict_loaded = json.load(f)

    print(len(dict_loaded.keys()))
    if dict_loaded:  # nothing to show for an empty input
        print(list(dict_loaded.values())[0])

# Write one mapping file for the 'order' labels and one for the 'family' labels.
save_mapping(species_l=species_order, file_name='bird_species_order_mapping.json')
save_mapping(species_l=species_family, file_name='bird_species_family_mapping.json')

## Analysis on preds

In [None]:
from tqdm import tqdm
import glob, os
import json
import numpy as np

# For every saved prediction file, load the target with the same hotspot id and
# print each (target, prediction) pair whose target value is positive.
targets_path = "/network/projects/ecosystem-embeddings/SatBird_data_v2/USA_summer/corrected_targets"
preds_path = "/home/mila/h/hager.radi/scratch/ecosystem-embedding/baseline_joint_rtran_resnet18_RGBNIR_ENV_8/birds_preds"

# Only the first 670 prediction entries are compared.
# NOTE(review): presumably the number of bird species in the targets - confirm.
N_PRED_ENTRIES = 670

for file_name in tqdm(glob.glob(os.path.join(preds_path, '*.npy'))):
    # splitext keeps ids that contain dots intact (split('.')[0] truncated them).
    hotspot_id = os.path.splitext(os.path.basename(file_name))[0]
    pred = np.load(file_name)[0:N_PRED_ENTRIES]
    with open(os.path.join(targets_path, hotspot_id + '.json')) as f:
        y = json.load(f)['probs']
    for y_, pred_ in zip(y, pred):
        if y_ > 0:
            print(y_, pred_)