In [1]:
from pathlib import Path

# Set path to AMI Traps classes.txt file
ami_species_list_path = Path("../data/ami-dataset/archive/ami_traps/ami_traps_dataset/classes.txt")

# Read species names (Latin)
with open(ami_species_list_path, 'r', encoding='utf-8') as f:
    ami_species = [line.strip() for line in f.readlines()]

print(f"Found {len(ami_species)} moth species in AMI Traps:")
print(ami_species[:10])


Found 11432 moth species in AMI Traps:
['Abablemma discipuncta', 'Abablemma grandimacula', 'Abablemma sp.', 'Abablemma ulopus', 'Abagrotis alternata', 'Abagrotis anchocelioides', 'Abagrotis benjamini', 'Abagrotis brunneipennis', 'Abagrotis cupida', 'Abagrotis forbesi']


In [2]:

import pandas as pd
from pathlib import Path

# Load your mapping
mapping_path = Path("../data/mappings/kaggle_common_to_latin.csv")
mapping_df = pd.read_csv(mapping_path, encoding='latin1')


# Extract Latin names from mapped common names that are in your Kaggle dataset
kaggle_latin_names = mapping_df['latin_name'].tolist()

# Compare with AMI species
overlapping_species = set(ami_species).intersection(kaggle_latin_names)
print(f"Found {len(overlapping_species)} overlapping species:")
print(sorted(overlapping_species))

Found 43 overlapping species:
['Abraxas grossulariata', 'Acherontia atropos', 'Actias luna', 'Anania funebris', 'Antheraea polyphemus', 'Apantesis vittata', 'Aporophyla nigra', 'Arctia caja', 'Atolmis rubricollis', 'Automeris io', 'Catocala electa', 'Comibaena bajularia', 'Cyclophora puppillaria', 'Daphnis nerii', 'Deilephila elpenor', 'Diaphora mendica', 'Dryocampa rubicunda', 'Endromis versicolora', 'Hyles lineata', 'Hypercompe scribonia', 'Idaea muricata', 'Macroglossum stellatarum', 'Ourapteryx sambucaria', 'Palpita vitrealis', 'Plagodis phlogosaria', 'Plemyria rubiginata', 'Pterophoridae', 'Pyropteron chrysidiformis', 'Rhodometra sacraria', 'Saturnia pavonia', 'Schinia arcigera', 'Scopula limboundata', 'Scotopteryx luridata', 'Sesia apiformis', 'Sesiidae', 'Smerinthus ocellata', 'Thalera fimbrialis', 'Thyatira batis', 'Tyria jacobaeae', 'Udea ferrugalis', 'Xestia xanthographa', 'Yponomeuta evonymella', 'Zygaena filipendulae']


In [3]:
# Filter only the rows in your mapping that are in the overlap
overlapping_df = mapping_df[mapping_df['latin_name'].isin(overlapping_species)]

# Show them sorted by Latin name
print(overlapping_df.sort_values('latin_name'))


                  common_name                 latin_name
0                 MAGPIE MOTH      Abraxas grossulariata
1       DEATHS HEAD HAWK MOTH         Acherontia atropos
2                   LUNA MOTH                Actias luna
3    WHITE SPOTTED SABLE MOTH            Anania funebris
4             POLYPHEMUS MOTH       Antheraea polyphemus
5           BANDED TIGER MOTH          Apantesis vittata
6           BLACK RUSTIC MOTH           Aporophyla nigra
7           GARDEN TIGER MOTH                Arctia caja
9     RED NECKED FOOTMAN MOTH        Atolmis rubricollis
11                    IO MOTH               Automeris io
13        ROSY UNDERWING MOTH            Catocala electa
16      BLOTCHED EMERALD MOTH        Comibaena bajularia
18               BLAIRS MOCHA     Cyclophora puppillaria
19         OLEANDER HAWK MOTH              Daphnis nerii
20         ELEPHANT HAWK MOTH         Deilephila elpenor
21                MUSLIN MOTH           Diaphora mendica
22            ROSY MAPLE MOTH  

In [4]:
from pathlib import Path
import pandas as pd

metadata_path = Path("../data/ami-dataset/archive/ami_traps/metadata")

# List all CSV files in metadata folder
list(metadata_path.glob("*.csv"))


[WindowsPath('../data/ami-dataset/archive/ami_traps/metadata/ami-gbif_taxonomy_map.csv'),
 WindowsPath('../data/ami-dataset/archive/ami_traps/metadata/ami-traps_taxonomy_map.csv')]

In [6]:
import json
from pathlib import Path

# Path to insect crops
insect_crops_path = Path("../data/ami-dataset/archive/ami_traps/insect_crops")

# Find the first .json file
json_files = list(insect_crops_path.glob("*.json"))
print(f"Found {len(json_files)} JSON files.")

# Load the first one to inspect structure
sample_json = json_files[0]
with open(sample_json, 'r') as f:
    data = json.load(f)

print(f"Sample JSON file: {sample_json.name}")
print(json.dumps(data, indent=2))  # Pretty-print



Found 2 JSON files.
Sample JSON file: binary_labels.json
{
  "1.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "2.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "3.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "4.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "5.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "6.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "7.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "8.png": {
    "label": "Moth",
    "region": "NorthEasternAmerica"
  },
  "9.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "10.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "11.png": {
    "label": "Non-Moth",
    "region": "NorthEasternAmerica"
  },
  "12.png": {
    "label": "Moth",
    "region": "NorthEasternAmerica"
  }

In [7]:
# List all JSON files in insect_crops
json_files = list(insect_crops_path.glob("*.json"))
print("JSON files found in insect_crops:")
for f in json_files:
    print(f.name)


JSON files found in insect_crops:
binary_labels.json
fgrained_labels.json


In [8]:
import json

fgrained_labels_path = insect_crops_path / "fgrained_labels.json"

with open(fgrained_labels_path, 'r') as f:
    fine_grained_labels = json.load(f)

# Show number of entries and sample keys & values
print(f"Total cropped insect images labeled: {len(fine_grained_labels)}")
sample_items = list(fine_grained_labels.items())[:5]
for img_name, label_info in sample_items:
    print(f"{img_name}: {label_info}")


Total cropped insect images labeled: 14105
8.png: {'taxon_rank': 'FAMILY', 'label': 'Tortricidae', 'synonym': None, 'speciesKey': None, 'acceptedTaxonKey': None, 'region': 'NorthEasternAmerica'}
12.png: {'taxon_rank': 'SPECIES', 'label': 'Dryocampa rubicunda', 'synonym': None, 'speciesKey': 1865862, 'acceptedTaxonKey': 1865862, 'region': 'NorthEasternAmerica'}
13.png: {'taxon_rank': 'SPECIES', 'label': 'Palpita magniferalis', 'synonym': None, 'speciesKey': 1889838, 'acceptedTaxonKey': 1889838, 'region': 'NorthEasternAmerica'}
14.png: {'taxon_rank': 'SPECIES', 'label': 'Palpita magniferalis', 'synonym': None, 'speciesKey': 1889838, 'acceptedTaxonKey': 1889838, 'region': 'NorthEasternAmerica'}
15.png: {'taxon_rank': 'GENUS', 'label': 'Baileya sp.', 'synonym': None, 'speciesKey': None, 'acceptedTaxonKey': None, 'region': 'NorthEasternAmerica'}


In [9]:
from collections import Counter

# Filter species-level images only
species_images = {img: info for img, info in fine_grained_labels.items() if info['taxon_rank'] == 'SPECIES'}

# Count images per species for overlapping species
counts = Counter()
for img, info in species_images.items():
    species_name = info['label']
    if species_name in overlapping_species:
        counts[species_name] += 1

# Print counts sorted by descending image count
for species, count in counts.most_common():
    print(f"{species}: {count} images")

print(f"\nTotal species with images: {len(counts)}")
print(f"Total images counted for overlapping species: {sum(counts.values())}")


Dryocampa rubicunda: 81 images
Automeris io: 31 images
Actias luna: 14 images
Xestia xanthographa: 12 images
Scopula limboundata: 7 images
Diaphora mendica: 6 images
Deilephila elpenor: 3 images
Plagodis phlogosaria: 2 images
Atolmis rubricollis: 1 images

Total species with images: 9
Total images counted for overlapping species: 157
