In [None]:
pip install ijson

In [None]:
import os
import json
import glob
import re
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from collections import Counter, defaultdict
import ijson
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [None]:
import shutil
import os

# Mirror the `utility_scripts` folder shipped with the Kaggle dataset into the
# working directory.
utility_scripts_src = '/kaggle/input/dataset/utility_scripts'
utility_scripts_dst = '/kaggle/working/utility_scripts'

if not os.path.exists(utility_scripts_src):
    # Nothing to copy: tell the user where the folder was expected.
    print("Utility scripts non trouvés dans /kaggle/input/dataset/utility_scripts")
    print("Assurez-vous que le dataset 'dataset' contient le dossier utility_scripts")
else:
    # Remove any previous copy first so the destination exactly mirrors the source.
    if os.path.exists(utility_scripts_dst):
        shutil.rmtree(utility_scripts_dst)
    shutil.copytree(utility_scripts_src, utility_scripts_dst)
    print("Utility scripts copiés vers /kaggle/working/utility_scripts")

In [None]:
# Import everything from utility_scripts
import sys
sys.path.append('..')
from utility_scripts.tree import tree
from utility_scripts.taxonomy import parse_taxonomy
from utility_scripts.mapping import build_taxa_maps
from utility_scripts.corruption_scan import verify_image_validity

In [None]:
# --- Inputs ---------------------------------------------------------------
# Root of the dataset; every split path below is derived from it.
base_folder = "/kaggle/input/inaturalist-insects/"

# Split directories
train_folder = os.path.join(base_folder, "train/train")
train_mini_folder = os.path.join(base_folder, "train_mini/train_mini")
val_folder = os.path.join(base_folder, "val/val")
public_test_folder = os.path.join(base_folder, "public_test/public_test")

# JSON files, one per split
train_json = os.path.join(base_folder, "train-json/train.json")
train_mini_json = os.path.join(base_folder, "train_mini-json/train_mini.json")
val_json = os.path.join(base_folder, "val-json/val.json")
public_test_json = os.path.join(base_folder, "public_test-json/public_test.json")

# --- Outputs --------------------------------------------------------------
working_dir = "/kaggle/working/"

# Both output files live directly under the working directory.
tree_file = os.path.join(working_dir, "tree.txt")
hierarchy_map_file = os.path.join(working_dir, "hierarchy_map.json")


In [None]:
# Smoke test of the two path-based helpers on a single sample path.
example = "train/train/00980_Animalia_Arthropoda_Insecta_Lepidoptera_Erebidae_Arctia_virginalis/464f3a34-4c04-4eb3-afa2-6cb7444c3fa3.jpg"
taxonomy = parse_taxonomy(example)
print("Résultat:", taxonomy)
# Fixed: the helper imported from utility_scripts.corruption_scan is
# `verify_image_validity`; the previous spelling (`verify_image_validy`)
# raised NameError.
# NOTE(review): `example` is a relative path; if the helper opens the file it
# probably needs os.path.join(base_folder, example) — confirm against the helper.
validity = verify_image_validity(example)
print("Image valide" if validity else "Image invalide")

In [None]:
# Run the `tree` helper on the dataset root and report the value it returns.
# NOTE(review): the second argument (2) is presumably a depth limit — confirm
# against utility_scripts.tree.
total = tree(base_folder, 2)
print("\nNombre total de fichiers : {}".format(total))

In [None]:
def load_annotated_images(json_path):
    """Map each image file name to its (latitude, longitude) pair.

    The file is streamed with ``ijson`` (items under ``images.item``), so the
    whole JSON document is never held in memory at once.

    Args:
        json_path: Path to a JSON file containing an ``images`` array whose
            entries may carry ``file_name``, ``latitude`` and ``longitude``.

    Returns:
        dict mapping ``file_name`` (``''`` when the key is absent) to a
        ``(lat, lon)`` tuple of floats. A missing or null coordinate is
        stored as ``0.0``. When a file name appears several times, the last
        entry wins.
    """
    def _coordinate(raw_value):
        # Null / absent coordinates fall back to 0.0; everything else is cast.
        return 0.0 if raw_value is None else float(raw_value)

    with open(json_path, 'rb') as stream:
        return {
            record.get('file_name', ''): (
                _coordinate(record.get('latitude')),
                _coordinate(record.get('longitude')),
            )
            for record in ijson.items(stream, 'images.item')
        }

In [None]:
def compute_stats(full_taxa_map, full_geo_db, species_encountered):
    """Compute summary statistics over the taxonomy and geolocation maps.

    Args:
        full_taxa_map: Mapping taxon key -> dict with at least the keys
            ``'ordre'``, ``'famille'``, ``'genre'`` and ``'espece'``.
        full_geo_db: Mapping taxon key -> sized collection of geo entries.
        species_encountered: Mapping name -> sized collection with one entry
            per directory carrying that name (a name with more than one entry
            is counted as a homonym).

    Returns:
        Tuple ``(stats, ordre_count, famille_count, genre_count)`` where
        ``stats`` is a dict with keys ``total_dirs``, ``unique_taxa``,
        ``geo_coverage``, ``homonyms``, ``hierarchy`` and
        ``taxon_geo_counts``, and the three counters give the number of taxa
        per order / family / genus.

    Raises:
        KeyError: If a hierarchy entry lacks one of the four level keys.
    """
    hierarchies = list(full_taxa_map.values())
    ordre_count = Counter(h['ordre'] for h in hierarchies)
    famille_count = Counter(h['famille'] for h in hierarchies)
    genre_count = Counter(h['genre'] for h in hierarchies)
    espece_count = Counter(h['espece'] for h in hierarchies)

    # Number of directories behind each encountered name.
    dirs_per_name = [len(dirs) for dirs in species_encountered.values()]
    homonym_sizes = [n for n in dirs_per_name if n > 1]

    taxon_geo_counts = {str(k): len(v) for k, v in full_geo_db.items()}
    geo_taxa = len(full_geo_db)
    total_taxa = len(full_taxa_map)

    stats = {
        # Fixed: this used to be len(species_encountered), i.e. the number of
        # distinct names, which under-counts directories whenever homonyms exist.
        'total_dirs': sum(dirs_per_name),
        'unique_taxa': total_taxa,
        # Guard against an empty taxonomy map (avoid ZeroDivisionError).
        'geo_coverage': geo_taxa / total_taxa if total_taxa else 0,
        'homonyms': {'names': len(homonym_sizes), 'dirs': sum(homonym_sizes)},
        'hierarchy': {
            'ordres': len(ordre_count),
            'familles': len(famille_count),
            'genres': len(genre_count),
            # Previously computed but never reported.
            'especes': len(espece_count),
        },
        'taxon_geo_counts': taxon_geo_counts,
    }
    return stats, ordre_count, famille_count, genre_count

In [None]:
# Build the taxonomy / geolocation maps for the train_mini split, print a
# summary and persist the result.
# NOTE(review): `parse_taxonomy_folders` and `save_hierarchy_map` are not
# imported in any visible cell — confirm where they are defined.
annotated_images = load_annotated_images(train_mini_json)
species_encountered, unparsed_dirs = parse_taxonomy_folders(train_mini_folder)
full_taxa_map, full_geo_db = build_taxa_maps(
    species_encountered, annotated_images, train_mini_folder
)
stats, ordre_count, famille_count, genre_count = compute_stats(
    full_taxa_map, full_geo_db, species_encountered
)

# Summary report (messages intentionally left in French).
_homonyms = stats['homonyms']
_levels = stats['hierarchy']
_geo_percent = stats['geo_coverage'] * 100

print(f"✅ {stats['unique_taxa']} taxons (sur {stats['total_dirs']} dossiers)")
print(f"Homonymes: {_homonyms['names']} noms → {_homonyms['dirs']} dossiers")
print(f"Hiérarchie: {_levels['ordres']} ordres, {_levels['familles']} familles, {_levels['genres']} genres")
print(f"Géo: {len(full_geo_db)}/{len(full_taxa_map)} ({_geo_percent:.1f}%)")
print(f"Non parsés: {len(unparsed_dirs)}")

save_hierarchy_map(full_taxa_map, full_geo_db, stats, hierarchy_map_file)

In [None]:
class RobustImageFolder(Dataset):
    """ImageFolder-style dataset that hides files listed as corrupted.

    Every sub-directory of ``root`` is a class; every file found under it
    (recursively) is a sample. Samples whose path *relative to root* appears
    in ``corrupt_files`` are excluded from indexing but remain in
    ``self.samples``.

    Args:
        root: Directory containing one sub-directory per class.
        transform: Optional callable applied to the RGB PIL image.
        corrupt_files: Optional iterable of paths relative to ``root``.
            NOTE(review): entries given as absolute paths or bare file names
            will not match — confirm the format written by the corruption scan.

    Attributes:
        classes: Sorted list of class (directory) names.
        class_to_idx: Mapping class name -> integer label.
        samples: List of ``(path, class_index)`` for every file found.
        valid_indices: Indices into ``samples`` that are not flagged corrupted.
    """

    def __init__(self, root, transform=None, corrupt_files=None):
        self.root = root
        self.transform = transform
        self.corrupt_files = set(corrupt_files or [])

        self.classes, self.class_to_idx = self.find_classes(self.root)
        self.samples = self.make_dataset(self.root, self.class_to_idx)

        # Keep the full sample list and expose only the non-corrupted entries.
        self.valid_indices = [
            i for i, (path, _) in enumerate(self.samples)
            if os.path.relpath(path, self.root) not in self.corrupt_files
        ]

    def find_classes(self, directory):
        """Return ``(classes, class_to_idx)`` for the sub-directories of ``directory``."""
        classes = sorted(d.name for d in os.scandir(directory) if d.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def make_dataset(self, directory, class_to_idx):
        """Collect ``(path, class_index)`` for every file of every listed class.

        Fixed: the ``class_to_idx`` argument is now honoured; the previous
        implementation silently ignored it and always read ``self.class_to_idx``.
        No file-extension filtering is applied.
        """
        samples = []
        for target_class in sorted(class_to_idx.keys()):
            class_index = class_to_idx[target_class]
            target_dir = os.path.join(directory, target_class)
            # Sorted traversal keeps the sample order deterministic.
            for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
                for fname in sorted(fnames):
                    samples.append((os.path.join(root, fname), class_index))
        return samples

    def __getitem__(self, index):
        """Return ``(image, class_index)`` for the ``index``-th valid sample."""
        path, target = self.samples[self.valid_indices[index]]
        # Fixed: close the underlying file once the pixels are decoded;
        # `convert` returns an independent copy, so closing is safe and avoids
        # leaking one file handle per sample.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self):
        """Number of non-corrupted samples."""
        return len(self.valid_indices)

In [None]:
# Scan both splits for unreadable images; each call yields a result and the
# path of a log file.
# NOTE(review): `scan_corrupted_images` is not imported in any visible cell
# (only `verify_image_validity` is) — confirm where it is defined.
corrupted_train, train_log = scan_corrupted_images(
    '/kaggle/input/inaturalist-insects/train_mini/train_mini', max_workers=4
)
corrupted_val, val_log = scan_corrupted_images(
    '/kaggle/input/inaturalist-insects/val/val', max_workers=4
)

# Show the train log, truncated to 500 characters when the file is larger
# than 500 bytes.
with open(train_log) as log_handle:
    if os.path.getsize(train_log) > 500:
        _train_log_preview = log_handle.read()[:500] + "..."
    else:
        _train_log_preview = log_handle.read()
    print("\nCORROMPUS TRAIN:\n", _train_log_preview)

In [None]:
def _read_corrupt_list(log_path):
    """Return the stripped, non-empty lines of ``log_path`` that do not start with '#'."""
    with open(log_path) as log_file:
        return [
            entry.strip()
            for entry in log_file
            if entry.strip() and not entry.startswith('#')
        ]


# One list of flagged entries per split, read back from the working directory.
corrupt_train = _read_corrupt_list('/kaggle/working/corrupted_train_mini.txt')
corrupt_val = _read_corrupt_list('/kaggle/working/corrupted_val.txt')

In [None]:
# Fixed: `train_transforms` / `val_transforms` were used here but only defined
# in a later cell, so a fresh "Restart & Run All" raised NameError. They are
# defined here, before first use.
# NOTE(review): a later cell re-declares the same two pipelines; keep a single
# definition once the cells are reordered.
train_transforms = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.3),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# FULL datasets: no corrupted-file filtering, used only to report sizes.
train_dataset_full = RobustImageFolder(train_mini_folder)
val_dataset_full = RobustImageFolder(val_folder)

# CLEAN datasets: corrupted files are skipped and transforms are applied.
train_dataset = RobustImageFolder(train_mini_folder, train_transforms, corrupt_train)
val_dataset = RobustImageFolder(val_folder, val_transforms, corrupt_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

# Report how many samples the filtering removed.
print(f'Train: {len(train_dataset_full)} → {len(train_dataset)}')
print(f'Val: {len(val_dataset_full)} → {len(val_dataset)}')
print(f'Classes: {len(train_dataset.classes)}')

In [None]:
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch

# Per-channel normalisation constants shared by both pipelines.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training pipeline: random geometric and colour augmentation, then tensor
# conversion and normalisation.
_train_steps = [
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.3),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD),
]
train_transforms = transforms.Compose(_train_steps)

# Validation pipeline: deterministic resize + centre crop, same normalisation.
_val_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD),
]
val_transforms = transforms.Compose(_val_steps)

class HierarchicalInsectDataset(Dataset):
    """Image dataset yielding one label per taxonomy level.

    Wraps a ``datasets.ImageFolder`` built on ``root_dir`` and keeps only the
    samples whose class index is a key of ``hierarchy_map`` and whose file
    name is not listed in ``corrupt_files``.

    Args:
        root_dir: Directory passed to ``datasets.ImageFolder``.
        hierarchy_map: Mapping ImageFolder class index -> sequence of label
            ids; the sequence is converted with ``torch.tensor``.
            NOTE(review): keys must compare equal to the integer class
            indices. A mapping loaded from JSON has string keys and would
            filter out every sample — confirm how ``hierarchy_map`` is built.
        transform: Optional callable applied to the image returned by the
            inner ImageFolder.
        corrupt_files: Optional iterable of file *basenames* to skip.
    """

    def __init__(self, root_dir, hierarchy_map, transform=None, corrupt_files=None):
        self.root_dir = root_dir
        self.hierarchy_map = hierarchy_map
        self.transform = transform

        # Inner ImageFolder is built without a transform; it is applied in
        # __getitem__ instead.
        self.inner_dataset = datasets.ImageFolder(root_dir)

        # Fixed: membership used to be tested against a list rebuilt for each
        # sample (O(n*m)); a set built once makes each lookup O(1).
        corrupt_names = set(corrupt_files or ())

        # Keep samples that have hierarchy labels and are not flagged corrupted.
        self.valid_indices = []
        for i in range(len(self.inner_dataset)):
            class_idx = self.inner_dataset.targets[i]
            if class_idx not in self.hierarchy_map:
                continue
            path = self.inner_dataset.samples[i][0]
            if os.path.basename(path) not in corrupt_names:
                self.valid_indices.append(i)

        print(f"Dataset {root_dir}: {len(self.inner_dataset)} → {len(self.valid_indices)} valides")

    def __getitem__(self, idx):
        """Return ``(image, hier_labels)`` for the ``idx``-th valid sample."""
        real_idx = self.valid_indices[idx]
        img, class_idx = self.inner_dataset[real_idx]

        # Hierarchical labels, per the original comment:
        # [ordre_id, famille_id, genre_id, espece_id].
        hier_labels = torch.tensor(self.hierarchy_map[class_idx])

        if self.transform:
            img = self.transform(img)

        return img, hier_labels

    def __len__(self):
        """Number of samples that passed both filters."""
        return len(self.valid_indices)

# === USAGE ===
# Build the hierarchical train/val datasets and loaders, then pull one batch.
# NOTE(review): `final_hierarchy` is not defined in any visible cell — confirm
# where it is created before this cell runs.
data_dir = '/kaggle/input/inaturalist-insects/'
train_mini_folder = os.path.join(data_dir, 'train_mini/train_mini')
val_folder = os.path.join(data_dir, 'val/val')

# The dataset filters on file basenames only, so reduce the flagged paths first.
_corrupt_train_names = list(map(os.path.basename, corrupt_train))
_corrupt_val_names = list(map(os.path.basename, corrupt_val))

# Hierarchical datasets
train_dataset = HierarchicalInsectDataset(
    train_mini_folder,
    final_hierarchy,
    transform=train_transforms,
    corrupt_files=_corrupt_train_names,
)
val_dataset = HierarchicalInsectDataset(
    val_folder,
    final_hierarchy,
    transform=val_transforms,
    corrupt_files=_corrupt_val_names,
)

# Loaders share every option except shuffling.
_loader_options = dict(batch_size=32, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the class count in the message below is hard-coded, not computed.
print("✅ Datasets hiérarchiques:")
print(f"  Train: {len(train_dataset)} images, 2526 classes")
print(f"  Val: {len(val_dataset)} images")
print(f"  Labels: ordre/famille/genre/espece [device={device}]")

# Fetch a single batch as a smoke test; the original comment gives the
# expected shapes as [32,3,224,224] and [32,4].
img, labels = next(iter(train_loader))
print(f"Batch shape: {img.shape}, labels shape: {labels.shape}")

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import json

class HierarchicalMobileNetV3(nn.Module):
    """MobileNetV3-Large backbone with one independent linear head per taxonomy level.

    Args:
        num_ordre: Number of order classes.
        num_famille: Number of family classes.
        num_genre: Number of genus classes.
        num_espece: Number of species classes.

    ``forward`` returns either a tuple of four probability tensors
    (``return_probs=True``) or a single zero-padded logits tensor of shape
    ``[B, 4, C_max]`` where ``C_max`` is the largest head size. With the
    default sizes this is ``[B, 4, 2526]``, as before.
    """

    def __init__(self, num_ordre=17, num_famille=190, num_genre=1472, num_espece=2526):
        super().__init__()
        # ImageNet-pretrained backbone; only its feature extractor and pooling are reused.
        backbone = models.mobilenet_v3_large(weights='IMAGENET1K_V1')
        self.features = backbone.features
        self.avgpool = backbone.avgpool

        # Shared embedding; 960 must match the channel count produced by
        # `backbone.features`.
        self.fc_shared = nn.Sequential(
            nn.Linear(960, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3)
        )

        # Independent heads: each level reads the shared embedding directly.
        self.head_ordre = nn.Linear(512, num_ordre)
        self.head_famille = nn.Linear(512, num_famille)
        self.head_genre = nn.Linear(512, num_genre)
        self.head_espece = nn.Linear(512, num_espece)

    def forward(self, x, return_probs=False):
        """Run the network.

        Args:
            x: Input batch accepted by the backbone's feature extractor.
            return_probs: When True, return a 4-tuple of per-level softmax
                probabilities instead of the padded logits tensor.

        Returns:
            Tuple ``(ordre, famille, genre, espece)`` of probabilities, or a
            tensor ``[B, 4, C_max]`` whose row ``i`` holds the logits of
            level ``i`` in its first ``num_classes_i`` columns and zeros after.
        """
        x = self.features(x)
        x = self.avgpool(x)
        shared = self.fc_shared(torch.flatten(x, 1))

        level_logits = (
            self.head_ordre(shared),
            self.head_famille(shared),
            self.head_genre(shared),
            self.head_espece(shared),
        )

        if return_probs:
            return tuple(F.softmax(logits, dim=1) for logits in level_logits)

        # Fixed: the pad width was hard-coded to 2526, which failed as soon as
        # a head had more classes and ignored the constructor arguments. It is
        # now derived from the heads. `new_zeros` also keeps the dtype and
        # device of the logits instead of forcing float32.
        max_classes = max(logits.size(1) for logits in level_logits)
        preds = shared.new_zeros(shared.size(0), len(level_logits), max_classes)
        for level, logits in enumerate(level_logits):
            preds[:, level, :logits.size(1)] = logits

        return preds

# === STATS ===
# Read the per-level class counts from the 'stats' entry of hierarchy_labels.json.
# NOTE(review): this rebinds the notebook-level name `stats`, which an earlier
# cell uses for the output of compute_stats — confirm that is intended.
with open('hierarchy_labels.json') as labels_file:
    stats = json.load(labels_file)['stats']

# Values noted by the original author: 17 / 190 / 1472 / 2526.
num_ordre, num_famille, num_genre, num_espece = (
    stats[key] for key in ('ordres', 'familles', 'genres', 'total_classes')
)

# Model
model = HierarchicalMobileNetV3(num_ordre, num_famille, num_genre, num_espece).to(device)

class HierarchicalLoss(nn.Module):
    """Weighted average of per-level cross-entropy losses on padded logits.

    Args:
        num_classes_per_level: Number of classes at each level, in the order
            of the level axis of ``preds``.
        weights: One weight per level (default ``(1.0, 2.0, 5.0, 10.0)``).

    Raises:
        ValueError: If ``weights`` and ``num_classes_per_level`` differ in length.

    NOTE(review): a later cell defines another class with this same name and
    a different call contract (tuple of logits); running it rebinds the name.
    """

    def __init__(self, num_classes_per_level, weights=(1.0, 2.0, 5.0, 10.0)):
        super().__init__()
        self.num_classes = list(num_classes_per_level)
        if len(weights) != len(self.num_classes):
            raise ValueError(
                f"expected {len(self.num_classes)} weights, got {len(weights)}"
            )
        self.ce = nn.CrossEntropyLoss()
        # Registered as a buffer so `.to(device)` / `.cuda()` move it with the
        # module; non-persistent so `state_dict()` is unchanged.
        self.register_buffer(
            'weights', torch.tensor(list(weights), dtype=torch.float32), persistent=False
        )

    def forward(self, preds, targets):
        """Compute the loss.

        Args:
            preds: Tensor indexed as ``[batch, level, class]``; only the first
                ``num_classes[level]`` columns of each level are used.
            targets: Tensor indexed as ``[batch, level]`` of class indices.

        Returns:
            Scalar tensor: ``sum_i w_i * CE_i / sum_i w_i``.
        """
        # Works even when the module itself was never moved to preds' device.
        weights = self.weights.to(preds.device)
        loss = 0
        for level, n_classes in enumerate(self.num_classes):
            # A plain slice is enough to drop the padding columns; no index
            # tensor has to be built.
            level_logits = preds[:, level, :n_classes]
            loss = loss + weights[level] * self.ce(level_logits, targets[:, level])
        return loss / weights.sum()

# Loss and optimiser over all model parameters.
_level_sizes = [num_ordre, num_famille, num_genre, num_espece]
criterion = HierarchicalLoss(_level_sizes)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

_param_total = sum(p.numel() for p in model.parameters())
print(f"Model: {_param_total:,} params")
print(f"✅ GPU: {next(model.parameters()).device}")

# One forward pass + loss on a single batch, without gradients.
# NOTE(review): the model is left in eval mode afterwards; call model.train()
# before any training loop.
model.eval()
with torch.no_grad():
    batch_img, batch_labels = next(iter(train_loader))
    batch_img_on_device = batch_img.to(device)
    preds = model(batch_img_on_device)
    batch_labels_on_device = batch_labels.to(device)
    loss = criterion(preds, batch_labels_on_device)
    print(f"✅ Test OK: preds={preds.shape}, loss={loss.item():.3f}")

In [None]:
class HierarchicalLoss(nn.Module):
    """Weighted sum of four per-level cross-entropy losses.

    Args:
        weights: Four loss weights, in the order ordre, famille, genre,
            espece (default ``(1.0, 1.5, 2.0, 3.0)``).

    NOTE(review): this redefines a class of the same name from an earlier
    cell with a different contract. ``forward`` here unpacks ``preds`` into
    four per-level logits tensors, whereas ``HierarchicalMobileNetV3.forward``
    returns one stacked ``[B, 4, C]`` tensor by default — confirm which
    pairing is intended.
    """

    def __init__(self, weights=(1.0, 1.5, 2.0, 3.0)):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        # Fixed: the default used to be a mutable list stored by reference, so
        # editing one instance's weights changed the default for every later
        # instance. Each instance now owns its own copy.
        self.weights = list(weights)

    def forward(self, preds, targets):
        """Compute the loss.

        Args:
            preds: Sequence of four logits tensors (ordre, famille, genre,
                espece), each indexed as ``[batch, class]``.
            targets: Tensor indexed as ``[batch, level]`` with four level columns.

        Returns:
            Scalar tensor: ``sum_i w_i * CE_i`` (not normalised by the weights).
        """
        ordre_p, fam_p, genre_p, esp_p = preds
        level_losses = [
            self.ce(ordre_p, targets[:, 0]),
            self.ce(fam_p, targets[:, 1]),
            self.ce(genre_p, targets[:, 2]),
            self.ce(esp_p, targets[:, 3]),
        ]
        return sum(w * l for w, l in zip(self.weights, level_losses))