In [1]:
pip install ijson

Collecting ijson
  Downloading ijson-3.4.0.post0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Downloading ijson-3.4.0.post0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.4/134.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.4.0.post0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import json
import glob
import re
from pathlib import Path
import torch
import torch.nn as nn
from torchvision import transforms, datasets, models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from collections import Counter, defaultdict
import ijson

**Liste des imports globaux**

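- torch : définition et entraînement du modèle (tenseurs, couches `torch.nn`, `Dataset` / `DataLoader`)
- torchvision : transformations d'images, jeux de données et modèles
- PIL : chargement et traitement des images
- pandas : manipulation de données tabulaires
- ijson : lecture de fichiers JSON en flux (par morceaux), sans charger le fichier entier en mémoire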

In [None]:
def count_files_and_display_tree(
    path: str | os.PathLike,
    indent: int = 0,
    show_tree: bool = True,
    export: bool = False,
    _file=None,
) -> int:
    """
    Compte les fichiers dans un dossier (et ses sous-dossiers) et affiche l'arborescence.

    Paramètres
    ----------
    path : str | PathLike
        Dossier racine à analyser.
    indent : int
        Niveau d'indentation (usage interne pour la récursion).
    show_tree : bool
        Si True, affiche l'arborescence dans la console.
    export : bool
        Si True, écrit l'arborescence complète dans 'tree.txt'.
    _file :
        Gestion interne du fichier ouvert pour l'export (ne pas utiliser directement).

    Retour
    ------
    int
        Nombre total de fichiers dans 'path' et ses sous-dossiers.
    """
    path = Path(path)
    file_count = 0
    subfolder_counts: list[tuple[Path, int]] = []

    close_file = False
    if export and _file is None:
        _file = open("/kaggle/working/tree.txt", "w", encoding="utf-8")
        close_file = True

    try:
        entries = list(path.iterdir())
    except FileNotFoundError:
        raise FileNotFoundError(f"Chemin introuvable : {path}")
    except PermissionError:
        return 0

    files = [e for e in entries if e.is_file()]
    dirs = [e for e in entries if e.is_dir()]

    file_count = len(files)

    for d in sorted(dirs, key=lambda p: p.name.lower()):
        sub_count = count_files_and_display_tree(
            d,
            indent=indent + 4,
            show_tree=show_tree,
            export=export,
            _file=_file,
        )
        subfolder_counts.append((d, sub_count))

    line = " " * indent + f"{path.name}/ [{file_count} fichiers]\n"

    if show_tree:
        print(line, end="")

    if export and _file is not None:
        _file.write(line)

    for subfolder, sub_count in subfolder_counts:
        sub_line = " " * (indent + 2) + f"{subfolder.name}/ [{sub_count} fichiers]\n"
        if show_tree:
            print(sub_line, end="")
        if export and _file is not None:
            _file.write(sub_line)

    total_files = file_count + sum(count for _, count in subfolder_counts)

    if close_file and _file is not None:
        _file.close()

    return total_files


In [None]:
# Walk the dataset directory, print its tree starting at a two-space indent,
# then report how many files were found in total.
base_folder = "/kaggle/input/inaturalist-insects/"
total = count_files_and_display_tree(path=base_folder, indent=2)
print(f"\nNombre total de fichiers : {total}")

In [None]:
import os
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

data_dir = '/kaggle/input/inaturalist-insects/'

# Per-channel statistics handed to Normalize in both pipelines (ImageNet values).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Settings shared by the training and validation loaders.
loader_options = {"batch_size": 32, "num_workers": 2}

# Training pipeline: random augmentations first, then tensor conversion and normalisation.
train_transforms = transforms.Compose([
    transforms.RandomRotation(degrees=30),       # random rotation of up to 30 degrees
    transforms.RandomHorizontalFlip(),           # random horizontal flip
    transforms.RandomVerticalFlip(),             # random vertical flip
    transforms.RandomResizedCrop(size=224),      # random crop, resized to 224x224
    transforms.ColorJitter(                      # random colour adjustments
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1,
    ),
    transforms.ToTensor(),                       # PIL image -> PyTorch tensor
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Validation pipeline: deterministic resize and centre crop, same normalisation.
val_transforms = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# ImageFolder derives one class per sub-directory of the given root.
train_dataset = datasets.ImageFolder(
    root=os.path.join(data_dir, 'train/train'),
    transform=train_transforms,
)
val_dataset = datasets.ImageFolder(
    root=os.path.join(data_dir, 'val/val'),
    transform=val_transforms,
)

# Only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, **loader_options)

print(f'Nombre de classes: {len(train_dataset.classes)}')
print(f'Taille train dataset: {len(train_dataset)}')
print(f'Taille val dataset: {len(val_dataset)}')

In [None]:
# Pull a single batch to sanity-check what the training loader yields.
images, labels = next(iter(train_loader))

# Read the batch size from the loader so this check follows its configuration.
expected_batch = train_loader.batch_size

# torch.Size is a tuple subclass and a tuple never compares equal to a list,
# so the expected shapes must be written as tuples for these checks to pass.
print(images.shape)
assert images.shape == (expected_batch, 3, 224, 224), (
    f"unexpected image batch shape: {tuple(images.shape)}"
)
print(labels.shape)
assert labels.shape == (expected_batch,), (
    f"unexpected label batch shape: {tuple(labels.shape)}"
)