# In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import KFold
from PIL import Image
from tqdm.auto import tqdm
import timm
from torch.optim import AdamW

# Set the Kaggle run-type environment variable to 'Batch' for this process.
os.environ['KAGGLE_KERNEL_RUN_TYPE'] = 'Batch'
# Notebook-level switches. In the visible code only TRAIN is read later on;
# DEBUG and LOCAL are not referenced.
DEBUG = False
TRAIN = True
LOCAL = False

# Register `progress_apply` on pandas objects (used for image loading below).
tqdm.pandas()

# Root of the competition data. NOTE: it already ends with a slash, so the
# f-string below produces a doubled slash in the path.
DATA_ROOT = '/kaggle/input/csiro-biomass/'

train_df = pd.read_csv(f'{DATA_ROOT}/train.csv')
print(f"Données chargées : {len(train_df)} lignes")
print(train_df.head())

# sample_id is split on '__' into a prefix and a suffix; the suffix is
# checked to be equal to target_name on every row.
train_df[['sample_id_prefix', 'sample_id_suffix']] = train_df.sample_id.str.split('__', expand=True)
assert (train_df.sample_id_suffix == train_df.target_name).all(), "Erreur : identifiants ne correspondent pas"

# Group rows by the per-image metadata columns and turn each group's
# (target_name -> target) pairs into columns. Later code reads the result as
# one row per image with columns such as Dry_Green_g, GDM_g, Dry_Total_g.
cols = ['sample_id_prefix', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
agg_train_df = train_df.groupby(cols).apply(
    lambda df: df.set_index('target_name').target,
    include_groups=False
)
# Move the grouping keys back to regular columns (index becomes 0..n-1).
agg_train_df.reset_index(inplace=True)
agg_train_df.columns.name = None

# Every image is opened, converted to RGB and kept in the DataFrame, i.e. the
# whole training set is held in memory as PIL images.
print("Chargement des images en mémoire...")
agg_train_df['image'] = agg_train_df.image_path.progress_apply(
    lambda path: Image.open(DATA_ROOT + path).convert('RGB')
)
print(f"{len(agg_train_df)} images chargées")
# Show how many images exist for each (width, height) size.
print(agg_train_df['image'].apply(lambda x: x.size).value_counts())

# Sanity check: Dry_Green_g + Dry_Clover_g must match GDM_g (absolute
# tolerance 1e-4) on more than 99% of the rows.
assert np.isclose(
    agg_train_df[['Dry_Green_g', 'Dry_Clover_g']].sum(axis=1),
    agg_train_df['GDM_g'], atol=1e-04
).mean() > 0.99, "Erreur : GDM ≠ Green + Clover"

# Sanity check: GDM_g + Dry_Dead_g must match Dry_Total_g (absolute
# tolerance 1e-4) on more than 99% of the rows.
assert np.isclose(
    agg_train_df[['GDM_g', 'Dry_Dead_g']].sum(axis=1),
    agg_train_df['Dry_Total_g'], atol=1e-04
).mean() > 0.99, "Erreur : Total ≠ GDM + Dead"

print("Relations entre cibles vérifiées")

# One figure with three side-by-side 50-bin histograms, one per target
# (Dry_Green_g, Dry_Clover_g, Dry_Dead_g).
plt.figure(figsize=(16, 4))
plt.subplot(1, 3, 1)
agg_train_df.Dry_Green_g.plot(kind='hist', bins=50, color='green')
plt.title('Végétation verte sèche (g)')
plt.subplot(1, 3, 2)
agg_train_df.Dry_Clover_g.plot(kind='hist', bins=50, color='lightgreen')
plt.title('Trèfle sec (g)')
plt.subplot(1, 3, 3)
agg_train_df.Dry_Dead_g.plot(kind='hist', bins=50, color='brown')
plt.title('Matière morte sèche (g)')
plt.tight_layout()
# The figure is written to the current working directory, then displayed.
plt.savefig('distribution_cibles.png')
plt.show()

# Shuffled K-fold split with a fixed seed so fold membership is reproducible.
NFOLD = 5
kfold = KFold(n_splits=NFOLD, shuffle=True, random_state=42)

# -1 marks "not assigned yet"; each row receives the number of the fold in
# which it is used for validation.
agg_train_df['fold'] = -1
for i, (trn_idx, val_idx) in enumerate(kfold.split(agg_train_df.index)):
    # NOTE: kfold.split yields positional indices while .loc selects by label;
    # this relies on the index being the default 0..n-1 range left by the
    # earlier reset_index() call.
    agg_train_df.loc[val_idx, 'fold'] = i

print(f"Données découpées en {NFOLD} folds")
print(agg_train_df['fold'].value_counts().sort_index())

class DatasetPaturage(Dataset):
    """Dataset over a DataFrame holding one in-memory image and three targets per row.

    The frame must provide an ``image`` column plus the ``Dry_Green_g``,
    ``Dry_Clover_g`` and ``Dry_Dead_g`` columns. Its index is reset so that
    rows are addressed by position.
    """

    # Order of the target values in every returned sample.
    _COLONNES_CIBLES = ('Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g')

    def __init__(self, data, transform=None):
        self.transform = transform
        self.data = data.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, [green, clover, dead])`` for the row at position ``idx``.

        The targets are returned as a plain list of three scalars; the image
        goes through ``transform`` first when a truthy one was supplied.
        """
        ligne = self.data.iloc[idx]
        image = self.transform(ligne.image) if self.transform else ligne.image
        return image, [ligne[colonne] for colonne in self._COLONNES_CIBLES]

def creer_dataloader(data, taille_image=(256, 256), batch_size=32, melanger=True, augmentation=True):
    """Wrap ``data`` in a ``DatasetPaturage`` and return a ``DataLoader`` over it.

    Args:
        data: DataFrame accepted by ``DatasetPaturage``.
        taille_image: Size handed to ``transforms.Resize``.
        batch_size: Number of samples per batch.
        melanger: Whether the loader shuffles the samples.
        augmentation: When true, random horizontal/vertical flips and a colour
            jitter are inserted between the resize and the tensor conversion.

    Returns:
        A ``DataLoader`` using 2 workers and pinned memory.
    """
    etapes = [transforms.Resize(taille_image)]
    if augmentation:
        etapes += [
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
        ]
    # Tensor conversion and normalisation close the pipeline in both modes.
    etapes += [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]

    dataset = DatasetPaturage(data, transform=transforms.Compose(etapes))
    print(f'Taille du dataset : {len(dataset)} échantillons')
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=melanger,
        num_workers=2,
        pin_memory=True,
    )

NOM_MODELE = 'efficientnet_b2'
# Only the architecture's configuration is needed here, so the throwaway model
# is built without pretrained weights: this avoids a pointless weight
# download/load and keeps offline (inference-only) runs from failing at this
# point. The training code creates its own pretrained model later.
model_test = timm.create_model(NOM_MODELE, pretrained=False, num_classes=3)
# input_size is indexed from position 1 onwards, dropping its first entry;
# the remainder is used as the resize target for the data loaders.
TAILLE_IMAGE_CIBLE = model_test.pretrained_cfg['input_size'][1:]
print(f"Modèle {NOM_MODELE} — Taille d'entrée : {TAILLE_IMAGE_CIBLE}")
del model_test

def r2_pondere(y_true: np.ndarray, y_pred: np.ndarray):
    """Compute the weighted mean of per-column R² scores over five columns.

    Args:
        y_true: 2-D array of reference values; columns 0..4 are read.
        y_pred: 2-D array of predictions with the same layout.

    Returns:
        ``(score, scores_r2)`` where ``scores_r2`` holds the five per-column
        R² values and ``score`` is their average weighted by
        ``[0.1, 0.1, 0.1, 0.2, 0.5]``.
    """
    poids = np.array([0.1, 0.1, 0.1, 0.2, 0.5])

    def _r2_colonne(reel, predit):
        residus = np.sum((reel - predit) ** 2)
        variance = np.sum((reel - np.mean(reel)) ** 2)
        # A column without variance has no defined R²; it is scored 0.0.
        if variance > 0:
            return 1 - residus / variance
        return 0.0

    scores_r2 = np.array(
        [_r2_colonne(y_true[:, k], y_pred[:, k]) for k in range(5)]
    )
    return np.sum(scores_r2 * poids) / np.sum(poids), scores_r2

def calculer_metrique(sorties, cibles):
    """Score three-column predictions with ``r2_pondere``.

    Both arrays are widened with two derived columns, the sum of the first
    two columns and the sum of all columns, so that five columns are scored.

    Args:
        sorties: 2-D array of predictions.
        cibles: 2-D array of reference values with the same layout.

    Returns:
        Whatever ``r2_pondere`` returns for the widened arrays.
    """
    def _avec_sommes(valeurs):
        deux_premieres = valeurs[:, :2].sum(axis=1)
        toutes = valeurs.sum(axis=1)
        return np.column_stack((valeurs, deux_premieres, toutes))

    y_true = _avec_sommes(cibles)
    y_pred = _avec_sommes(sorties)
    return r2_pondere(y_true, y_pred)

def entrainer_epoch(model, dataloader, critere, optimiseur, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Network to train; switched to training mode.
        dataloader: Sized iterable of ``(images, cibles)`` batches, where
            ``cibles`` is a sequence of per-target tensors.
        critere: Loss callable applied to ``(outputs, targets)``.
        optimiseur: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    cumul = 0
    for lot_images, lot_cibles in dataloader:
        entrees = lot_images.to(device)
        # Stack the per-target tensors and transpose so that each row
        # holds the targets of one sample.
        attendu = torch.stack(lot_cibles).T.float().to(device)

        optimiseur.zero_grad()
        erreur = critere(model(entrees), attendu)
        erreur.backward()
        optimiseur.step()

        cumul += erreur.item()
    return cumul / len(dataloader)

def valider(model, dataloader, critere, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Sized iterable of ``(images, cibles)`` batches, where
            ``cibles`` is a sequence of per-target tensors.
        critere: Loss callable applied to ``(outputs, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, r2_val, scores_r2)``: the mean batch loss followed by
        the two values returned by ``calculer_metrique`` on all collected
        outputs and targets.
    """
    model.eval()
    cumul = 0
    lots_sorties = []
    lots_cibles = []
    with torch.no_grad():
        for lot_images, lot_cibles in dataloader:
            entrees = lot_images.to(device)
            attendu = torch.stack(lot_cibles).T.float().to(device)
            predit = model(entrees)
            cumul += critere(predit, attendu).item()
            # Keep CPU copies so the metric is computed over the whole set.
            lots_sorties.append(predit.detach().cpu())
            lots_cibles.append(attendu.detach().cpu())

    r2_val, scores_r2 = calculer_metrique(
        torch.cat(lots_sorties).numpy(),
        torch.cat(lots_cibles).numpy(),
    )
    return cumul / len(dataloader), r2_val, scores_r2

def entrainer_fold(data, fold):
    """Train a fresh pretrained model with ``fold`` held out for validation.

    Rows whose ``fold`` column differs from ``fold`` are used for training
    (with augmentation); rows equal to ``fold`` are used for validation.
    Each time the weighted R² improves, the weights are saved to
    ``{DOSSIER_SORTIE}/meilleur_modele_fold{fold}.pth``. Training stops early
    after ``patience`` consecutive epochs without improvement.

    Args:
        data: DataFrame with a ``fold`` column plus the columns required by
            ``creer_dataloader``.
        fold: Number of the fold used for validation.

    Returns:
        ``(historique, meilleur_score)``: a list with one dict per epoch
        (train loss, validation loss, weighted R², per-target R² list) and
        the best weighted R² reached.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Dispositif : {device}")

    batch_size = 32
    lr = 1e-3
    patience = 10
    nb_epochs = 100

    model = timm.create_model(NOM_MODELE, pretrained=True, num_classes=3)
    model.to(device)
    critere = nn.SmoothL1Loss()
    optimiseur = AdamW(model.parameters(), lr=lr)
    # The file only imports AdamW from torch.optim, so a bare `optim` name
    # does not exist; the scheduler is reached through the `torch` package.
    # mode='max' because the monitored value (weighted R²) should increase.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimiseur, mode='max', factor=0.5, patience=patience // 2
    )

    train_loader = creer_dataloader(data[data.fold != fold], TAILLE_IMAGE_CIBLE, batch_size, melanger=True, augmentation=True)
    val_loader = creer_dataloader(data[data.fold == fold], TAILLE_IMAGE_CIBLE, batch_size, melanger=False, augmentation=False)

    historique = []
    meilleur_score = -float('inf')
    epochs_sans_amelioration = 0

    for epoch in range(nb_epochs):
        perte_train = entrainer_epoch(model, train_loader, critere, optimiseur, device)
        perte_val, r2_val, scores_r2 = valider(model, val_loader, critere, device)
        # The learning rate is halved when the validation R² stops improving.
        scheduler.step(r2_val)

        lr_actuel = optimiseur.param_groups[0]['lr']
        print(f"Epoch [{epoch:3d}/{nb_epochs}] | Train: {perte_train:.4f} | Val: {perte_val:.4f} | R²: {r2_val:.4f} | LR: {lr_actuel:.6f}")

        historique.append({
            'perte_train': perte_train,
            'perte_val': perte_val,
            'r2_pondere': r2_val,
            'scores_r2': scores_r2.tolist(),
        })

        if r2_val > meilleur_score:
            # New best epoch: reset the early-stopping counter and checkpoint.
            meilleur_score = r2_val
            epochs_sans_amelioration = 0
            torch.save(model.state_dict(), f'{DOSSIER_SORTIE}/meilleur_modele_fold{fold}.pth')
            print(f"Meilleur modèle sauvegardé (R²={meilleur_score:.4f})")
        else:
            epochs_sans_amelioration += 1

        if epochs_sans_amelioration >= patience:
            print(f"Arrêt anticipé epoch {epoch}")
            break

    print(f"\nFold {fold} terminé — Meilleur R² : {meilleur_score:.4f}")
    return historique, meilleur_score

# Directory that receives checkpoints, per-fold histories and curve images.
DOSSIER_SORTIE = 'modeles_entraines/'
os.makedirs(DOSSIER_SORTIE, exist_ok=True)

# Training driver: trains one model per fold, then reports the CV summary.
if TRAIN:
    tous_meilleurs_scores = []
    print("Début de l'entraînement K-Fold...\n")

    for i in range(NFOLD):
        print(f"\n{'='*50}")
        print(f"FOLD {i}/{NFOLD-1}")
        print(f"{'='*50}")

        historique, meilleur_score = entrainer_fold(agg_train_df, fold=i)
        tous_meilleurs_scores.append(meilleur_score)

        # Persist the per-epoch history as JSON Lines (one record per epoch).
        historique_df = pd.DataFrame(historique)
        historique_df.to_json(f'{DOSSIER_SORTIE}/historique_fold{i}.jsonl', orient='records', lines=True, force_ascii=False)

        # Left panel: train/validation loss; right panel: weighted R².
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.title(f'Perte — Fold {i}')
        plt.plot(historique_df.perte_train, label='Entraînement')
        plt.plot(historique_df.perte_val, label='Validation')
        plt.xlabel('Epoch')
        plt.ylabel('Perte (SmoothL1)')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.title(f'R² Pondéré — Fold {i}')
        plt.plot(historique_df.r2_pondere, color='green')
        plt.xlabel('Epoch')
        plt.ylabel('R²')
        plt.tight_layout()
        plt.savefig(f'{DOSSIER_SORTIE}/courbes_fold{i}.png')
        plt.show()
    
    # Summary: best score of every fold, then mean ± standard deviation.
    for i, score in enumerate(tous_meilleurs_scores):
        print(f"Fold {i} : R² = {score:.4f}")
    print(f"Moyenne CV : {np.mean(tous_meilleurs_scores):.4f} ± {np.std(tous_meilleurs_scores):.4f}")

def recuperer_derniers_modeles(model_root='/kaggle/input/csiro-simple-output/pytorch/default/'):
    """Return the ``modeles_entraines/`` path inside the highest-numbered version directory.

    Entries of ``model_root`` whose name does not parse as an integer are
    ignored. Version 1 is assumed when no entry has a number greater than 1.

    Args:
        model_root: Directory containing one sub-directory per version
            number. Defaults to the Kaggle model input used by this notebook.

    Returns:
        The string ``'{model_root}/{version}/modeles_entraines/'``.

    Raises:
        OSError: If ``model_root`` cannot be listed.
    """
    latest = 1
    latest_name = '1'
    for version in os.listdir(model_root):
        # Only a failed integer parse means "not a version directory"; any
        # other exception is a real error and must not be swallowed.
        try:
            v = int(version)
        except ValueError:
            continue
        if v > latest:
            # Remember the actual entry name so the returned path exists even
            # if the name is not the canonical spelling of the number.
            latest, latest_name = v, version
    return f'{model_root}/{latest_name}/modeles_entraines/'

# Checkpoint directory used for inference: the freshly written output folder
# when this run trained the models, otherwise the folder returned by
# recuperer_derniers_modeles().
MODELES_SAUVEGARDES = DOSSIER_SORTIE if TRAIN else recuperer_derniers_modeles()

def predire(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and return all outputs.

    Args:
        model: Network to evaluate; moved to ``device`` and put in eval mode.
        dataloader: Iterable of ``(images, cibles)`` pairs; the second
            element of each pair is ignored.
        device: Device used for the forward passes.

    Returns:
        NumPy array with the outputs of all batches concatenated in order.
    """
    model.to(device)
    model.eval()
    with torch.no_grad():
        lots = [
            model(images.to(device)).detach().cpu()
            for images, _ in dataloader
        ]
    return torch.cat(lots).numpy()

def predire_ensemble(dataloader):
    """Average the predictions of every ``.pth`` checkpoint in ``MODELES_SAUVEGARDES``.

    Checkpoints are processed in sorted filename order. For each one a
    ``NOM_MODELE`` network is built without pretrained weights, the saved
    state dict is loaded, and ``predire`` is run on ``dataloader``.

    Args:
        dataloader: Loader passed unchanged to ``predire`` for each model.

    Returns:
        Element-wise mean of the per-model prediction arrays.

    Raises:
        FileNotFoundError: If the directory contains no ``.pth`` file.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    chemins = sorted(Path(MODELES_SAUVEGARDES).glob('*.pth'))
    if len(chemins) == 0:
        raise FileNotFoundError(f"Aucun modèle .pth trouvé dans {MODELES_SAUVEGARDES}")

    predictions_par_modele = []
    for chemin in chemins:
        print(f"Chargement : {chemin.name}")
        reseau = timm.create_model(NOM_MODELE, pretrained=False, num_classes=3)
        # Weights are read onto the CPU; predire() moves the model to `device`.
        etat = torch.load(chemin, map_location='cpu', weights_only=True)
        reseau.load_state_dict(etat)
        predictions_par_modele.append(predire(reseau, dataloader, device))

    predictions_moyennes = np.mean(predictions_par_modele, axis=0)
    print(f"Ensemble de {len(predictions_par_modele)} modèles — prédictions moyennées")
    return predictions_moyennes

print("\nChargement des données de test...")
test_df = pd.read_csv(DATA_ROOT + 'test.csv')
# Placeholder target so the test table can go through the same reshaping and
# the same Dataset class as the training table.
test_df['target'] = 0.0
test_df[['sample_id_prefix', 'sample_id_suffix']] = test_df.sample_id.str.split('__', expand=True)

# Same groupby/apply reshaping as for the training table, keyed here only by
# the sample prefix and the image path.
cols_test = ['sample_id_prefix', 'image_path']
agg_test_df = test_df.groupby(cols_test).apply(
    lambda df: df.set_index('target_name').target,
    include_groups=False
)
agg_test_df.reset_index(inplace=True)
agg_test_df.columns.name = None

# All test images are opened, converted to RGB and kept in memory.
print("Chargement des images de test...")
agg_test_df['image'] = agg_test_df.image_path.progress_apply(
    lambda path: Image.open(DATA_ROOT + path).convert('RGB')
)
print(f"{len(agg_test_df)} images de test chargées")

# No shuffling and no augmentation, so predictions stay aligned with the rows
# of agg_test_df.
test_loader = creer_dataloader(agg_test_df, TAILLE_IMAGE_CIBLE, 32, melanger=False, augmentation=False)

print("\nGénération des prédictions...")
predictions = predire_ensemble(test_loader)

# The three prediction columns are written in the order Green, Clover, Dead;
# the two remaining targets are derived as sums of those columns.
agg_test_df[['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g']] = predictions
agg_test_df['GDM_g'] = agg_test_df.Dry_Green_g + agg_test_df.Dry_Clover_g
agg_test_df['Dry_Total_g'] = agg_test_df.GDM_g + agg_test_df.Dry_Dead_g

print("\nAperçu des prédictions :")
print(agg_test_df[['sample_id_prefix', 'Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g', 'GDM_g', 'Dry_Total_g']].head())

# Back to long format: one row per (sample prefix, target name) pair, with
# sample_id rebuilt as '<prefix>__<target_name>'.
cols_cibles = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
sub_df = agg_test_df.set_index('sample_id_prefix')[cols_cibles].stack()
sub_df = sub_df.reset_index()
sub_df.columns = ['sample_id_prefix', 'target_name', 'target']
sub_df['sample_id'] = sub_df.sample_id_prefix + '__' + sub_df.target_name

# Only sample_id and target are written to the submission file.
sub_df[['sample_id', 'target']].to_csv('submission.csv', index=False)
print("\nFichier submission.csv généré avec succès !")
print(sub_df[['sample_id', 'target']].head(10).to_string())