# Explore 🕵 provided data

In [None]:
# List what is available under the Kaggle input directory.
!ls -l /kaggle/input

# Root directory of the competition dataset; data paths below are built from it.
PATH_DATASET = "/kaggle/input/happy-whale-and-dolphin"

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Read the training annotations, preview them and report basic statistics.
train_csv_path = os.path.join(PATH_DATASET, "train.csv")
df_train = pd.read_csv(train_csv_path)
display(df_train.head())

# dropna=False counts a missing id as its own value, exactly like len(unique()).
nb_unique_ids = df_train['individual_id'].nunique(dropna=False)
print(f"Dataset size: {len(df_train)}")
print(f"Unique ids: {nb_unique_ids}")

Let's see which top individuals (by number of records) we have in the database...

In [None]:
import numpy as np
from pprint import pprint

# Per species: how many training records each individual has (value_counts
# orders them from the most to the least frequent).
species_individuals = {
    name: dfg["individual_id"].value_counts()
    for name, dfg in df_train.groupby("species")
}

# Table with one column per species holding the log-counts of its individuals.
# Columns are padded to the length of the longest one; `default=0` keeps the
# cell from crashing on an empty training frame.
si_max = max(map(len, species_individuals.values()), default=0)
# Pad with NaN rather than 0: log(1) == 0, so a zero pad would be
# indistinguishable from an individual that appears exactly once.
si = {n: [np.nan] * si_max for n in species_individuals}
for n, counts in species_individuals.items():
    si[n][:len(counts)] = list(np.log(counts))
si = pd.DataFrame(si)

In [None]:
import seaborn as sn

# Heat-map of the log-counts table: one row per species, one column per
# position in `si`, restricted to the first 500 positions.
fig, ax = plt.subplots(figsize=(10, 8))
ax = sn.heatmap(si.iloc[:500].T, ax=ax, cmap="BuGn")

In [None]:
# Candidate ids per species: the first four entries of its counts table
# followed by the generic 'new_individual' label.
top_ids = {
    n: list(inds.index[:4]) + ['new_individual']
    for n, inds in species_individuals.items()
}
# Build the overview row-wise (one row per species). Lists of unequal length
# are padded this way, whereas the column-wise constructor raises as soon as a
# species has fewer than four individuals.
display(pd.DataFrame.from_dict(top_ids, orient="index"))

# Inference with Lightning⚡Flash

**This is the inference part for the following kernel:** https://www.kaggle.com/jirkaborovec/whale-dolphin-eda-classify-lit-flash

In [None]:
# Install Lightning Flash from the packages stored under frozen_packages/;
# `--no-index` stops pip from looking at the package index, so only that folder is used.
!pip install -q 'lightning-flash[image]' --find-links /kaggle/input/whale-dolphin-eda-classify-lit-flash/frozen_packages/ --no-index
# NOTE(review): wandb is presumably removed so it is not picked up during prediction — confirm the reason.
!pip uninstall -y wandb

In [None]:
import torch
import flash
from flash.image import ImageClassificationData, ImageClassifier

In [None]:
import glob

# Collect the test images from the same dataset root the other cells use.
# glob returns files in arbitrary order, so sort to get a reproducible row order.
imgs = sorted(glob.glob(os.path.join(PATH_DATASET, "test_images", "*.jpg")))
# One row per image, holding only the file name (the images root is passed separately later).
df_test = pd.DataFrame({"image": [os.path.basename(p) for p in imgs]})
display(df_test.head())
print(len(df_test))

## 1. Load the task ⚙️

In [None]:
# Restore the classifier from its checkpoint and print the labels it carries.
checkpoint_path = "/kaggle/input/whale-dolphin-eda-classify-lit-flash/image_classification_model.pt"
model = ImageClassifier.load_from_checkpoint(checkpoint_path)
print(model.labels)

In [None]:
# Trainer arguments: GPUS is 1 when CUDA is available in this session, 0 otherwise.
GPUS = 1 if torch.cuda.is_available() else 0
trainer = flash.Trainer(gpus=GPUS)

## 2. Run predictions 🎉

In [None]:
# Prediction-only data module built from the test data frame.
test_images_root = os.path.join(PATH_DATASET, "test_images")
datamodule = ImageClassificationData.from_data_frame(
    input_field="image",
    # to try the pipeline on a fraction of the data, pass
    # `df_test[:len(df_test) // 1000]` here instead
    predict_data_frame=df_test,
    predict_images_root=test_images_root,
    transform_kwargs=dict(image_size=(300, 300)),
    batch_size=32,
    num_workers=3,
)

In [None]:
# Run prediction and flatten the per-batch results into one flat list of labels.
batches = trainer.predict(model, datamodule=datamodule, output="labels")
predictions = [label for batch in batches for label in batch]

# Attach the predicted label to every test image.
df_test["prediction"] = predictions
display(df_test.head())

## Browse some images

In [None]:
# Show up to five test images for every predicted label, one label per row.
nb_species = len(df_test["prediction"].unique())
# squeeze=False keeps `axarr` two-dimensional even when only a single label was
# predicted; otherwise the `axarr[i, j]` indexing below fails on a 1-D array.
fig, axarr = plt.subplots(
    ncols=5, nrows=nb_species, figsize=(12, nb_species * 2), squeeze=False
)

# Hide every frame up front so that labels with fewer than five images do not
# leave empty axes with ticks behind.
for ax_cell in axarr.ravel():
    ax_cell.set_axis_off()

for i, (name, dfg) in enumerate(df_test.groupby("prediction")):
    # The label is written above the first image of its row.
    axarr[i, 0].set_title(name)
    for j, (_, row) in enumerate(dfg[:5].iterrows()):
        im_path = os.path.join(PATH_DATASET, "test_images", row["image"])
        img = plt.imread(im_path)
        axarr[i, j].imshow(img)

# Static 📥 submission

In [None]:
!head ../input/happy-whale-and-dolphin/sample_submission.csv

In [None]:
df_test["predictions"] = [" ".join(top_ids[lb]) for lb in df_test["prediction"]]
df_test[["image","predictions"]].set_index("image").to_csv("submission.csv")

!head submission.csv