In [1]:
!pip install torchvision >null

[0m

In [6]:
import glob
from pathlib import Path
import os

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import PIL
from PIL import Image

import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import timm
import torch.nn as nn
import torch.nn.functional as F

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Seed the global PyTorch and NumPy random number generators with a fixed
# value so that random operations drawing from them are repeatable.
torch.manual_seed(1)
np.random.seed(1)

# Dataset Preparation

In [7]:
# Root folder that contains the species list CSV and the image directory.
PATH = ""

# Species lookup table; rows with any missing value are discarded.
names = pd.read_csv(PATH +"/RECOFISH_dataset - liste_especes_selection.csv").dropna()

# Images are laid out as <data_dir>/<folder>/<file>.jpg; the name of the
# containing folder is used as the raw label of each image.
data_dir = Path(PATH +"/dataset_recofish")
file_path = list(data_dir.glob(r'*/*.jpg'))
classes = [image_file.parent.name for image_file in file_path]

file_path = pd.Series(file_path).astype(str)
classes = pd.Series(classes)
df = pd.DataFrame({'image': file_path, 'label': classes})

# Map the species ID (written as an integer string) to its scientific name.
dic_names_id = {
    species_id: scientific_name
    for species_id, scientific_name in zip(
        names['ID'].astype('int').astype('str'), names['nom_scientifique']
    )
}

# Encode the raw folder labels as integers first, then replace the raw label
# with the scientific name looked up from the species table.
lb = LabelEncoder()
df['encoded_labels'] = lb.fit_transform(df['label'])
df['label'] = df['label'].map(dic_names_id)

In [8]:
# encoded label -> species name, built from the rows of df.
dic_label_name = dict(zip(df['encoded_labels'].astype('int'), df['label']))

# Same mapping with its keys in ascending order, plus the inverse
# (species name -> encoded label).
myKeys = sorted(dic_label_name.keys())
sorted_dic_label_name = {}
for label_id in myKeys:
    sorted_dic_label_name[label_id] = dic_label_name[label_id]
sorted_dic_name_label = dict(
    (species_name, label_id) for label_id, species_name in sorted_dic_label_name.items()
)

In [9]:
# First hold out 10% of all rows for testing, then hold out 10% of the
# remainder for validation (overall roughly 81% / 9% / 10%).
# Both splits use the same fixed random_state.
train_valid_df, test_df = train_test_split(df, train_size=0.9, random_state=0)
train_df, valid_df = train_test_split(train_valid_df, train_size=0.9, random_state=0)

In [10]:
class FishDataset(torch.utils.data.Dataset):
    """Dataset that serves (image, label) pairs described by a DataFrame.

    The DataFrame is read positionally: column 0 holds the image path and
    column 2 holds the integer class label.

    Args:
        df: DataFrame with one row per image.
        transform: callable applied to the RGB PIL image; by default the
            image is only converted to a tensor.
    """

    def __init__(self, df, transform=transforms.Compose([transforms.ToTensor()])):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the transformed image and its label tensor for row ``idx``."""
        path_on_disk = self.df.iloc[idx, 0]
        class_id = torch.tensor(int(self.df.iloc[idx, 2]))

        # Force three channels so every image has the same layout.
        rgb_image = Image.open(path_on_disk).convert('RGB')

        return self.transform(rgb_image), class_id

In [11]:
# Settings shared by both pipelines: target size and per-channel
# normalisation statistics.
IMAGE_SIZE = [224, 224]
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, random flip/rotation augmentation, tensor
# conversion and normalisation.
data_transforms_train = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=30),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Evaluation pipeline: same preprocessing without the random augmentation.
data_transforms_test = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

In [12]:
# Only the training split gets the augmenting pipeline; validation and test
# share the deterministic one.
train_dataset, valid_dataset, test_dataset = (
    FishDataset(df=split_df, transform=split_transform)
    for split_df, split_transform in (
        (train_df, data_transforms_train),
        (valid_df, data_transforms_test),
        (test_df, data_transforms_test),
    )
)

In [13]:
batch_size = 64

# Shuffle the training set so each epoch sees the samples in a new order;
# DataLoader does not shuffle by default, which would feed the model the
# exact same batches in the exact same order every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation and test data are only evaluated, so their order is kept fixed.
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

# Model Training

In [14]:
#From https://github.com/Bjarten/early-stopping-pytorch
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the epoch's validation loss and
    read ``early_stop`` afterwards; it becomes True once ``patience``
    consecutive calls have failed to match or beat the best loss seen so far.
    """
    def __init__(self, patience=7, verbose=False):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state.

        Args:
            val_loss: validation loss of the epoch that just finished.
            model: accepted for interface compatibility; not used here.
        """
        # NaN is the only value that is not equal to itself. A NaN loss must
        # count as "no improvement"; otherwise it would reset the counter and
        # turn best_score into NaN, after which no comparison could ever
        # trigger early stopping again.
        loss_is_nan = val_loss != val_loss
        score = -val_loss

        improved = (not loss_is_nan) and (
            self.best_score is None or score >= self.best_score
        )

        if improved:
            # A tie resets the counter (as before) but is not reported as a decrease.
            if self.verbose and val_loss < self.val_loss_min:
                print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).')
            self.best_score = score
            self.val_loss_min = val_loss
            self.counter = 0
        else:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True


### Load Pre-trained Model

In [15]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# EfficientNet-B4 with pretrained weights and a 101-way classification head.
model = timm.create_model('efficientnet_b4', pretrained=True, num_classes=101)
# Move the model to the selected device (previously hard-coded to 'cuda',
# which raised on machines without a GPU and ignored `device`).
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Divide the learning rate by 10 when the monitored value has not improved
# for 2 epochs, then wait 2 epochs before monitoring again.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=2, cooldown=2, verbose=True)

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_ra2_320-7eb33cd5.pth


### Training Model

In [None]:
epochs = 20

# Per-epoch loss history (one entry per completed epoch).
total_train_loss = []
total_valid_loss = []
# np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
best_valid_loss = np.inf
early_stop = EarlyStopping(patience=5, verbose=True)

for epoch in range(epochs):
    print('Epoch: ', epoch + 1)
    train_loss = []
    valid_loss = []
    train_correct = 0
    train_total = 0
    valid_correct = 0
    valid_total = 0

    # ---- training pass ----
    # Switch to training mode once per epoch instead of once per batch.
    model.train()
    for image, target in train_loader:
        image, target = image.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(image)

        loss = criterion(output, target)
        train_loss.append(loss.item())
        # Predicted class = index of the largest logit; detached so the
        # accuracy bookkeeping is not tracked by autograd.
        _, predicted = torch.max(output.detach(), 1)
        train_total += target.size(0)
        train_correct += (predicted == target).sum().item()

        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    # Evaluation mode and no-grad are entered once for the whole pass.
    model.eval()
    with torch.no_grad():
        for image, target in valid_loader:
            image, target = image.to(device), target.to(device)

            output = model(image)
            loss = criterion(output, target)
            valid_loss.append(loss.item())
            _, predicted = torch.max(output, 1)
            valid_total += target.size(0)
            valid_correct += (predicted == target).sum().item()

    epoch_train_loss = np.mean(train_loss)
    epoch_valid_loss = np.mean(valid_loss)
    # Record the history before any early-stopping break, so the final epoch
    # is not missing from the loss curves.
    total_train_loss.append(epoch_train_loss)
    total_valid_loss.append(epoch_valid_loss)
    print(f'Epoch {epoch + 1}, train loss: {epoch_train_loss:.4f}, valid loss: {epoch_valid_loss:.4f}, train accuracy: {(100 * train_correct / train_total):.4f}%, valid accuracy: {(100 * valid_correct / valid_total):.4f}%')

    # Keep a checkpoint of the weights with the lowest validation loss so far.
    if epoch_valid_loss < best_valid_loss:
        torch.save(model.state_dict(), 'fish_species_classification_model.pt')
        print('Model improved. Saving model.')
        best_valid_loss = epoch_valid_loss

    early_stop(epoch_valid_loss, model)

    if early_stop.early_stop:
        print("Early stopping")
        break

    lr_scheduler.step(epoch_valid_loss)

# Model Evaluation
### Test Accuracy

In [16]:
# Model Accuracy on test Data
# load pretrained model
# map_location lets a checkpoint saved on a GPU be loaded on whichever device
# is in use here (including CPU-only machines).
model.load_state_dict(torch.load(PATH+"/fish_classification.pt", map_location=device))

classes = list(sorted_dic_name_label.keys())
nb_classes = len(classes)

# Rows are the true class, columns the predicted class.
confusion_matrix = torch.zeros(nb_classes, nb_classes)
correct = 0
total = 0
y_pred = []
y_true = []

with torch.no_grad():
    model.eval()
    for image, target in test_loader:
        image, target = image.to(device), target.to(device)
        output = model(image)
        _, predicted = torch.max(output.data, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()

        # Count every (true, predicted) pair; previously the matrix was
        # allocated but never filled, so the exported CSV was all zeros.
        for true_idx, pred_idx in zip(target.tolist(), predicted.tolist()):
            confusion_matrix[true_idx, pred_idx] += 1

        y_pred.extend(predicted.data.cpu().numpy()) # Save Prediction
        y_true.extend(target.data.cpu().numpy()) # Save Truth


print(f'Test Accuracy : {100 * correct / total:.2f}%')

Test Accuracy : 85.31%


### Confusion Matrix

In [17]:
# Wrap the confusion-matrix tensor in a DataFrame labelled with the class
# names on both axes, then write it to disk.
cm_values = np.array(confusion_matrix)
class_names = list(classes)
df_cm = pd.DataFrame(cm_values, index=class_names, columns=list(classes))
df_cm.to_csv('confusion_matrix.csv')

### Classification Report

In [18]:
from sklearn.metrics import classification_report
# Text report of per-class precision / recall / F1, labelled with class names.
print('\nClassification Report\n')
report = classification_report(y_true=y_true, y_pred=y_pred, target_names=classes)
print(report)


Classification Report

                             precision    recall  f1-score   support

       Sphyraena viridensis       0.83      0.94      0.88        32
              Belone belone       0.80      0.80      0.80        30
           Oblada melanurus       0.96      0.90      0.93        78
            Diplodus sargus       0.85      0.78      0.82        68
          Diplodus puntazzo       0.86      0.96      0.91        50
          Diplodus vulgaris       0.89      0.95      0.92        86
         Diplodus annularis       0.86      0.77      0.81        47
          Diplodus cervinus       1.00      0.91      0.95        23
             Sparus auratus       0.88      0.87      0.88        85
    Spondyliosoma cantharus       0.88      0.78      0.82        18
              Dentex dentex       0.90      0.84      0.87        32
      Lithognathus mormyrus       0.87      0.93      0.90        56
                Sarpa salpa       0.97      0.97      0.97        76
         

In [24]:
# Same report as a dictionary, turned into a table with one row per entry
# and one column per metric, then saved to disk.
report_dict = classification_report(
    y_true=y_true,
    y_pred=y_pred,
    output_dict=True,
    target_names=classes,
)
clsf_report = pd.DataFrame(report_dict).T
clsf_report.to_csv('report.csv')

# Show the 50 rows with the highest precision.
clsf_report.sort_values(by='precision', ascending=False).head(50)

Unnamed: 0,precision,recall,f1-score,support
Xyrichtys novacula,1.0,0.928571,0.962963,14.0
Diplodus cervinus,1.0,0.913043,0.954545,23.0
Sarpa salpa,0.973684,0.973684,0.973684,76.0
Conger conger,0.970588,0.942857,0.956522,35.0
Chromis chromis,0.969925,0.969925,0.969925,133.0
Thalassoma pavo,0.96875,0.981013,0.974843,158.0
Labrus merula,0.966667,0.935484,0.95082,31.0
Apogon imberbis,0.966102,0.982759,0.974359,58.0
Muraena helena,0.964286,0.931034,0.947368,58.0
Balistes capriscus,0.961538,0.961538,0.961538,26.0
