In [None]:
# The original dataset is located in ../data/PlantVillage/
# It was downloaded from: https://www.kaggle.com/datasets/soumiknafiul/plantvillage-dataset-labeled

import os
import random
import shutil

# Where the classified source images live and where the two splits are written
input_folder = "../augmentation-validation/classified-augmented-original"
train_folder = "../augmentation-validation/classified-augmented-original-div/train"
val_folder = "../augmentation-validation/classified-augmented-original-div/val"

# Both output roots must exist before any image is copied into them
for _split_root in (train_folder, val_folder):
    os.makedirs(_split_root, exist_ok=True)

# Function to divide images into train and validation sets
def split_dataset(input_folder, train_folder, val_folder, split_ratio=0.8, seed=None):
    """Copy the images of every sub-folder of ``input_folder`` into train/val trees.

    Each directory found below ``input_folder`` is recreated under both
    ``train_folder`` and ``val_folder``. The image files that sit directly in
    that directory are shuffled, the first ``int(split_ratio * n)`` are copied
    to the train tree and the remainder to the validation tree. Files located
    directly in ``input_folder`` itself are not touched.

    Args:
        input_folder: Root directory that is walked for sub-folders.
        train_folder: Root of the training output tree.
        val_folder: Root of the validation output tree.
        split_ratio: Fraction of each folder's images that goes to training.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used, as before.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    shuffler = random if seed is None else random.Random(seed)

    for root, dirs, _files in os.walk(input_folder):
        # relpath is immune to a trailing separator on input_folder, unlike
        # slicing the prefix off `root` by length.
        relative_root = os.path.relpath(root, input_folder)
        if relative_root == os.curdir:
            relative_root = ''

        for dir_name in dirs:
            source_dir = os.path.join(root, dir_name)

            # Create corresponding folders in train and validation sets
            train_dir = os.path.join(train_folder, relative_root, dir_name)
            val_dir = os.path.join(val_folder, relative_root, dir_name)
            os.makedirs(train_dir, exist_ok=True)
            os.makedirs(val_dir, exist_ok=True)

            # Match extensions case-insensitively (.PNG, .JPEG, ...) and skip
            # anything that is not a regular file. Sorting first makes the
            # shuffle independent of the order os.listdir happens to return.
            image_files = sorted(
                name for name in os.listdir(source_dir)
                if name.lower().endswith(image_extensions)
                and os.path.isfile(os.path.join(source_dir, name))
            )
            shuffler.shuffle(image_files)

            # The first num_train files go to train, the rest to validation
            num_train = int(split_ratio * len(image_files))

            for image in image_files[:num_train]:
                shutil.copyfile(os.path.join(source_dir, image), os.path.join(train_dir, image))

            for image in image_files[num_train:]:
                shutil.copyfile(os.path.join(source_dir, image), os.path.join(val_dir, image))

# Fix the shuffle so that re-running this cell on an unchanged directory
# listing picks the same split. Images are copied without clearing the output
# folders first, so a re-run with a different shuffle could leave the same
# image in both the train and the validation tree.
random.seed(42)

# Split the dataset into train and validation sets
split_dataset(input_folder, train_folder, val_folder)

print("Dataset split into train and validation sets successfully.")


In [None]:
import os
import shutil

def copy_folder_contents(source_dir, dest_dir):
    """Recreate the directory tree of ``source_dir`` under ``dest_dir`` and copy its files.

    Directories that already exist in the destination are reused, and files
    with the same name are overwritten by ``shutil.copy``.
    """
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        # Same position inside the destination tree as current_dir has in the source tree
        target_dir = os.path.join(dest_dir, os.path.relpath(current_dir, source_dir))
        os.makedirs(target_dir, exist_ok=True)

        for filename in filenames:
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(target_dir, filename),
            )

# Copy the validation split produced above into the `val` folder of the
# nst-augmented PlantVillage tree, keeping its sub-folder structure.
source_folder = "../augmentation-validation/classified-augmented-original-div/val"
destination_folder = "../augmentation-validation/nst-augmented/PlantVillage/val"

copy_folder_contents(source_folder, destination_folder)

In [None]:
import time
import copy

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Subset
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms, models
from collections import OrderedDict
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from sklearn import metrics
import os

# Load PlantVillage Dataset

In [None]:
#Organizing the dataset
batch_size = 32

# Probe the available accelerators. torch.backends.mps does not exist on
# older PyTorch builds, so it is looked up defensively.
train_on_gpu = torch.cuda.is_available()
_mps_backend = getattr(torch.backends, "mps", None)
train_on_gpu_mps = _mps_backend is not None and _mps_backend.is_available()

# Pick exactly one device: CUDA first, then MPS, otherwise CPU. This has to be
# a single if/elif/else chain; with two separate `if` statements the CPU
# fallback of the second one overwrote an already selected CUDA device.
if train_on_gpu:
    print('CUDA is available.  Training on GPU ...')
    device = torch.device("cuda:0")
elif train_on_gpu_mps:
    print('MPS is available!  Training on GPU ...')
    device = torch.device("mps")
else:
    print('Training on CPU ...')
    device = torch.device("cpu")
print(device)

In [None]:
def balanced_sampler_weights(images, nclasses):
  """Return one sampling weight per image, inversely proportional to its class size.

  Args:
    images: Sequence of items whose element at index 1 is the integer class
      index (for example the ``(path, class_index)`` pairs of ``ImageFolder.imgs``).
    nclasses: Total number of classes; class indices must be below this value.

  Returns:
    A list with ``total_images / images_in_class`` for every entry of
    ``images``, in the same order. Classes without any image get weight 0.0
    internally instead of raising ZeroDivisionError; an empty ``images``
    yields an empty list.
  """
  count = [0] * nclasses
  for sample in images:
    count[sample[1]] += 1
  total_num_imgs = float(sum(count))

  # A class with no samples never contributes a per-image weight, so any
  # finite placeholder works; 0.0 avoids dividing by zero.
  weight_per_class = [total_num_imgs / n if n else 0.0 for n in count]
  return [weight_per_class[sample[1]] for sample in images]

In [None]:
# Define your transforms for the training and validation sets
# Per-channel mean / std used to normalize the image tensors in both pipelines
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

data_transforms = {}

# Training: random rotation, random crop and random flip as augmentation
data_transforms['train'] = transforms.Compose([
    transforms.Resize(256),
    # transforms.RandomAffine(0,shear=30),
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
])

# Validation: deterministic resize + center crop, no augmentation
data_transforms['val'] = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
])

data_dir = '../augmentation-validation/nst-augmented/PlantVillage'
# One ImageFolder per split, read from <data_dir>/train and <data_dir>/val
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}

balance_classes = False

if balance_classes:
  # Draw training samples with a weight inversely proportional to class size
  train_weights = balanced_sampler_weights(image_datasets['train'].imgs, len(image_datasets['train'].classes))
  train_weights = torch.DoubleTensor(train_weights)
  train_sampler = torch.utils.data.sampler.WeightedRandomSampler(train_weights, len(train_weights))
  samplers = {'train': train_sampler, 'val': None}
else:
  print("No balancing")
  samplers = {'train': None, 'val': None}

# Using the image datasets and the transforms, define the dataloaders.
# Only the training loader is shuffled, and only when no sampler is set,
# because DataLoader does not accept shuffle=True together with a sampler.
# The validation loader keeps the dataset order in both configurations.
dataloaders = {
    x: torch.utils.data.DataLoader(
        image_datasets[x],
        batch_size=batch_size,
        sampler=samplers[x],
        shuffle=(x == 'train' and samplers[x] is None),
        num_workers=4,
    )
    for x in ['train', 'val']
}

# Number of samples per split
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

# Sanity check: fetch one training batch and show its tensor shapes
x,y = next(iter(dataloaders['train']))
print(x.shape, y.shape)

In [None]:
# Keep a second reference to the training DataLoader.
# NOTE(review): `tmp` is not read by any other cell visible here; it looks like
# a leftover, confirm before removing.
tmp = dataloaders['train']

In [None]:
# Count how many samples every class has in each split.
buckets_split = {}
for split in ['train', 'val']:
  classes = image_datasets[split].classes
  counts = {name: 0 for name in classes}
  # ImageFolder stores the integer label of every sample in `.targets`, so the
  # labels are counted directly instead of loading and transforming every
  # image through the DataLoader. Each split is indexed with its own class list.
  # NOTE: these are the counts of the files on disk; with balance_classes=True
  # they no longer reflect what the weighted sampler draws.
  for class_idx in image_datasets[split].targets:
    counts[classes[class_idx]] += 1
  buckets_split[split] = counts

# `classes` now holds the class names of the last split ('val')
print(classes)

In [None]:
# Stacked bar chart: the validation counts are drawn on top of the training counts
train_counts = list(buckets_split['train'].values())
val_counts = list(buckets_split['val'].values())
ind = range(len(train_counts))

plt.figure(figsize=(20,5))
p1 = plt.bar(ind, train_counts)
p2 = plt.bar(ind, val_counts, bottom=train_counts)

plt.title("PlantVillage dataset class distribution")
plt.xlabel('Plant disease class')
plt.ylabel('Number of samples')
plt.xticks(ticks=ind, labels=classes, rotation = 'vertical')
plt.legend((p1[0], p2[0]), ('Train', 'Validation'))
plt.show()

# Build and train classifier

In [None]:
# Start from torchvision's ResNet-50 with pretrained weights.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` — TODO confirm against the installed version.
model = models.resnet50(pretrained=True)

# Freezing the backbone is currently disabled, so every parameter keeps
# requires_grad=True even though the optimizer created further down only
# receives model.fc.parameters().
# for param in model.parameters():
#   param.requires_grad = False
# model = models.resnet50(pretrained=True)

In [None]:
# New classification head: 2048 features -> 512 hidden units -> 38 log-probabilities
classifier = nn.Sequential()
classifier.add_module('fc1', nn.Linear(2048, 512))
classifier.add_module('relu', nn.ReLU())
classifier.add_module('fc2', nn.Linear(512,38))
classifier.add_module('output', nn.LogSoftmax(dim=1))

# Swap the head in as the network's final fully connected layer
model.fc = classifier

In [None]:
#Function to train the model
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader`; trains when `optimizer` is given, else only evaluates.

    Batches are moved to the module-level `device`.

    Returns:
        (mean loss per sample, accuracy, number of columns of the model output).
        The last element is None and the first two are 0.0 when the loader
        yields no batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()

    total_loss = 0.0
    total_correct = 0
    total_seen = 0
    output_size = None

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        if training:
            optimizer.zero_grad()

        # Gradients are only tracked while training
        with torch.set_grad_enabled(training):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)

        if training:
            loss.backward()
            optimizer.step()

        # loss is a batch mean, so weight it by the batch size before summing
        batch_len = inputs.size(0)
        total_loss += loss.item() * batch_len
        total_correct += torch.sum(preds == labels).item()
        total_seen += batch_len
        output_size = outputs.size(1)

    if total_seen == 0:
        return 0.0, 0.0, output_size
    return total_loss / total_seen, total_correct / total_seen, output_size


def train_model(model, criterion, optimizer, scheduler, num_epochs=50):
    """Train `model`, validate after every epoch and return it with the best weights.

    Reads the module-level `dataloaders` (keys 'train' and 'val'), `device`
    and `data_transforms`. Every fifth epoch a checkpoint is written to
    ``classifier_<epoch>_epochs_checkpoint.pth`` in the working directory.

    Args:
        model: Network to train; it is modified in place.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        num_epochs: Number of epochs to run.

    Returns:
        The same `model`, loaded with the weights of the epoch that reached the
        highest validation accuracy.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(1, num_epochs+1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # Training phase, then advance the learning-rate schedule
        train_loss, train_acc, train_width = _run_epoch(model, dataloaders['train'], criterion, optimizer)
        scheduler.step()
        print(f"Train Loss: {train_loss} Acc: {train_acc}")

        # Validation phase
        val_loss, val_acc, val_width = _run_epoch(model, dataloaders['val'], criterion)
        print(f"Validation Loss: {val_loss} Acc: {val_acc}")

        # Remember the weights of the best epoch so far
        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())

        if epoch % 5 == 0:
            # Save model every 5 epochs
            model.epochs = epoch
            checkpoint = {'input_size': [3, 224, 224],
                  'batch_size': dataloaders['train'].batch_size,
                  # Taken from the model's actual output instead of a fixed number
                  'output_size': val_width if val_width is not None else train_width,
                  'state_dict': model.state_dict(),
                  'data_transforms': data_transforms,
                  'optimizer_dict':optimizer.state_dict(),
                  'epoch': model.epochs}
            torch.save(checkpoint, f"classifier_{epoch}_epochs_checkpoint.pth")

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '{:.4f}' (four decimals); the former '{:4f}' only set a minimum width
    print('Best valid accuracy: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

## Train the classifier

In [None]:
# Train a model with a pre-trained network
num_epochs = 5
if train_on_gpu:
    print ("Using GPU: "+ str(train_on_gpu))
if train_on_gpu_mps:
    print ("Using GPU: "+ str(train_on_gpu_mps))

# Always place the model on `device`, the same device train_model moves every
# batch to. Calling model.cuda() separately could leave the model and its
# inputs on different devices whenever `device` is not the CUDA device.
model = model.to(device)

# NLLLoss because our output is LogSoftmax
criterion = nn.NLLLoss()

# Adam optimizer with a learning rate; only the parameters of the new head
# (model.fc) are handed to it, so the remaining layers are not updated by it
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
# Decay LR by a factor of 0.1 every 5 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)


model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=num_epochs)

In [None]:
# Do validation on the test set
def test(model, dataloaders, device):
  model.eval()
  accuracy = 0
  
  model.to(device)
  
  label_accuracy = {x: 0 for x in image_datasets['val'].classes}
  predicted_labels = []
  true_labels = []

  for images, labels in dataloaders['val']:
    images = Variable(images)
    labels = Variable(labels)
    images, labels = images.to(device), labels.to(device)
      
    output = model.forward(images)
    ps = torch.exp(output)
    equality = (labels.data == ps.max(1)[1])
    acc = equality.type_as(torch.FloatTensor())
    accuracy += acc.mean()
    for i,x in enumerate(labels.data):
      label_accuracy[image_datasets['val'].classes[x]] += acc[i]
      predicted_labels.append(ps.max(1)[1][i])
      true_labels.append(x)

  print("Testing Accuracy: {:.3f}".format(accuracy/len(dataloaders['val'])))

  return true_labels, predicted_labels

In [None]:
# Save the final model together with the settings needed to rebuild it
model.epochs = num_epochs
checkpoint = {'input_size': [3, 224, 224],
                 'batch_size': dataloaders['train'].batch_size,
                  # Read from the last linear layer of the classifier head
                  # (`fc2`) so the stored value matches the model being saved
                  'output_size': model.fc.fc2.out_features,
                  'state_dict': model.state_dict(),
                  'data_transforms': data_transforms,
                  'optimizer_dict':optimizer.state_dict(),
                  'epoch': model.epochs}
torch.save(checkpoint, './plantvillage_resnet50.pth')

In [None]:
# Evaluate the model on the 'val' loader; test() prints the accuracy and
# returns the true and the predicted label of every sample as lists
true_labels, predicted_labels = test(model, dataloaders, device)

In [None]:
# Turn the per-sample label tensors into plain Python numbers
predicted_labels_list = [x.cpu().item() for x in predicted_labels]
true_labels_list = [x.cpu().item() for x in true_labels]

In [None]:
# Pass the full list of class indices explicitly. Rows and columns then always
# follow the order of the class names, and classification_report does not fail
# when a class is missing from both the true and the predicted labels.
class_names = image_datasets['val'].classes
class_indices = list(range(len(class_names)))

# Print the confusion matrix
print(metrics.confusion_matrix(true_labels_list, predicted_labels_list, labels=class_indices))

# Print the precision and recall, among other metrics
print(metrics.classification_report(true_labels_list, predicted_labels_list, labels=class_indices, target_names=class_names, digits=3))