In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
# force_remount=True requests a fresh mount even if Drive was mounted earlier in this session.
drive.mount('/content/drive', force_remount=True)
# List the current working directory to confirm the `drive` folder is visible.
!ls

Mounted at /content/drive
drive  sample_data


In [2]:
# Move into the project directory on the mounted Drive.
# NOTE(review): the project-local imports in later cells (transforms, dataloading,
# common, models, ...) presumably rely on this being the working directory - confirm.
repo_name = "crop-damage-classification"
# IPython expands $repo_name to the Python variable defined above.
%cd /content/drive/MyDrive/Personal-Projects/$repo_name
# Show the repository contents as a sanity check.
!ls

/content/drive/MyDrive/Personal-Projects/crop-damage-classification
common	     dataloading  Index.ipynb  output		    project-structure.md  transforms
config.yaml  experiments  index.py     preprocess	    README.md		  visualization
data	     Index_bc.py  models       preprocess_input.py  run.yaml


In [3]:
# set up environment
# Uncomment the lines below if the packages are not already installed.
# Real `#` comments are used (instead of a triple-quoted string) so that the
# cell does not echo the disabled commands back as its output.
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install matplotlib numpy pandas pyyaml opencv-python

# The following cells download the data

In [4]:
# this cell is for downloading data.
# as of yet data is not hosted and is available in the private data folder
# comment if not required
!pip install boto3
!pip install tqdm



In [5]:
# setup some imports
#custom imports
from transforms.transforms import ToTensor, Resize, CenterCrop
from dataloading.dataset import CropDataset
from common.utils import get_exp_params, init_config, get_config, save2config, get_modelinfo, get_saved_model
from models.resnet18 import Resnet18
from models.custom_models import get_model
from experiments.experiments import Experiment
from visualization.visualization import Visualization
from experiments.test_model import ModelTester

#py imports
import random
import numpy as np
import os
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

In [6]:
import boto3
from pathlib import Path
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm

def get_file_folders(s3_client, bucket_name, prefix=""):
    """List all object keys of an S3 bucket, split into files and folder markers.

    Pages through ``list_objects_v2`` until no continuation token is returned.

    Args:
        s3_client: Client object exposing ``list_objects_v2(**kwargs)``.
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are listed (default: all keys).

    Returns:
        A ``(file_names, folders)`` tuple of lists. Keys ending in ``"/"`` are
        reported as folders, every other key as a file, in listing order.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # "Contents" is missing when a page holds no objects (e.g. empty bucket
        # or unmatched prefix); treat that as an empty page instead of crashing.
        for entry in response.get("Contents") or []:
            key = entry.get("Key")
            if not key:
                # Nothing to classify for a missing/empty key.
                continue
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            # No (or empty) token: this was the last page.
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders

def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Recreate the given S3 folders and download the given files under ``local_path``.

    Args:
        s3_client: Client object exposing ``download_file(bucket, key, filename)``.
        bucket_name: Bucket the keys belong to.
        local_path: Local root directory; each key is appended to it.
        file_names: Object keys to download.
        folders: Folder keys to create as local directories.
    """
    destination_root = Path(local_path)

    # Create every listed folder, including all missing parents.
    for folder_key in tqdm(folders):
        (destination_root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in tqdm(file_names):
        target = destination_root / object_key
        # Make sure the file's parent directory exists before downloading into it.
        target.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(target))

# Download the challenge data from S3 unless the images directory already exists.
BUCKET_NAME = 'cgiar-crop-damage-classification-challenge'
data_path = 'data/input/images'
images_dir = Path(os.getcwd(), data_path)
if not images_dir.exists():
    # UNSIGNED disables request signing, so no AWS credentials are required.
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    file_names, folders = get_file_folders(client, BUCKET_NAME)
    # Download into the parent of the directory checked above (<cwd>/data/input)
    # rather than a hard-coded absolute Drive path, so the existence check and
    # the download destination always refer to the same location.
    download_files(
        client,
        BUCKET_NAME,
        str(images_dir.parent),
        file_names,
        folders
    )

In [7]:
# Initialise directories and config data, then read the config back and display it.
init_config()
config = get_config()
# One call prints the heading, a blank line and the config dict.
print('Config parameters\n', config, sep='\n')

Config parameters

{'X_key': 'image', 'data_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data', 'device': 'cuda', 'img_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data/input/images', 'output_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/output', 'root_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification', 'use_gpu': True, 'y_key': 'label'}


In [8]:
# Load the experiment parameters and display them.
exp_params = get_exp_params()
# One call prints the heading, a blank line and the parameter dict.
print('Experiment parameters\n', exp_params, sep='\n')

Experiment parameters

{'transform': {'resize_dim': 256, 'crop_dim': 224}, 'train': {'shuffle_data': True, 'batch_size': 128, 'val_split_method': 'k-fold', 'k': 5, 'val_percentage': 20, 'loss': 'cross-entropy', 'batch_interval': 512, 'epoch_interval': 1, 'num_epochs': 10}, 'model': {'name': 'resnet18', 'optimizer': 'Adam', 'lr': 0.001, 'weight_decay': 1e-05, 'amsgrad': False, 'momentum': 0.9}, 'test_model': False}


In [9]:
# Seed every random number generator used by the notebook with the same value.
seed = 123
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [10]:
#preprocess data or load preprocessed data

# Class names listed in label-index order; both lookup tables are derived from it.
class_names = ('DR', 'G', 'ND', 'WD', 'other')

# class name -> integer label
label_dict = {name: index for index, name in enumerate(class_names)}

# integer label -> class name (inverse of label_dict)
class_dict = dict(enumerate(class_names))

In [11]:
import os
from common.utils import get_config
import pandas as pd
from PIL import Image
import numpy as np
import torch
from torchvision.io import read_image

class Preprocessor:
    """Turns the raw images into compressed NumPy archives and computes dataset statistics.

    Directories and sample-dict keys are taken from the project config
    returned by ``get_config()``.
    """

    def __init__(self, data_filesuffix = 224):
        """Read paths, label tables and sample keys from the project config.

        Args:
            data_filesuffix: Side length of the square transformed images; also
                used as the suffix of the saved ``train_<n>`` / ``test_<n>`` files.
        """
        cfg = get_config()
        self.image_dir = cfg["img_dir"]
        self.train_labels = pd.read_csv(os.path.join(cfg["data_dir"], "input/Train.csv"))
        self.test_labels = pd.read_csv(os.path.join(cfg["data_dir"], "input/Test.csv"))
        self.processed_img_dir = os.path.join(cfg["data_dir"], "processed_input")
        self.X_key = cfg['X_key']
        self.y_key = cfg['y_key']
        self.data_filesuffix = data_filesuffix


    def transform_input(self, transform):
        """Apply ``transform`` to every train and test image and save the results.

        Writes ``train_<suffix>.npz`` and ``test_<suffix>.npz`` into
        ``self.processed_img_dir``; each archive holds one float64 array of
        shape ``(num_images, suffix, suffix, 3)``.

        Args:
            transform: Callable that takes a sample dict ``{self.X_key: image}``
                and returns a dict holding the transformed image tensor under
                the same key.

        Raises:
            ValueError: If a transformed image does not have shape
                ``(suffix, suffix, 3)`` after moving its first axis to the end.
        """
        # np.savez_compressed does not create missing directories.
        os.makedirs(self.processed_img_dir, exist_ok=True)

        print('Iterating through train files')
        train_data = self._transform_files(self.train_labels["filename"].tolist(), transform)
        np.savez_compressed(os.path.join(self.processed_img_dir, f"train_{self.data_filesuffix}"),  train_data)
        # Release the train array before the test array is built to limit peak memory.
        del train_data

        print('\nIterating through test files')
        test_data = self._transform_files(self.test_labels["filename"].tolist(), transform)
        np.savez_compressed(os.path.join(self.processed_img_dir, f"test_{self.data_filesuffix}"), test_data)
        del test_data

    def _transform_files(self, filenames, transform):
        """Read, transform and stack the given image files into a single array."""
        size = self.data_filesuffix
        # Allocate the output once and fill it in place; appending with
        # np.concatenate would re-copy the whole array for every image.
        data = np.empty((len(filenames), size, size, 3))
        for index, filename in enumerate(filenames):
            img = read_image(os.path.join(self.image_dir, filename))
            sample = { self.X_key: img }
            img = transform(sample)[self.X_key]
            img = img.cpu().detach().numpy()
            # Move axis 0 to the end so the image matches the (size, size, 3) slots.
            img = np.transpose(img, (1, 2, 0))
            if img.shape != data.shape[1:]:
                # Fail loudly rather than letting NumPy broadcast a mismatched image.
                raise ValueError(
                    f"{filename}: transformed image has shape {img.shape}, "
                    f"expected {data.shape[1:]}"
                )
            data[index] = img
        return data

    def get_dataset_metrics(self, dataset):
        """Estimate per-channel mean and standard deviation of ``dataset``.

        The dataset is read in batches of 128; statistics are computed per batch
        over axes (0, 2, 3) and then averaged with equal weight per batch. The
        results are therefore approximations: a smaller final batch counts as
        much as a full one, and the mean of batch stds is not the std of the
        whole dataset.

        Args:
            dataset: Dataset whose samples are dicts holding the image under
                ``self.X_key``.

        Returns:
            Three lists ``(mean, std_ddof0, std_ddof1)`` with one value per
            channel, each divided by 255.
            NOTE(review): the division assumes pixel values on a 0-255 scale - confirm.
        """
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)
        print(len(dataloader))
        pop_mean = []
        pop_std0 = []
        pop_std1 = []
        for i, data in enumerate(dataloader):
            # shape (batch_size, 3, height, width)
            numpy_image = data[self.X_key].numpy()
            print(numpy_image.shape)
            # shape (3,)
            batch_mean = np.mean(numpy_image, axis=(0,2,3))
            batch_std0 = np.std(numpy_image, axis=(0,2,3))
            batch_std1 = np.std(numpy_image, axis=(0,2,3), ddof=1)

            # Drop the batch tensor reference as soon as its statistics are computed.
            del data[self.X_key]

            pop_mean.append(batch_mean)
            pop_std0.append(batch_std0)
            pop_std1.append(batch_std1)

        # shape (num_iterations, 3) -> (mean across 0th axis) -> shape (3,)
        pop_mean = np.array(pop_mean).mean(axis=0)
        pop_mean = [x/255 for x in pop_mean]
        pop_std0 = np.array(pop_std0).mean(axis=0)
        pop_std0 = [x/255 for x in pop_std0]
        pop_std1 = np.array(pop_std1).mean(axis=0)
        pop_std1 = [x/255 for x in pop_std1]
        return pop_mean, pop_std0, pop_std1

    def make_label_csv(self):
        """Placeholder (not implemented yet).

        Intended to build a csv file listing the labels of all images, with
        4 columns: image file name, label (encoded), original label and full path.
        """
        pass



In [12]:
# Store the sample-dict keys for the image and the label in the config.
for config_key, config_value in (('X_key', 'image'), ('y_key', 'label')):
    save2config(config_key, config_value)

# Preprocessing pipeline: tensor conversion, resize, then centre crop.
transform_params = exp_params['transform']
data_transforms = transforms.Compose([
    ToTensor(),
    Resize(transform_params['resize_dim']),
    CenterCrop(transform_params['crop_dim']),
])

# Build the train and test datasets from their csv files.
# NOTE(review): the third positional argument presumably flags the test split - confirm.
ftr_dataset = CropDataset('input/Train.csv', label_dict, False, transforms=data_transforms)
test_dataset = CropDataset('input/Test.csv', label_dict, True, transforms=data_transforms)

# Small subset: the first 1% of the training samples.
smlen = int(0.01 * len(ftr_dataset))
smftr_dataset = torch.utils.data.Subset(ftr_dataset, list(range(smlen)))

print('Full train dataset length:', len(ftr_dataset))
print('Test dataset length:', len(test_dataset))
print('Subset train dataset length:', smlen, '\n')



Full train dataset length: 26068
Test dataset length: 8663
Subset train dataset length: 260 



In [13]:
# running experiment on small subset of the dataset
# Uncomment the lines below to train on `smftr_dataset` instead of the full dataset.
# Real `#` comments are used (instead of a triple-quoted string) so that the
# cell does not echo the disabled code back as its output.
# exp = Experiment(exp_params['model']['name'], smftr_dataset)
# model_history = exp.train()

In [None]:
# Model training on the full training dataset.
# The model is selected by name from the experiment parameters.
exp = Experiment(exp_params['model']['name'], ftr_dataset)
# `model_history` is passed to Visualization in a later cell, so this cell must run first.
model_history = exp.train()

Running split 0




(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)
(128, 3, 224, 224)


In [None]:
# Rebuild the architecture named in the experiment parameters and load the saved model into it.
model = get_saved_model(get_model(exp_params["model"]["name"]), '')
model_info = get_modelinfo('')

# NOTE(review): the heading says "validation" but the key read is 'trlosshistory' - confirm which is intended.
print("\nModel validation results")
print(model_info['results']['trlosshistory'])

# Visualise the results; needs `model_history` from the training cell.
vis = Visualization(model_info, model_history)
vis.get_results()

In [None]:
# Model testing: run the loaded model on the test dataset.
print("\n\nTesting Saved Model")
# `model` comes from the "get best model" cell and `test_dataset` from the dataset cell.
mt = ModelTester(model, test_dataset)
# `class_dict` maps integer labels to class names.
mt.test_and_save_csv(class_dict)