In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so the project
# files stored on Drive can be reached from the notebook.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# List the current working directory as a quick check that the mount is visible.
!ls

Mounted at /content/drive
drive  sample_data


In [2]:
# Move into the project directory on Drive. The project-local packages
# imported further down (transforms, dataloading, common, ...) live in this
# folder, so this cell has to run before the import cell.
repo_name = "crop-damage-classification"
# IPython substitutes $repo_name with the value of the Python variable.
%cd /content/drive/MyDrive/Personal-Projects/$repo_name
# Show the repository contents to confirm the directory change worked.
!ls

/content/drive/MyDrive/Personal-Projects/crop-damage-classification
common		    data	 Index_bc.py  models	  preprocess_input.py	run.yaml
config.yaml	    dataloading  Index.ipynb  output	  project-structure.md	transforms
corrupt_files.json  experiments  index.py     preprocess  README.md		visualization


In [3]:
# set up environment
# The install commands below are disabled: they are wrapped in a string
# literal, so running this cell installs nothing. Remove the triple quotes
# to enable them. Because the string is the cell's only expression, the
# notebook echoes its repr as the cell output.
'''
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install matplotlib numpy pandas pyyaml opencv-python
'''

'\n!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n!pip install matplotlib numpy pandas pyyaml opencv-python\n'

# The following cells download the data

In [4]:
# this cell is for downloading data.
# as of yet data is not hosted and is available in the private data folder
# comment if not required
!pip install boto3
!pip install tqdm

Collecting boto3
  Downloading boto3-1.34.19-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m810.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.19 (from boto3)
  Downloading botocore-1.34.19-py3-none-any.whl (11.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.0-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.19 botocore-1.34.19 jmespath-1.0.1 s3transfer-0.10.0


In [5]:
# setup some imports
#custom imports
from transforms.transforms import ToTensor, Resize, CenterCrop
from dataloading.dataset import CropDataset
from common.utils import get_exp_params, init_config, get_config, save2config, get_modelinfo, get_saved_model, read_json, insert_index_2csv
from models.resnet18 import Resnet18
from models.custom_models import get_model
from experiments.experiments import Experiment
from visualization.visualization import Visualization
from experiments.test_model import ModelTester
from preprocess.preprocessor import Preprocessor
from tqdm import tqdm

#py imports
import random
import numpy as np
import os
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

In [6]:
import boto3
from pathlib import Path
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm

def get_file_folders(s3_client, bucket_name, prefix=""):
    """List all object keys of an S3 bucket, split into files and folders.

    The listing is paginated: ``list_objects_v2`` is called repeatedly,
    passing the ``NextContinuationToken`` of the previous response as
    ``ContinuationToken``, until a response carries no further token.

    Args:
        s3_client: Object exposing ``list_objects_v2(**kwargs)`` (a boto3 S3
            client in this notebook).
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are requested.

    Returns:
        A ``(file_names, folders)`` tuple of lists. Keys ending in ``"/"``
        are collected in ``folders``, every other key in ``file_names``,
        both in listing order. Both lists are empty when nothing matches.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # A response without matching keys has no "Contents" entry; treat it
        # as an empty page instead of iterating over None.
        for entry in response.get("Contents") or []:
            key = entry.get("Key")
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders

def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Download the given S3 keys into a directory tree below ``local_path``.

    Args:
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        bucket_name: Bucket the keys belong to.
        local_path: Local root directory; each key is appended to it.
        file_names: Object keys to download.
        folders: Folder keys to create as local directories.
    """
    destination_root = Path(local_path)

    # Create every folder key (including missing parents) as a directory.
    for folder_key in tqdm(folders):
        (destination_root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in tqdm(file_names):
        target = destination_root / object_key
        # Make sure the file's parent directory exists before downloading.
        target.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(target))

# Download the challenge data from S3 unless the image folder already exists.
# The presence check and the download destination are both derived from the
# current working directory (the project root once the %cd cell has run), so
# they can no longer point at two different locations.
BUCKET_NAME = 'cgiar-crop-damage-classification-challenge'
data_path = 'data/input/images'
input_dir = os.path.join(os.getcwd(), 'data', 'input')
if not os.path.exists(os.path.join(os.getcwd(), data_path)):
    # UNSIGNED disables request signing, i.e. the bucket is read anonymously.
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    file_names, folders = get_file_folders(client, BUCKET_NAME)
    # Keys are mirrored below input_dir; presumably the bucket stores the
    # images under an "images/" prefix, which is what the check above
    # looks for - TODO confirm.
    download_files(
        client,
        BUCKET_NAME,
        input_dir,
        file_names,
        folders
    )

In [7]:
# Initialize directories and config data, then load the config and show it.
init_config()
config = get_config()
# One print call: heading, blank line, then the config dictionary.
print('Config parameters\n', config, sep='\n')

Config parameters

{'X_key': 'image', 'data_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data', 'device': 'cpu', 'img_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data/input/images', 'output_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/output', 'root_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification', 'use_gpu': False, 'y_key': 'label'}


In [8]:
#clean up invalid images
#
# DISABLED: the routine below is wrapped in a string literal, so running this
# cell changes nothing on disk. Remove the triple quotes to run it once.
# What the routine does when enabled:
#   1. tries to open every file in the image folder and collects the names
#      of files PIL cannot identify,
#   2. deletes those files from config['img_dir'] (the folder they were
#      opened from; the earlier version looked in config['data_dir'], where
#      they are not stored, so nothing was ever removed),
#   3. writes the collected names to corrupt_files.json,
#   4. drops the matching rows from input/Train.csv and rewrites that file.

'''
from PIL import Image, UnidentifiedImageError
import json
import pandas as pd

image_files = os.listdir(os.path.join(config['root_dir'],'data/input/images'))
corrupt_files = []
print('scanning files for corrupt images\n\n')
for img_file in tqdm(image_files):
    try:
        img = Image.open(os.path.join(config['img_dir'], img_file))
    except UnidentifiedImageError as e:
        if os.path.exists(os.path.join(config["img_dir"], img_file)):
            os.remove(os.path.join(config["img_dir"], img_file))
        corrupt_files.append(img_file)
#corrupt_files = ['c3092a5186771280a99200624c4f67e33fde95ca.jpg']
data = { "data": corrupt_files }
with open('corrupt_files.json', 'w') as fp:
    json.dump(data, fp)

train_path = os.path.join(config['data_dir'], 'input/Train.csv')
train_df = pd.read_csv(train_path)
error_rows = train_df.loc[train_df['filename'].isin(corrupt_files)].index.tolist()
train_df = train_df.drop(labels = error_rows)
train_df.to_csv(train_path, index = False)
'''

'\nfrom PIL import Image, UnidentifiedImageError\nimport json\nimport pandas as pd\n\nimage_files = os.listdir(os.path.join(config[\'root_dir\'],\'data/input/images\'))\ncorrupt_files = []\nprint(\'scanning files for corrupt images\n\n\')\nfor img_file in tqdm(image_files):\n    try:\n        img = Image.open(os.path.join(config[\'img_dir\'], img_file))\n    except UnidentifiedImageError as e:\n        if os.path.exists(os.path.join(config["data_dir"], img_file)):\n            os.remove(os.path.join(config["data_dir"], img_file))\n        corrupt_files.append(img_file)\n#corrupt_files = [\'c3092a5186771280a99200624c4f67e33fde95ca.jpg\']\ndata = { "data": corrupt_files }\nwith open(\'corrupt_files.json\', \'w\') as fp:\n    json.dump(data, fp)\n\ntrain_path = os.path.join(config[\'data_dir\'], \'input/Train.csv\')\ntrain_df = pd.read_csv(train_path)\nerror_rows = train_df.loc[train_df[\'filename\'].isin(corrupt_files)].index.tolist()\ntrain_df = train_df.drop(labels = error_rows)\ntrain

In [9]:
# Insert an index column into both label CSVs (train first, then test).
for label_csv in ('input/Train.csv', 'input/Test.csv'):
    insert_index_2csv(os.path.join(config['data_dir'], label_csv))

In [10]:
# Read the experiment parameters and show them under a heading.
exp_params = get_exp_params()
# One print call: heading, blank line, then the parameter dictionary.
print('Experiment parameters\n', exp_params, sep='\n')

Experiment parameters

{'transform': {'resize_dim': 256, 'crop_dim': 224}, 'train': {'shuffle_data': False, 'batch_size': 128, 'val_split_method': 'k-fold', 'k': 3, 'val_percentage': 20, 'loss': 'cross-entropy', 'epoch_interval': 1, 'num_epochs': 5}, 'model': {'name': 'resnet18', 'optimizer': 'Adam', 'lr': 0.001, 'weight_decay': 1e-05, 'amsgrad': False, 'momentum': 0.9}, 'test_model': False}


In [11]:
# Initialize the randomness seed.
seed = 123
# Feed the same seed to Python's random module, NumPy, and torch (CPU and CUDA).
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [12]:
#preprocess data or load preprocessed data

# Build the label dictionaries from one ordered tuple of class names:
# label_dict maps class name -> integer id (its position in the tuple),
# class_dict is the inverse mapping, integer id -> class name.
label_dict = {name: idx for idx, name in enumerate(('DR', 'G', 'ND', 'WD', 'other'))}

class_dict = {idx: name for name, idx in label_dict.items()}

In [13]:
# Store the sample keys in the config: 'image' for X_key, 'label' for y_key.
for cfg_key, cfg_value in (('X_key', 'image'), ('y_key', 'label')):
    save2config(cfg_key, cfg_value)

# Transform pipeline: tensor conversion, resize, then center crop, with the
# sizes taken from the experiment parameters.
transform_cfg = exp_params['transform']
data_transforms = [
    ToTensor(),
    Resize(transform_cfg['resize_dim']),
    CenterCrop(transform_cfg['crop_dim']),
]

# Full datasets built from the label CSVs.
# NOTE(review): the third argument is presumably an "is test set" flag - confirm
# against CropDataset.
ftr_dataset = CropDataset('input/Train.csv', label_dict, False)
test_dataset = CropDataset('input/Test.csv', label_dict, True)

# Small subsets: the first 1% of each dataset (lengths rounded down).
smlen = int(0.01 * len(ftr_dataset))
smtelen = int(0.01 * len(test_dataset))
smftr_dataset = torch.utils.data.Subset(ftr_dataset, list(range(smlen)))
smfte_dataset = torch.utils.data.Subset(test_dataset, list(range(smtelen)))

print('Full train dataset length:', len(ftr_dataset))
print('Test dataset length:', len(test_dataset))
print('Subset train dataset length:', smlen)
print('Subset test dataset length:', smtelen, '\n')



Full train dataset length: 26067
Test dataset length: 8663
Subset train dataset length: 260
Subset test dataset length: 86 



In [14]:
# DISABLED: wrapped in a string literal, so nothing below is executed.
# When enabled it computes the dataset metrics for the small training subset
# (smftr_dataset) instead of the full training set.
'''
print("Getting metrics for small data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(smftr_dataset, data_transforms)
print(all_folds_metrics)
'''

'\nprint("Getting metrics for small data")\npreop = Preprocessor()\nall_folds_metrics = preop.get_dataset_metrics(smftr_dataset, data_transforms)\nprint(all_folds_metrics)\n'

In [15]:
# DISABLED: wrapped in a string literal, so nothing below is executed.
# When enabled it trains on the small training subset. It needs
# all_folds_metrics to exist, i.e. a metrics cell must have been run first.
'''
#running experiment on small subset of the dataset
exp = Experiment(exp_params['model']['name'], smftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()
'''

"\n#running experiment on small subset of the dataset\nexp = Experiment(exp_params['model']['name'], smftr_dataset, data_transforms, all_folds_metrics)\nmodel_history = exp.train()\n"

In [16]:
# Compute the dataset metrics for the full training set with the transform
# pipeline applied. The result (all_folds_metrics) is passed to Experiment in
# the training cell below.
print("Getting metrics for data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(ftr_dataset, data_transforms)
print(all_folds_metrics)

Getting metrics for data
	Calculating metric for split 0 starting with 0, ending with 8689


  batchlist = list(map(np.array, zip(*batch)))


		Getting metrics for batch 0
		Getting metrics for batch 1
		Getting metrics for batch 2
		Getting metrics for batch 3
		Getting metrics for batch 4
		Getting metrics for batch 5
		Getting metrics for batch 6
		Getting metrics for batch 7
		Getting metrics for batch 8
		Getting metrics for batch 9
		Getting metrics for batch 10
		Getting metrics for batch 11
		Getting metrics for batch 12
		Getting metrics for batch 13
		Getting metrics for batch 14
		Getting metrics for batch 15
		Getting metrics for batch 16
		Getting metrics for batch 17
		Getting metrics for batch 18
		Getting metrics for batch 19
		Getting metrics for batch 20
		Getting metrics for batch 21
		Getting metrics for batch 22
		Getting metrics for batch 23
		Getting metrics for batch 24
		Getting metrics for batch 25
		Getting metrics for batch 26
		Getting metrics for batch 27
		Getting metrics for batch 28
		Getting metrics for batch 29
		Getting metrics for batch 30
		Getting metrics for batch 31
		Getting metrics 

  batchlist = list(map(np.array, zip(*batch)))


		Getting metrics for batch 37


  batchlist = list(map(np.array, zip(*batch)))


		Getting metrics for batch 38
		Getting metrics for batch 39
		Getting metrics for batch 40
		Getting metrics for batch 41
		Getting metrics for batch 42
		Getting metrics for batch 43
		Getting metrics for batch 44
		Getting metrics for batch 45
		Getting metrics for batch 46
		Getting metrics for batch 47
		Getting metrics for batch 48
		Getting metrics for batch 49
		Getting metrics for batch 50
		Getting metrics for batch 51
		Getting metrics for batch 52
		Getting metrics for batch 53
		Getting metrics for batch 54
		Getting metrics for batch 55
		Getting metrics for batch 56
		Getting metrics for batch 57
		Getting metrics for batch 58
		Getting metrics for batch 59
		Getting metrics for batch 60
		Getting metrics for batch 61
		Getting metrics for batch 62
		Getting metrics for batch 63
		Getting metrics for batch 64
		Getting metrics for batch 65
		Getting metrics for batch 66
		Getting metrics for batch 67
		Getting metrics for batch 68
		Getting metrics for batch 69
		Gettin

In [None]:
# Model training on the full dataset.
# Builds an Experiment for the model named in the experiment parameters and
# trains it; the returned model_history is consumed by the visualization cell
# further down.
exp = Experiment(exp_params['model']['name'], ftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()

Running split 0 starting at 0 and ending with 8689
	Running Epoch 0
		Running through training dataset


In [None]:
# Get the best model: build the architecture named in the experiment
# parameters, load the saved state into it, and read the stored model info.
model = get_model(exp_params["model"]["name"])
# NOTE(review): the meaning of the empty-string argument passed to
# get_saved_model / get_modelinfo is not visible here - confirm in common.utils.
model = get_saved_model(model, '')
model_info = get_modelinfo('')
# Fold recorded in the saved results; used later to pick that fold's metrics.
best_fold = model_info['results']['fold']
# Reload the metrics from the checkpoint folder; this replaces the in-memory
# all_folds_metrics computed earlier in the notebook.
metric_path = os.path.join(config["root_dir"], "models/checkpoints/all_folds_metrics.json")
all_folds_metrics = read_json(metric_path)
print('All folds metrics')
print(all_folds_metrics)

# NOTE(review): the heading says "validation" but the printed key is
# 'trlosshistory', which looks like a training-loss history - confirm.
print("\nModel validation results")
print(model_info['results']['trlosshistory'])
#visualization results
# model_history is only defined by the training cell, so that cell must have
# been run in this kernel session before this line works.
vis = Visualization(model_info, model_history)
vis.get_results()

In [None]:
# DISABLED: wrapped in a string literal, so nothing below is executed.
# When enabled it tests the saved model on the small test subset
# (smfte_dataset) and needs model, best_fold and all_folds_metrics from the
# "get best model" cell.
'''
#model testing on small test dataset
print("\n\nTesting Saved Model")
metrics = all_folds_metrics[f'{best_fold}'] if exp_params['train']['val_split_method'] == 'k-fold' else all_folds_metrics
mt = ModelTester(model, smfte_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)
'''

In [None]:
# Model testing on the full test dataset.
print("\n\nTesting Saved Model")
# With k-fold validation the metrics are looked up under the best fold's key
# (the fold rendered as a string); otherwise the metrics object is used as-is.
if exp_params['train']['val_split_method'] == 'k-fold':
    metrics = all_folds_metrics[f'{best_fold}']
else:
    metrics = all_folds_metrics
mt = ModelTester(model, test_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)