In [1]:
# Mount Google Drive at /content/drive (Colab only), then list the current
# working directory to confirm the mount point is visible.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!ls

Mounted at /content/drive
drive  sample_data


In [2]:
# Move into the project directory on the mounted Drive and list its contents.
# Later cells import project packages (transforms, dataloading, common, ...)
# by bare name, so they rely on this being the working directory.
repo_name = "crop-damage-classification"
%cd /content/drive/MyDrive/Personal-Projects/$repo_name
!ls

/content/drive/MyDrive/Personal-Projects/crop-damage-classification
common		    data	 Index_bc.py  models	  preprocess_input.py	run.yaml
config.yaml	    dataloading  Index.ipynb  output	  project-structure.md	transforms
corrupt_files.json  experiments  index.py     preprocess  README.md		visualization


In [3]:
# Set up the environment.
# The install commands below are currently disabled: they are wrapped in a
# string literal, so this cell only evaluates to that string. Remove the
# surrounding triple quotes to actually run the installs.
'''
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install matplotlib numpy pandas pyyaml opencv-python
'''

'\n!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n!pip install matplotlib numpy pandas pyyaml opencv-python\n'

# Following cells are for downloading data

In [4]:
# Install the packages needed by the data-download cell further down
# (boto3 for S3 access, tqdm for progress bars). This cell does not download
# any data itself.
# Original note: as of yet data is not hosted and is available in the private data folder.
# Comment out if not required.
!pip install boto3
!pip install tqdm



In [5]:
# setup some imports
#custom imports
from transforms.transforms import ToTensor, Resize, CenterCrop
from dataloading.dataset import CropDataset
from common.utils import get_exp_params, init_config, get_config, save2config, get_modelinfo, get_saved_model, read_json, insert_index_2csv
from models.resnet18 import Resnet18
from models.custom_models import get_model
from experiments.experiments import Experiment
from visualization.visualization import Visualization
from experiments.test_model import ModelTester
from preprocess.preprocessor import Preprocessor
from tqdm import tqdm

#py imports
import random
import numpy as np
import os
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

In [6]:
import boto3
from pathlib import Path
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm

def get_file_folders(s3_client, bucket_name, prefix=""):
    """List every key in an S3 bucket, split into files and folder markers.

    Pages through ``list_objects_v2`` until no continuation token is returned.

    Args:
        s3_client: object exposing ``list_objects_v2(**kwargs)`` (e.g. a boto3
            S3 client).
        bucket_name: name of the bucket to list.
        prefix: only keys starting with this prefix are listed.

    Returns:
        ``(file_names, folders)``: two lists of keys in listing order. Keys
        ending in ``"/"`` go to ``folders``; all others go to ``file_names``.
        Both lists are empty when the listing contains no keys.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # "Contents" is missing from the response when a page has no keys,
        # so fall back to an empty list instead of iterating over None.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            if not key:
                continue
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        # A missing or empty token means this was the last page.
        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders

def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror the given S3 keys underneath ``local_path``.

    Args:
        s3_client: object exposing ``download_file(bucket, key, filename)``.
        bucket_name: bucket the keys belong to.
        local_path: local directory under which the key hierarchy is recreated.
        file_names: keys to download as files.
        folders: keys to create as (possibly empty) directories.
    """
    root = Path(local_path)

    for folder_key in tqdm(folders):
        # Create all folders in the path
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    for file_key in tqdm(file_names):
        target = root / file_key
        # Create folder for parent directory
        target.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, file_key, str(target))

# Download the challenge data only when the image folder is not already present.
# Both the existence check and the download target are resolved against the
# current working directory (the project root after the %cd cell), so the guard
# and the download can never point at different locations.
bucket_name = 'cgiar-crop-damage-classification-challenge'
data_path = 'data/input/images'
project_root = os.getcwd()
if not os.path.exists(os.path.join(project_root, data_path)):
    # Unsigned requests: the bucket is read without credentials.
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    file_names, folders = get_file_folders(client, bucket_name)
    download_files(
        client,
        bucket_name,
        # Keys are recreated below data/input, the parent of the image folder.
        os.path.join(project_root, os.path.dirname(data_path)),
        file_names,
        folders
    )

In [7]:
# Initialise directories and config data, then show the resulting config.
init_config()
config = get_config()
print('Config parameters\n', config, sep = '\n')

Config parameters

{'X_key': 'image', 'data_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data', 'device': 'cpu', 'img_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/data/input/images', 'output_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification/output', 'root_dir': '/content/drive/MyDrive/Personal-Projects/crop-damage-classification', 'use_gpu': False, 'y_key': 'label'}


In [8]:
# Clean up invalid images.
# This cell is currently disabled: the code below is wrapped in a string
# literal, so running the cell only evaluates to that string.
# What the disabled code does: tries to open every file in the image folder,
# records the ones PIL cannot identify in corrupt_files.json, and rewrites
# input/Train.csv without the rows whose 'filename' is in that list.
# NOTE(review): the image is opened from config['img_dir'], but the
# exists/remove calls build the path from config["data_dir"]. With the config
# printed above these are different folders, so the corrupt file would not be
# deleted. Confirm the intended directory before re-enabling.

'''
from PIL import Image, UnidentifiedImageError
import json
import pandas as pd

image_files = os.listdir(os.path.join(config['root_dir'],'data/input/images'))
corrupt_files = []
print('scanning files for corrupt images\n\n')
for img_file in tqdm(image_files):
    try:
        img = Image.open(os.path.join(config['img_dir'], img_file))
    except UnidentifiedImageError as e:
        if os.path.exists(os.path.join(config["data_dir"], img_file)):
            os.remove(os.path.join(config["data_dir"], img_file))
        corrupt_files.append(img_file)
#corrupt_files = ['c3092a5186771280a99200624c4f67e33fde95ca.jpg']
data = { "data": corrupt_files }
with open('corrupt_files.json', 'w') as fp:
    json.dump(data, fp)

train_path = os.path.join(config['data_dir'], 'input/Train.csv')
train_df = pd.read_csv(train_path)
error_rows = train_df.loc[train_df['filename'].isin(corrupt_files)].index.tolist()
train_df = train_df.drop(labels = error_rows)
train_df.to_csv(train_path, index = False)
'''

'\nfrom PIL import Image, UnidentifiedImageError\nimport json\nimport pandas as pd\n\nimage_files = os.listdir(os.path.join(config[\'root_dir\'],\'data/input/images\'))\ncorrupt_files = []\nprint(\'scanning files for corrupt images\n\n\')\nfor img_file in tqdm(image_files):\n    try:\n        img = Image.open(os.path.join(config[\'img_dir\'], img_file))\n    except UnidentifiedImageError as e:\n        if os.path.exists(os.path.join(config["data_dir"], img_file)):\n            os.remove(os.path.join(config["data_dir"], img_file))\n        corrupt_files.append(img_file)\n#corrupt_files = [\'c3092a5186771280a99200624c4f67e33fde95ca.jpg\']\ndata = { "data": corrupt_files }\nwith open(\'corrupt_files.json\', \'w\') as fp:\n    json.dump(data, fp)\n\ntrain_path = os.path.join(config[\'data_dir\'], \'input/Train.csv\')\ntrain_df = pd.read_csv(train_path)\nerror_rows = train_df.loc[train_df[\'filename\'].isin(corrupt_files)].index.tolist()\ntrain_df = train_df.drop(labels = error_rows)\ntrain

In [9]:
# Insert an index column into both label CSVs (train first, then test).
for label_csv in ('input/Train.csv', 'input/Test.csv'):
    insert_index_2csv(os.path.join(config['data_dir'], label_csv))

In [10]:
# Read the experiment parameters and show them.
exp_params = get_exp_params()
print('Experiment parameters\n', exp_params, sep = '\n')

Experiment parameters

{'transform': {'resize_dim': 350, 'crop_dim': 256}, 'train': {'shuffle_data': False, 'batch_size': 128, 'val_split_method': 'k-fold', 'k': 3, 'val_percentage': 20, 'loss': 'cross-entropy', 'epoch_interval': 1, 'num_epochs': 2}, 'model': {'name': 'alexnet', 'optimizer': 'Adam', 'lr': 0.001, 'weight_decay': 1e-05, 'amsgrad': False, 'momentum': 0.9}, 'test_model': False}


In [11]:
# Initialise the randomness seed.
seed = 123
# Seed each random number generator used by the notebook with the same value:
# Python's random module, NumPy, PyTorch (CPU) and PyTorch (CUDA).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [12]:
#preprocess data or load preprocessed data

# Class names in label-index order; both lookup tables are derived from this
# single list so they cannot drift apart.
class_names = ['DR', 'G', 'ND', 'WD', 'other']

# class name -> integer label
label_dict = {name: idx for idx, name in enumerate(class_names)}

# integer label -> class name
class_dict = dict(enumerate(class_names))

In [13]:
# Record which sample keys hold the input image and the label.
save2config('X_key', 'image')
save2config('y_key', 'label')

# Transform pipeline: to-tensor, resize, then centre crop, with the sizes taken
# from the experiment parameters.
transform_params = exp_params['transform']
data_transforms = [
    ToTensor(),
    Resize(transform_params['resize_dim']),
    CenterCrop(transform_params['crop_dim']),
]

# Full train and test datasets.
ftr_dataset = CropDataset('input/Train.csv', label_dict, False)
test_dataset = CropDataset('input/Test.csv', label_dict, True)

# Small subsets made of the first 1% of each dataset.
smlen = int(0.01 * len(ftr_dataset))
smftr_dataset = torch.utils.data.Subset(ftr_dataset, list(range(smlen)))
smtelen = int(0.01 * len(test_dataset))
smfte_dataset = torch.utils.data.Subset(test_dataset, list(range(smtelen)))

for caption, count in (
    ('Full train dataset length:', len(ftr_dataset)),
    ('Test dataset length:', len(test_dataset)),
    ('Subset train dataset length:', smlen),
):
    print(caption, count)
print('Subset test dataset length:', smtelen, '\n')



Full train dataset length: 26068
Test dataset length: 8663
Subset train dataset length: 260
Subset test dataset length: 86 



In [14]:
# Disabled cell: the code is wrapped in a string literal, so running the cell
# only evaluates to that string. When enabled it computes dataset metrics for
# the small training subset via Preprocessor.get_dataset_metrics.
'''
print("Getting metrics for small data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(smftr_dataset, data_transforms)
print(all_folds_metrics)
'''

'\nprint("Getting metrics for small data")\npreop = Preprocessor()\nall_folds_metrics = preop.get_dataset_metrics(smftr_dataset, data_transforms)\nprint(all_folds_metrics)\n'

In [15]:
# Disabled cell: the code is wrapped in a string literal, so running the cell
# only evaluates to that string. When enabled it loads the saved metrics JSON
# and trains on the small training subset.
'''
#running experiment on small subset of the dataset
all_folds_metrics = read_json(os.path.join(config['root_dir'], 'models/checkpoints/all_folds_metrics.json'))
all_folds_metrics = {int(k): v for k,v in all_folds_metrics.items()}
exp = Experiment(exp_params['model']['name'], smftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()
'''

"\n#running experiment on small subset of the dataset\nall_folds_metrics = read_json(os.path.join(config['root_dir'], 'models/checkpoints/all_folds_metrics.json'))\nall_folds_metrics = {int(k): v for k,v in all_folds_metrics.items()}\nexp = Experiment(exp_params['model']['name'], smftr_dataset, data_transforms, all_folds_metrics)\nmodel_history = exp.train()\n"

In [16]:
# Disabled cell: the code is wrapped in a string literal, so running the cell
# only evaluates to that string. When enabled it computes dataset metrics for
# the full training dataset via Preprocessor.get_dataset_metrics.
'''
print("Getting metrics for data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(ftr_dataset, data_transforms)
print(all_folds_metrics)
'''

'\nprint("Getting metrics for data")\npreop = Preprocessor()\nall_folds_metrics = preop.get_dataset_metrics(ftr_dataset, data_transforms)\nprint(all_folds_metrics)\n'

In [17]:
# Model training on the full dataset.
metrics_path = os.path.join(config['root_dir'], 'models/checkpoints/all_folds_metrics.json')
# Keys come back from JSON as strings; convert them to ints before use.
all_folds_metrics = {
    int(fold_key): fold_stats
    for fold_key, fold_stats in read_json(metrics_path).items()
}
print(all_folds_metrics)
print('\n\n')

exp = Experiment(exp_params['model']['name'], ftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()

{0: {'mean': [0.0017509627575967826, 0.001740593769971062, 0.001279151205923043], 'std0': [0.0009911220447689879, 0.0009941544018539728, 0.0011366211900524065]}, 1: {'mean': [0.001751121586444331, 0.0017421398677077947, 0.0012799400909274232], 'std0': [0.0009921487639932072, 0.0009948029237634996, 0.001136322816212972]}, 2: {'mean': [0.0017505734574561026, 0.0017417813048643223, 0.0012787652950660856], 'std0': [0.0009890975905399696, 0.0009928238158132515, 0.0011353534810683308]}, 3: {'mean': [0.0017509331890180999, 0.0017415279266881007, 0.0012793144759009865], 'std0': [0.0009907820645500632, 0.0009939293066660562, 0.0011360991234872855]}}



None



Running split 0 starting at 0 and ending with 8689
	Running Epoch 0


		Running through training set:   4%|▎         | 5/136 [09:24<4:06:34, 112.94s/it]


KeyboardInterrupt: 

In [None]:
# Get the best model: build the architecture named in the experiment
# parameters, then load saved weights and the saved model info.
# NOTE(review): both helpers are called with an empty string; what that
# selects is defined in common.utils and is not visible here — confirm.
model = get_model(exp_params["model"]["name"])
model = get_saved_model(model, '')
model_info = get_modelinfo('')
best_fold = model_info['results']['fold']
# Re-read the metrics from disk. Unlike the training cell, the keys are NOT
# converted to int here, so they stay strings; the test cells below index
# this dict with a string key.
metric_path = os.path.join(config["root_dir"], "models/checkpoints/all_folds_metrics.json")
all_folds_metrics = read_json(metric_path)
print('All folds metrics')
print(all_folds_metrics)

print("\nModel validation results")
# NOTE(review): the heading says "validation" but the value printed is
# results['trlosshistory'] — confirm which history is intended.
print(model_info['results']['trlosshistory'])
#visualization results
# NOTE(review): model_history is only assigned by the training cell
# (exp.train()); on a fresh kernel, or if training was interrupted, this name
# is undefined and the line below raises NameError.
vis = Visualization(model_info, model_history)
vis.get_results()

In [None]:
from common.utils import get_exp_params, get_accuracy, get_config, get_model_filename, save_experiment_output, image_collate
from torch.utils.data import DataLoader, Subset
import torch
from matplotlib import pyplot as plt
import os
import pandas as pd
import torch.nn.functional as F
from torchvision.transforms import Normalize, Compose
import numpy as np
from tqdm import tqdm
import warnings

class ModelTester:
    """Run a trained model over a test dataset and checkpoint its outputs to CSV.

    Results are written to ``<output_dir>/results.csv`` after every batch, so an
    interrupted run can be resumed by calling ``test_and_save_csv`` again.

    NOTE(review): this notebook-level class shadows the ``ModelTester`` imported
    from ``experiments.test_model`` earlier in the notebook.
    """

    def __init__(self, model, te_dataset, data_transforms, metrics):
        """Store the model, dataset and preprocessing settings.

        Args:
            model: network to evaluate. It is moved to the CPU and switched to
                eval mode here; ``test_and_save_csv`` moves it to the
                configured device.
            te_dataset: dataset to run inference on.
            data_transforms: list of transform callables; composed and applied
                to every image before normalisation.
            metrics: mapping with ``'mean'`` and ``'std0'`` entries, passed to
                ``Normalize``.
        """
        cfg = get_config()
        self.te_dataset = te_dataset
        self.model = model.cpu()
        self.model.eval()
        self.exp_params = get_exp_params()
        self.te_loader = DataLoader(self.te_dataset,
            batch_size = self.exp_params['train']['batch_size'],
            shuffle = False
        )
        self.output_dir = cfg['output_dir']
        self.device = cfg['device']
        # Key under which a sample stores its image; read by __plot_results.
        # Falls back to 'image', the value the notebook saves to the config.
        self.X_key = cfg['X_key'] if 'X_key' in cfg else 'image'
        self.metrics = metrics
        self.data_transforms = data_transforms
        self.test_df = pd.read_csv(os.path.join(cfg['data_dir'], 'input/Test.csv'))

    @staticmethod
    def _resume_position(no_rows, bsize):
        """Return ``(batch_index, row_offset)`` at which inference should resume.

        The batch containing the last saved row is always re-run: a trailing
        partial batch is redone, and when the row count is an exact multiple of
        the batch size the last full batch is redone. An empty results file
        resumes from the very beginning.

        Args:
            no_rows: number of rows already present in the results file.
            bsize: batch size used when the rows were written.

        Returns:
            Tuple of the first batch index to process and the number of saved
            rows to keep.
        """
        if no_rows <= 0:
            return 0, 0
        batch_index = (no_rows - 1) // bsize
        return batch_index, batch_index * bsize

    def __plot_results(self, predicted_labels, subset_len = 10):
        """Plot the first ``subset_len`` test images titled with their predictions.

        The figure is saved to ``<output_dir>/sample_test_results.png`` and then
        shown.

        Args:
            predicted_labels: sequence of titles, one per plotted image.
            subset_len: maximum number of images to plot.
        """
        # Never ask for more samples than there are images or labels.
        subset_len = min(subset_len, len(self.te_dataset), len(predicted_labels))
        if subset_len <= 0:
            return
        subset_dataset = Subset(self.te_dataset, list(range(subset_len)))
        subset_loader = DataLoader(subset_dataset, batch_size = 1, shuffle = False)
        fig = plt.figure(figsize = (subset_len, 1))
        for bi, batch in enumerate(subset_loader):
            # NOTE(review): assumes each sample is a mapping holding the image
            # under X_key in a layout imshow accepts (H, W, C) — confirm.
            img = batch[self.X_key][0]
            ax = fig.add_subplot(1, subset_len, bi + 1)
            ax.set_title(predicted_labels[bi])
            ax.imshow(np.asarray(img))
            ax.axis('off')
        # Save before showing: after plt.show() the figure may already be
        # released, which would leave a blank image on disk.
        fig.savefig(os.path.join(self.output_dir, "sample_test_results.png"))
        plt.show()
        plt.close(fig)

    def test_and_save_csv(self, lbl_dict, plot_sample_results = False):
        """Run inference over the test dataset and write outputs to ``results.csv``.

        If ``results.csv`` already exists, the rows of the last (possibly
        partial) batch are discarded and inference resumes from that batch. The
        file is rewritten after every batch.

        Args:
            lbl_dict: mapping from class index to class name. Currently unused;
                kept for interface compatibility.
            plot_sample_results: currently unused; kept for interface
                compatibility.
        """
        # Silences all warnings for the rest of the process (global filter).
        warnings.filterwarnings('ignore')
        self.model = self.model.to(self.device)
        bsize = self.exp_params['train']['batch_size']
        test_loader = DataLoader(self.te_dataset,
            batch_size = bsize, shuffle = False,
            collate_fn = image_collate)
        self.model.eval()
        sub_lbls = ['ID', 'DR', 'G', 'ND', 'WD', 'other']
        rpath = os.path.join(self.output_dir, "results.csv")
        if os.path.exists(rpath):
            results_df = pd.read_csv(rpath)
            cbi, keep_rows = self._resume_position(len(results_df), bsize)
            # Drop the rows of the batch that is about to be re-run.
            results_df = results_df.iloc[:keep_rows]
        else:
            results_df = pd.DataFrame([], columns = sub_lbls)
            cbi = 0

        data_transforms = Compose(self.data_transforms)
        normalize = Normalize(self.metrics['mean'], self.metrics['std0'])
        with torch.no_grad():
            for bi, batch in enumerate(tqdm(test_loader, desc = 'Running through test dataset: ', position = 0, leave = True)):
                if bi < cbi:
                    # Already saved by a previous run.
                    continue
                # batch[1] holds the images, batch[0] the row labels into
                # Test.csv (as produced by image_collate).
                img_batch = np.stack(list(map(data_transforms, batch[1])), 0)
                # Normalise exactly once; the input used to be normalised a
                # second time just before the forward pass.
                img_batch = normalize(torch.from_numpy(img_batch)).to(self.device)
                img_ids = self.test_df.loc[batch[0],'ID'].tolist()
                op = self.model(img_batch)
                # NOTE(review): the raw model outputs are written, not
                # probabilities. Apply F.softmax(op, dim = 1) here if the
                # consumer of results.csv expects probabilities — confirm.
                res = [[img_id] + preds for img_id, preds in zip(img_ids, op.tolist())]
                batch_df = pd.DataFrame(res, columns = sub_lbls)
                # axis must be passed by keyword on pandas >= 2.0.
                results_df = pd.concat([results_df, batch_df], axis = 0, ignore_index = True)
                results_df.to_csv(rpath, index = False)


In [None]:
# Model testing on the small test subset.
print("\n\nTesting Saved Model")
# With k-fold validation the metrics are stored per fold, so pick the entry of
# the best fold; otherwise the loaded metrics are used as they are.
if exp_params['train']['val_split_method'] == 'k-fold':
    metrics = all_folds_metrics[f'{best_fold}']
else:
    metrics = all_folds_metrics
mt = ModelTester(model, smfte_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)

In [None]:
# Model testing on the full test dataset.
print("\n\nTesting Saved Model")
# With k-fold validation the metrics are stored per fold, so pick the entry of
# the best fold; otherwise the loaded metrics are used as they are.
if exp_params['train']['val_split_method'] == 'k-fold':
    metrics = all_folds_metrics[f'{best_fold}']
else:
    metrics = all_folds_metrics
mt = ModelTester(model, test_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)