In [1]:
#mount drive
# Colab-only: mounts Google Drive at /content/drive. force_remount=True asks for
# a fresh mount even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Show the contents of the current working directory.
!ls

MessageError: Error: credential propagation was unsuccessful

In [None]:
# move into project directory
# repo_name is interpolated into the %cd line through IPython's $variable expansion.
repo_name = "crop-damage-classification"
%cd /content/drive/MyDrive/Personal-Projects/$repo_name
# Show the contents of the project directory.
!ls

In [None]:
# set up environment
# Uncomment the install lines below if the packages are not available yet.
# They are kept as real comments instead of a triple-quoted string, because a
# string literal at the end of a cell is echoed back as the cell's output.
# %pip is used so the packages land in the environment of the running kernel.
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install matplotlib numpy pandas pyyaml opencv-python

# The following cells download the data

In [None]:
# this cell is for downloading data.
# as of yet data is not hosted and is available in the private data folder
# comment if not required
!pip install boto3
!pip install tqdm

In [None]:
# setup some imports
#custom imports
from transforms.transforms import ToTensor, Resize, CenterCrop
from dataloading.dataset import CropDataset
from common.utils import get_exp_params, init_config, get_config, save2config, get_modelinfo, get_saved_model, read_json, insert_index_2csv
from models.resnet18 import Resnet18
from models.custom_models import get_model
from experiments.experiments import Experiment
from visualization.visualization import Visualization
from experiments.test_model import ModelTester
from preprocess.preprocessor import Preprocessor
from tqdm import tqdm

#py imports
import random
import numpy as np
import os
import torch
from torchvision import transforms
from torch.utils.data import DataLoader

In [None]:
import boto3
from pathlib import Path
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm

def get_file_folders(s3_client, bucket_name, prefix=""):
    """List all object keys in a bucket, split into files and folder markers.

    Calls ``list_objects_v2`` repeatedly, following ``NextContinuationToken``
    until the response no longer carries one.

    Args:
        s3_client: Object exposing ``list_objects_v2(**kwargs)`` (e.g. a boto3
            S3 client).
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are requested; the default
            ``""`` lists the whole bucket.

    Returns:
        A tuple ``(file_names, folders)``. Keys ending in ``"/"`` are collected
        in ``folders``; every other key goes to ``file_names``. Both lists are
        empty when the listing has no contents.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # "Contents" can be missing from the response when nothing matches;
        # fall back to an empty list instead of iterating over None.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            # endswith() also copes with an empty key, unlike key[-1].
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        # A missing (or empty) token means this was the last page.
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders

def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Download the given S3 keys into a local directory tree.

    Args:
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        bucket_name: Bucket the keys belong to.
        local_path: Local root directory; every key is placed beneath it.
        file_names: Object keys to download.
        folders: Folder keys to create as (possibly empty) local directories.
    """
    destination_root = Path(local_path)

    # Create every listed folder, including any missing ancestors.
    for folder_key in tqdm(folders):
        (destination_root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in tqdm(file_names):
        target = destination_root / object_key
        # Make sure the file's parent directory exists before downloading.
        target.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(target))

# Download the challenge dataset unless the images directory already exists.
# The download target is derived from the current working directory, the same
# base the existence check uses, instead of a hard-coded absolute Drive path.
# That keeps the check and the destination in agreement wherever the project
# is checked out. (With the working directory set to the project folder on
# Drive, this resolves to the previously hard-coded location.)
bucket_name = 'cgiar-crop-damage-classification-challenge'
data_path = 'data/input/images'
input_dir = os.path.join(os.getcwd(), 'data/input')
if not os.path.exists(os.path.join(os.getcwd(), data_path)):
    # signature_version=UNSIGNED: requests are sent without being signed.
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    file_names, folders = get_file_folders(client, bucket_name)
    # NOTE(review): presumably the bucket keys start with "images/", so files
    # end up under data_path. Confirm against the bucket layout.
    download_files(client, bucket_name, input_dir, file_names, folders)

In [None]:
# Initialize directories and config data, then load the config and display it.
init_config()
config = get_config()
print('Config parameters\n', config, sep='\n')

In [None]:
#clean up invalid images
# Disabled cell: everything below is wrapped in a string literal, so none of it
# runs. Remove the surrounding triple quotes to enable it.
# When enabled it (1) tries to open every file listed under data/input/images,
# (2) records files PIL cannot identify in corrupt_files.json, and (3) drops the
# matching rows from input/Train.csv, overwriting that file in place.
# NOTE(review): images are opened from config['img_dir'] but removed from
# config["data_dir"]; if those directories differ the corrupt file is never
# deleted. Confirm which directory is intended before re-enabling.

'''
from PIL import Image, UnidentifiedImageError
import json
import pandas as pd

image_files = os.listdir(os.path.join(config['root_dir'],'data/input/images'))
corrupt_files = []
print('scanning files for corrupt images\n\n')
for img_file in tqdm(image_files):
    try:
        img = Image.open(os.path.join(config['img_dir'], img_file))
    except UnidentifiedImageError as e:
        if os.path.exists(os.path.join(config["data_dir"], img_file)):
            os.remove(os.path.join(config["data_dir"], img_file))
        corrupt_files.append(img_file)
#corrupt_files = ['c3092a5186771280a99200624c4f67e33fde95ca.jpg']
data = { "data": corrupt_files }
with open('corrupt_files.json', 'w') as fp:
    json.dump(data, fp)

train_path = os.path.join(config['data_dir'], 'input/Train.csv')
train_df = pd.read_csv(train_path)
error_rows = train_df.loc[train_df['filename'].isin(corrupt_files)].index.tolist()
train_df = train_df.drop(labels = error_rows)
train_df.to_csv(train_path, index = False)
'''

In [None]:
# Insert an index column into both label CSVs (train first, then test).
for csv_name in ('input/Train.csv', 'input/Test.csv'):
    insert_index_2csv(os.path.join(config['data_dir'], csv_name))

In [None]:
# Load the experiment parameters and display them.
exp_params = get_exp_params()
print('Experiment parameters\n', exp_params, sep='\n')

In [None]:
# Seed every random number generator used here with the same fixed value:
# Python's random module, NumPy, and PyTorch (default and CUDA generators).
seed = 123
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed)

# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
#preprocess data or load preprocessed data

# Class name -> integer id, numbered in the order the names are listed.
label_dict = {
    class_name: class_id
    for class_id, class_name in enumerate(['DR', 'G', 'ND', 'WD', 'other'])
}

# Integer id -> class name: the inverse of label_dict, derived from it so the
# two mappings cannot drift apart.
class_dict = {class_id: class_name for class_name, class_id in label_dict.items()}

In [None]:
# Store the X_key / y_key entries in the config.
save2config('X_key', 'image')
save2config('y_key', 'label')

# Transform list: tensor conversion, resize, then centre crop, with the sizes
# taken from the experiment parameters.
data_transforms = [
    ToTensor(),
    Resize(exp_params['transform']['resize_dim']),
    CenterCrop(exp_params['transform']['crop_dim']),
]

# Full datasets built from the label CSVs.
ftr_dataset = CropDataset('input/Train.csv', label_dict, False)
test_dataset = CropDataset('input/Test.csv', label_dict, True)

# Small subsets holding the first 1% of each dataset.
smlen = int(0.01 * len(ftr_dataset))
smtelen = int(0.01 * len(test_dataset))
smftr_dataset = torch.utils.data.Subset(ftr_dataset, list(range(smlen)))
smfte_dataset = torch.utils.data.Subset(test_dataset, list(range(smtelen)))

for caption, size in (
    ('Full train dataset length:', len(ftr_dataset)),
    ('Test dataset length:', len(test_dataset)),
    ('Subset train dataset length:', smlen),
):
    print(caption, size)
print('Subset test dataset length:', smtelen, '\n')



In [None]:
# Disabled cell: the code is wrapped in a string literal, so it is not executed.
# Remove the surrounding triple quotes to compute the dataset metrics on the
# small training subset (smftr_dataset).
'''
print("Getting metrics for small data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(smftr_dataset, data_transforms)
print(all_folds_metrics)
'''

In [None]:
# Disabled cell: the code is wrapped in a string literal, so it is not executed.
# Remove the surrounding triple quotes to train on the small subset
# (smftr_dataset) using the metrics stored in all_folds_metrics.json.
'''
#running experiment on small subset of the dataset
all_folds_metrics = read_json(os.path.join(config['root_dir'], 'models/checkpoints/all_folds_metrics.json'))
all_folds_metrics = {int(k): v for k,v in all_folds_metrics.items()}
exp = Experiment(exp_params['model']['name'], smftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()
'''

In [None]:
# Disabled cell: the code is wrapped in a string literal, so it is not executed.
# Remove the surrounding triple quotes to compute the dataset metrics on the
# full training dataset (ftr_dataset).
'''
print("Getting metrics for data")
preop = Preprocessor()
all_folds_metrics = preop.get_dataset_metrics(ftr_dataset, data_transforms)
print(all_folds_metrics)
'''

In [None]:
# Train on the full dataset, using the metrics stored in all_folds_metrics.json.
metrics_file = os.path.join(config['root_dir'], 'models/checkpoints/all_folds_metrics.json')
stored_metrics = read_json(metrics_file)
# Convert the keys read from the JSON file to ints.
all_folds_metrics = {int(raw_key): entry for raw_key, entry in stored_metrics.items()}
print(all_folds_metrics)
print('\n\n')

model_name = exp_params['model']['name']
exp = Experiment(model_name, ftr_dataset, data_transforms, all_folds_metrics)
model_history = exp.train()

In [None]:
# Load the saved model and its info, then show the stored metrics and results.
model = get_saved_model(get_model(exp_params["model"]["name"]), '')
model_info = get_modelinfo('')
results = model_info['results']
best_fold = results['fold']

# The metrics are re-read without converting keys to int, so they stay as read
# from the JSON file.
metric_path = os.path.join(config["root_dir"], "models/checkpoints/all_folds_metrics.json")
all_folds_metrics = read_json(metric_path)
print('All folds metrics')
print(all_folds_metrics)

print("\nModel validation results")
print(results['trlosshistory'])

# Visualize the results; model_history comes from the training cell.
vis = Visualization(model_info, model_history)
vis.get_results()

In [None]:
# Disabled cell: the code is wrapped in a string literal, so it is not executed.
# Remove the surrounding triple quotes to test the saved model on the small
# test subset (smfte_dataset).
'''
#model testing on small test dataset
print("\n\nTesting Saved Model")
metrics = all_folds_metrics[f'{best_fold}'] if exp_params['train']['val_split_method'] == 'k-fold' else all_folds_metrics
mt = ModelTester(model, smfte_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)
'''

In [None]:
# Test the saved model on the full test dataset.
print("\n\nTesting Saved Model")

# With k-fold validation only the best fold's entry is passed on (looked up by
# its string key); otherwise the whole metrics object is used.
if exp_params['train']['val_split_method'] == 'k-fold':
    metrics = all_folds_metrics[f'{best_fold}']
else:
    metrics = all_folds_metrics

mt = ModelTester(model, test_dataset, data_transforms, metrics)
mt.test_and_save_csv(class_dict)