# Imports

## Important
Before running this notebook, select the image `gdsc5-smstudio-custom/1` for its kernel.

In [30]:
!pip install imutils
!pip install docutils
!pip install -U sagemaker



In [31]:
import matplotlib.pyplot as plt  # Used for plotting
%matplotlib inline
import pandas as pd  # Home of the DataFrame construct, _the_ most important object for Data Science
import numpy as np
import sys  # Python system library needed to load custom functions
import os
import json
import re
import tarfile

from matplotlib.patches import Rectangle  # Allows drawing the bounding boxes of the worm sections
from PIL import Image  # For loading image files
from tqdm import tqdm  # for timing a for loop

import tarfile
import os
import os.path
import shutil

In [32]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Framework, Estimator
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput
from sagemaker.session import TrainingInput
from sagemaker.pytorch import PyTorch
from sagemaker_training import (
    entry_point,
    environment,
    errors,
    files,
    intermediate_output,
    logging_config,
    params,
    runner,
)

# AWS Configuration

In [33]:
# Create the SageMaker session, then look up the execution role and the session's default bucket.
sagemaker_session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Echo the resolved configuration, one item per line.
print(
    f'SageMaker Session: {sagemaker_session}',
    f'SageMaker role: {sagemaker_role}',
    f'Bucket: {bucket}',
    sep='\n'
)

SageMaker Session: <sagemaker.session.Session object at 0x7fa9c0942198>
SageMaker role: arn:aws:iam::494549386743:role/service-role/AmazonSageMaker-ExecutionRole-20220618T161542
Bucket: sagemaker-us-east-1-494549386743


In [34]:
# Bare IAM role name; `role` is reassigned from get_execution_role() in the next cell,
# so this value is only in effect if that cell is skipped.
role='AmazonSageMaker-ExecutionRole-20220618T161542'
role

'AmazonSageMaker-ExecutionRole-20220618T161542'

In [35]:
# Re-create the session and resolve the execution role through get_execution_role();
# this `role` value is the one handed to the estimator further down.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
role

'arn:aws:iam::494549386743:role/service-role/AmazonSageMaker-ExecutionRole-20220618T161542'

In [36]:
!aws configure list

      Name                    Value             Type    Location
      ----                    -----             ----    --------
   profile                <not set>             None    None
access_key     ****************3YFS   container-role    
secret_key     ****************YdK8   container-role    
    region                us-east-1              env    AWS_DEFAULT_REGION


In [37]:
!aws s3api list-buckets --query "Buckets[].Name"

[
    "sagemaker-studio-jna7gyyxsih",
    "sagemaker-us-east-1-494549386743"
]


In [38]:
# All locations below live in the hard-coded SageMaker Studio bucket; the session's
# default bucket (`bucket`) is not part of any of these paths.
s3_input = "s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/training-inputs"
s3_images = "s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/dataset" # Images files are here, in a subfolder named 'train'
s3_labels = "s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/labels" # Label files are here, in a subfolder named 'train'

s3_output = 's3://sagemaker-studio-jna7gyyxsih/yolo-output'
print(s3_input)
print(s3_images)
print(s3_labels)

# cfg  images weights labels
cfg = f'{s3_input}/input/models/'
weights = f'{s3_input}/input/data/weights/'
outpath = f'{s3_output}/'


print(cfg)
print(weights)
print(outpath)

images = f'{s3_images}/'
labels = f'{s3_labels}/'

print(images)
print(labels)

s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/training-inputs
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/dataset
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/labels
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/training-inputs/input/models/
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/training-inputs/input/data/weights/
s3://sagemaker-studio-jna7gyyxsih/yolo-output/
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/dataset/
s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/labels/


In [11]:
# Training image URI (tag: PyTorch 1.11.0, GPU, py38); passed to the estimator as `image_uri`.
container='763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker'
container

'763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker'

# Job Configuration

In [39]:
# JSON encode hyperparameters.
def json_encode_hyperparameters(hyperparameters):
    """Return a new dict with every key converted to ``str`` and every value JSON-encoded."""
    encoded = {}
    for name, value in hyperparameters.items():
        encoded[str(name)] = json.dumps(value)
    return encoded


# Hyperparameters handed to the estimator.
hyperparameters = {}
hyperparameters["epochs"] = 10
hyperparameters["batch-size"] = 6

# One TrainingInput channel per S3 location defined above.
inputs = {
    channel: TrainingInput(s3_uri)
    for channel, s3_uri in (
        ("cfg", cfg),
        ("images", images),
        ("weights", weights),
        ("labels", labels),
    )
}

In [40]:
# Configure the SageMaker PyTorch estimator whose entry point is train.py.
estimator = PyTorch(
    entry_point='train.py',
    # Same S3 key that the "Upload Custom yolov5 to S3 Bucket" cell writes input.tar.gz to.
    source_dir='s3://sagemaker-studio-jna7gyyxsih/yolo-input/visualsearch/training-inputs/input/input.tar.gz',
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    model_dir='/opt/ml/model',
    max_run=2*24*60*60,  # 172800, i.e. two days expressed in seconds
    input_mode='File',
    output_path=outpath,
    # NOTE(review): `train_output` does not look like a documented estimator argument — confirm it has any effect.
    train_output=outpath,
    base_job_name='visualsearch-yolov5',
    # user_entry_point="train.py",
    hyperparameters=hyperparameters,
    # NOTE(review): `container` is tagged PyTorch 1.11.0 while framework_version says 1.9 — confirm which one is intended.
    framework_version='1.9',
    py_version='py38'
)

# Make Custom Changes To yolov5

 - detect.py -> Changes Bounding Box Output Format
 - train.py -> Configured Image Size and Data yaml (Coco to Visualsearch)
 - requirements.txt -> Updated TorchVision Version
 - utils/general.py -> Increased Time Limit

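If you want to resume a previous training run:

1. In `changes/train.py` (line 481), change the default value of `--weights` to `'best.pt'`.
1. Uncomment and run the next cell to copy the `best.pt` weights into the `changes` folder.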

In [41]:
# import shutil
# source_path='/home/sagemaker-user/river-blindness/outputs/weights/best.pt'
# target_path='/home/sagemaker-user/river-blindness/changes/'

# shutil.copy(source_path, target_path)

In [42]:
# If the current working directory is the yolov5 checkout itself, step back up to its parent.
directory_path = os.getcwd()
folder_name = os.path.basename(directory_path)

if(folder_name == "yolov5"):
    %cd ..

In [43]:
%cd /home/sagemaker-user/river-blindness

/home/sagemaker-user/river-blindness


In [44]:
# Update the local yolov5 checkout when it exists, otherwise clone it.
# NOTE(review): the recorded output of this cell shows `git pull` aborting because of
# local changes to tracked files — TODO decide how the checkout should be refreshed.
if(os.path.isdir('yolov5')):
    # If Repo already cloned
    %cd yolov5
    !git pull
    %cd ..
else:
    #If Repo not cloned
    !git clone https://github.com/ultralytics/yolov5.git

/home/sagemaker-user/river-blindness/yolov5
Updating 9d7bc06..2e57b84
error: Your local changes to the following files would be overwritten by merge:
	detect.py
	models/common.py
	requirements.txt
	train.py
	utils/general.py
	utils/metrics.py
Please commit your changes or stash them before you merge.
Aborting
/home/sagemaker-user/river-blindness


In [45]:
# Copy the customised files from `changes` over the cloned yolov5 checkout,
# mirroring the sub-folder structure.
yolov5_path="yolov5"
changes_path="changes"

# The loop variables are named so that they do not rebind `files`, which was
# imported from sagemaker_training at the top of the notebook.
for src_dir, _subdirs, file_names in os.walk(changes_path):
    # Map "changes/<sub>" to "yolov5/<sub>" via the relative path instead of a string replace.
    dst_dir = os.path.normpath(
        os.path.join(yolov5_path, os.path.relpath(src_dir, changes_path))
    )
    os.makedirs(dst_dir, exist_ok=True)
    for file_name in file_names:
        src_file = os.path.join(src_dir, file_name)
        dst_file = os.path.join(dst_dir, file_name)
        if os.path.exists(dst_file):
            # in case of the src and dst are the same file
            if os.path.samefile(src_file, dst_file):
                continue
            os.remove(dst_file)
        shutil.copy(src_file, dst_file)

In [46]:
def make_tarfile_without_parent(output_filename, source_dir):
    """Write a gzip-compressed tar archive of everything inside ``source_dir``.

    Each direct child of ``source_dir`` is added under its own name, so the
    archive members are not prefixed with the ``source_dir`` folder itself.
    """
    with tarfile.open(output_filename, "w:gz") as archive:
        for entry in os.listdir(source_dir):
            archive.add(os.path.join(source_dir, entry), arcname=entry)

In [47]:
# Bundle the contents of the yolov5 folder as input.tar.gz; members are stored without the parent folder name.
make_tarfile_without_parent("input.tar.gz", yolov5_path)

## Upload Custom yolov5 to S3 Bucket

In [48]:
import boto3, os
# Upload the bundle to the S3 key that the estimator's `source_dir` points to.
s3 = boto3.resource('s3')
s3.meta.client.upload_file("input.tar.gz","sagemaker-studio-jna7gyyxsih", "yolo-input/visualsearch/training-inputs/input/input.tar.gz")

# Run Training Job

In [49]:
# Look up the most recently created training job so that it can be stopped from the
# next cell if needed. Re-run this cell after estimator.fit(...) to pick up the job it started.
import boto3

client = boto3.client('sagemaker')
print(f'client: {client}')
# Ask explicitly for newest-first ordering instead of relying on the service default.
recent_jobs = client.list_training_jobs(
    SortBy='CreationTime', SortOrder='Descending', MaxResults=1
)['TrainingJobSummaries']
# An account without any training job yields an empty list; report None instead of raising IndexError.
training_job_name = recent_jobs[0]['TrainingJobName'] if recent_jobs else None
print(f'training_jobs: {training_job_name}')

client: <botocore.client.SageMaker object at 0x7fa9b15ade80>
training_jobs: visualsearch-yolov5-2022-08-11-13-19-18-018


To stop the training job, uncomment and run the next cell.

In [50]:
# response=client.stop_training_job(TrainingJobName=training_job_name)
# print(f'response: {response}')

In [51]:
# Create an empty, time-stamped log file for this training run.
import time
dateStr = time.strftime("%Y-%m-%d--%H-%M-%S")
# Use a "log-train-" prefix so training logs can be told apart from the "log-detect-" logs written later.
log_train_file_name='log-train-'+dateStr+'.txt'
log_dir = '/home/sagemaker-user/river-blindness/outputs/log/'
# Make sure the log folder exists before the first write.
os.makedirs(log_dir, exist_ok=True)
with open(log_dir+log_train_file_name, 'w') as f: f.write('')

In [52]:
%%time
%%capture --no-stderr output

estimator.fit(inputs,wait=False)

with open('outputs/log/'+log_train_file_name, 'w') as f: f.write(output.stdout)

CPU times: user 29.7 ms, sys: 0 ns, total: 29.7 ms
Wall time: 299 ms


In [53]:
# Store what the training cell printed to stdout in this run's log file.
log_path = 'outputs/log/' + log_train_file_name
with open(log_path, 'w') as f:
    f.write(output.stdout)

In [None]:
%pwd

# Get Results from S3

In [54]:
%%time
# if session is ended, and you must re-run the notebook: uncomment this cell
import boto3

folder='visualsearch-yolov5-2022-08-11-14-24-40-772' # put the related folder here manually
model_output='s3://sagemaker-studio-jna7gyyxsih/yolo-output/'+folder+'/output/model.tar.gz'
m = re.search('^s3://([^/]+)/(.+)$', model_output)
source_bucket = m.group(1)
source_file = m.group(2)
target_file = "outputs/model.tar.gz"

s3 = boto3.resource('s3')
s3.meta.client.download_file(source_bucket, source_file, target_file)

tar = tarfile.open(target_file, "r:gz")
tar.extractall(path="outputs")
tar.close()


CPU times: user 4.17 s, sys: 1.74 s, total: 5.91 s
Wall time: 12.8 s


In [124]:
# m = re.search('^s3://([^/]+)/(.+)$', estimator.model_data)
# source_bucket = m.group(1)
# source_file = m.group(2)
# target_file = "outputs/model.tar.gz"

# s3 = boto3.resource('s3')
# s3.meta.client.download_file(source_bucket, source_file, target_file)

# tar = tarfile.open(target_file, "r:gz")
# tar.extractall(path="outputs")
# tar.close()

In [125]:
# estimator.model_data

# Make a Submission

The submission CSV must be semicolon-separated (not comma-separated!).

In [55]:
def getAllFiles(path,extension='jpg'):
    """Return the file names in ``path`` that match the glob pattern ``*.<extension>``.

    Side effect: the working directory of the process is switched to ``path``
    and is NOT restored afterwards. The returned entries are bare file names
    (relative to ``path``), in the order ``glob`` yields them.
    """
    import glob, os
    os.chdir(path)
    pattern = '*.' + extension
    return list(glob.iglob(pattern))

In [56]:
def emptyDetectFolder(detect_path):
    """Delete everything inside the given folder of the yolov5 checkout, keeping the folder itself.

    Parameters
    ----------
    detect_path : str
        Folder relative to ``/home/sagemaker-user/river-blindness/yolov5/``
        (for example ``'runs/detect/'``).

    Files and symlinks are unlinked, sub-folders are removed recursively.
    A failure to delete a single entry is printed and does not stop the
    clean-up. If the folder does not exist, a message is printed and the
    function returns without raising.
    """
    import os, shutil
    root_path='/home/sagemaker-user/'
    yolov5_path='river-blindness/yolov5/'
    folder = root_path + yolov5_path + detect_path
    if not os.path.isdir(folder):
        # Nothing to clean up; os.listdir() would raise FileNotFoundError here.
        print(f'Folder {folder} does not exist, nothing to empty.')
        return
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Deliberately best-effort: report the entry and carry on with the rest.
            print('Failed to delete %s. Reason: %s' % (file_path, e))

    print(f'Folder {folder} is empty now!')
    return

In [57]:
def runDetect(test_folder,imgsz=1024, conf_val=0.3):  
    """Run yolov5's detect.py once for every ``.jpg`` image found in ``test_folder``.

    Parameters
    ----------
    test_folder : str
        Folder with the images, relative to
        ``/home/sagemaker-user/river-blindness/yolov5/``.
    imgsz : int
        Value passed to detect.py as ``--img``.
    conf_val : float
        Value passed to detect.py as ``--conf``.

    The weights are always taken from
    ``/home/sagemaker-user/river-blindness/outputs/weights/best.pt``.
    Side effect: the working directory ends up in the yolov5 checkout.
    """
    root_path='/home/sagemaker-user/'
    test_path='river-blindness/yolov5/' # to get to test folder
    final_test_path=root_path+test_path+test_folder
    # getAllFiles() returns bare file names and also changes the working directory to final_test_path.
    imgs=getAllFiles(final_test_path,extension='jpg')
    # detect.py is invoked by relative name below, so switch into the yolov5 checkout first.
    %cd /home/sagemaker-user/river-blindness/yolov5
    w_p= root_path + 'river-blindness/outputs/weights/best.pt'# weights file handed to detect.py
    for i in imgs:
        test_img=final_test_path+ '/' +i
        print(test_img)
        # One detect.py process per image; {…} is interpolated by IPython without quoting.
        !python detect.py --weights {w_p} --img {imgsz} --conf {conf_val} --source {test_img}
    return

In [58]:
%pwd

'/home/sagemaker-user/river-blindness'

In [59]:
def find_all(name, path):
    """Return the full path of every file called ``name`` anywhere below ``path``.

    The list follows the traversal order of ``os.walk`` and is empty when no
    such file exists.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    ]

In [60]:
def collectDetectData(test_path):
    """Collect the detection outputs of all test images into a new time-stamped folder.

    For every ``.jpg`` name found in
    ``/home/sagemaker-user/river-blindness/content/test-images2/`` the
    annotated image and, if present, its ``<name>-xyxy.txt`` label file are
    looked up below ``yolov5/runs/detect/`` and copied into a new folder
    ``runs/detect/detect-<timestamp>/``. The folder is then zipped.

    Parameters
    ----------
    test_path : str
        Currently not used; kept so existing calls keep working.

    Returns
    -------
    str
        Path of the newly created folder, with a trailing slash.

    Images without any detection output are reported and skipped. A label
    file is only copied when one exists for that very image.
    """
    import datetime
    import shutil
    current_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    root_path='/home/sagemaker-user/'
    yolov5_path='river-blindness/yolov5/'
    test_img_p='/home/sagemaker-user/river-blindness/content/test-images2/'
    # getAllFiles() also switches the working directory to test_img_p.
    imgs=getAllFiles(test_img_p,extension='jpg')
    detect_path=root_path+yolov5_path+'runs/detect/'
    folder = 'detect-' + current_time
    path = os.path.join(detect_path, folder)
    os.mkdir(path)
    full_path=detect_path+folder+'/'
    print("Directory '% s' created " % full_path)
    for i in imgs:
        detected_images = find_all(i, detect_path)
        detected_labels = find_all(os.path.splitext(i)[0]+'-xyxy.txt', detect_path)
        if not detected_images:
            print(f'No detection output found for {i}, skipping it.')
            continue
        shutil.copy(detected_images[-1], full_path) #copying img to our folder
        if detected_labels:
            # Only copy a label file that belongs to this image.
            shutil.copy(detected_labels[-1], full_path) #copying labels to our folder
    # The archive name is relative, so the zip is written to the current working directory.
    shutil.make_archive(folder, 'zip', full_path)
    return full_path

In [61]:
def get_folder_size(path):
    """Return the total size of all files below ``path`` as a human-readable string.

    The size is summed over every file found by ``os.walk`` and formatted as
    ``"<n> bytes"`` below 1024 bytes, otherwise in KB, MB, GB or TB (powers of
    1024), rounded to two decimals.
    """
    size = 0
    # Separate loop names keep the ``path`` argument intact while walking.
    for folder, _subdirs, file_names in os.walk(path):
        for file_name in file_names:
            size += os.stat(os.path.join(folder, file_name)).st_size

    if size < 1024:
        return f"{size} bytes"
    for exponent, unit in enumerate(("KB", "MB", "GB"), start=1):
        if size < 1024 ** (exponent + 1):
            return f"{round(size / 1024 ** exponent, 2)} {unit}"
    # Anything of 1024**4 bytes or more is reported in TB instead of returning None.
    return f"{round(size / 1024 ** 4, 2)} TB"

In [62]:
def get_file_size(path):
    """Return the size of the file at ``path`` as a human-readable string.

    Sizes below 1024 bytes are formatted as ``"<n> bytes"``; larger sizes are
    given in KB, MB, GB or TB (powers of 1024), rounded to two decimals.
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"
    for exponent, unit in enumerate(("KB", "MB", "GB"), start=1):
        if size < 1024 ** (exponent + 1):
            return f"{round(size / 1024 ** exponent, 2)} {unit}"
    # Anything of 1024**4 bytes or more is reported in TB instead of returning None.
    return f"{round(size / 1024 ** 4, 2)} TB"

In [63]:
def getLatestFolder(path):
    """Return the name of the sub-folder of ``path`` with the largest ctime.

    Prints the folder's path, its total size, its last-modification time and
    the current time.

    Parameters
    ----------
    path : str
        Parent folder, with or without a trailing slash.

    Returns
    -------
    str
        Bare name of the selected sub-folder (no parent path, no slash).

    Raises
    ------
    ValueError
        If ``path`` contains no sub-folder.
    """
    import glob
    import datetime
    current_time = datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S')
    # The trailing '/' in the pattern restricts the matches to directories.
    candidates = glob.glob(os.path.join(path, '*/'))
    if not candidates:
        raise ValueError(f'No sub-folders found in {path}')
    latest_folder = max(candidates, key=os.path.getctime)
    print('Latest Modified folder: ')
    print(f'Path: {latest_folder}')
    print(f'Size: {get_folder_size(latest_folder)}')
    # Every match ends with a slash; drop it and keep the last path component,
    # which works whether or not ``path`` itself ends with a slash.
    folder = os.path.basename(latest_folder.rstrip('/'))
    file_date = datetime.datetime.fromtimestamp(os.path.getmtime(latest_folder))
    print("Last modif: %s" % file_date.strftime('%Y-%m-%d  %H:%M:%S'))
    print("Current Time: ", current_time)
    return folder

In [64]:
def createTestPredictionDf(label_path,img_list,detect_root='/home/sagemaker-user/river-blindness/yolov5/runs/detect/'):
    """Build the submission table from the ``*-xyxy.txt`` detection files of one run.

    Parameters
    ----------
    label_path : str
        Run folder below ``detect_root`` that holds the detection files,
        including its trailing slash (for example ``'detect-2022-08-11/'``).
    img_list : iterable of str
        Image file names. The last four characters of each name are replaced
        by ``-xyxy.txt`` to locate that image's detection file; images
        without such a file contribute no rows.
    detect_root : str, optional
        Folder that contains the run folders, with a trailing slash.

    Returns
    -------
    pandas.DataFrame
        One row per detection, indexed by ``section_id``
        (``<image>@<xmin>-<xmax>-<ymin>-<ymax>``), with the columns
        ``file_name, xmin, ymin, xmax, ymax, detection_score``. The frame is
        empty, but has these columns, when nothing was detected.

    The frame is also written into the run folder as
    ``results_submission.csv`` (semicolon-separated) and
    ``results_with_comma.csv`` (comma-separated). Only the detection text
    files are read; the images themselves are not opened.
    """
    run_dir = detect_root + label_path
    # Column order of the written CSV files.
    output_columns = ['section_id', 'file_name', 'xmin', 'ymin', 'xmax', 'ymax', 'detection_score']
    predictions = []
    for img in img_list:
        txt_path = run_dir + img[:-4] + '-xyxy.txt'
        if not os.path.exists(txt_path):
            continue
        try:
            detection_df = pd.read_table(txt_path, delimiter=' ', header=None)
        except pd.errors.EmptyDataError:
            # An empty detection file simply means no boxes for this image.
            continue
        for _, row in detection_df.iterrows():
            # Columns 1-4 are read as xmin, ymin, xmax, ymax; column 5 as the score.
            xmin = int(row[1])
            ymin = int(row[2])
            xmax = int(row[3])
            ymax = int(row[4])
            predictions.append(dict(
                section_id=f'{img}@{xmin}-{xmax}-{ymin}-{ymax}',
                file_name=img,
                xmin=xmin,
                ymin=ymin,
                xmax=xmax,
                ymax=ymax,
                detection_score=row[5]
            ))

    # Passing the columns explicitly keeps 'section_id' available even without any detection.
    prediction_df = pd.DataFrame(predictions, columns=output_columns).set_index('section_id')
    prediction_df.to_csv(os.path.join(run_dir, 'results_submission.csv'), sep=';')
    prediction_df.to_csv(os.path.join(run_dir, 'results_with_comma.csv'), sep=',')
    return prediction_df

In [65]:
# def resizeImage(img_path,resized_folder,resized_name,new_width=1600):
#     import cv2
#     from matplotlib import pyplot as plt
#     import imutils    
#     img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
#     resized = imutils.resize(img, width=new_width)    
#     resized_path=resized_folder+resized_name
#     cv2.imwrite(resized_path, resized)
#     print(f'Original image path: {img_path}')
#     print(f'Resized image path: {resized_path}')
#     print('Resized Dimensions : ',resized.shape)
#     print('Original Dimensions : ',img.shape)
#     # print(f'original image size: {get_file_size(img_path)}')
#     # print(f'Resized image size: {get_file_size(resized_path)}')
#     # plt.imshow(resized)
#     # resized_img = cv2.imread(resized_path)
#     # plt.imshow(resized_img)
#     # plt.show()
#     print(20*'*')
#     return

In [66]:
# img_folder='/home/sagemaker-user/river-blindness/content/test-images/'
# resized_folder='/home/sagemaker-user/river-blindness/content/test-images2/'
# [resizeImage(img_folder+i,resized_folder,i,new_width=1600) for i in test_imgs]

In [67]:
def emptyFolder(parent_dir,folder_name,root_path='/home/sagemaker-user/'):
    """Remove every entry inside ``root_path + parent_dir + folder_name``, keeping the folder.

    The three arguments are concatenated as plain strings, so ``root_path``
    and ``parent_dir`` need their trailing slash. Real sub-folders are deleted
    recursively; files and symlinks are unlinked. An entry that cannot be
    deleted is reported on stdout and skipped.
    """
    import shutil
    folder = root_path + parent_dir + folder_name
    print(folder)
    for entry in os.listdir(folder):
        target = os.path.join(folder, entry)
        try:
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                os.unlink(target)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (target, err))

    print(f'Folder {folder} is empty now!')
    return

In [68]:
# def reduce_quality(image_path, storage_folder, quality=50, add_qulity_to_name=False):
#     import cv2
#     os.makedirs(storage_folder, exist_ok=True)

#     split_realpath = os.path.split(os.path.relpath(image_path))
#     image_name = split_realpath[len(split_realpath)-1]
#     image_name_without_ext = os.path.splitext(image_name)[0]
#     image_ext = os.path.splitext(image_name)[1]
  
#     os.path.getsize(image_path)
#     # Open the image by specifying the image path.
#     image_file = Image.open(image_path)
#     im = cv2.imread(image_path)
#     print(f"reducing quality of image: {image_name} to {quality}%")
#     # print(f"Original image size: {get_file_size(image_path)}")
#     # print(f"Original image shape: {im.shape}")


#     # the default
#     if add_qulity_to_name:
#         new_image_name = str(quality) + 'q_' + image_name_without_ext + image_ext
#     else:
#         new_image_name = image_name_without_ext + image_ext

#     new_image_path = os.path.join(storage_folder, new_image_name)
#     image_file.save(new_image_path, quality=quality)
#     im = cv2.imread(new_image_path)

#     # print(f"Processed image size with quality {quality}%: {get_file_size(new_image_path)}")
#     # print("Processed image shape: " + str(im.shape))
#     # print(15*"--")
    
#     return

In [140]:
# emptyFolder('river-blindness/yolov5/test/','images')
# root_path='/home/sagemaker-user/'
# test_storage_folder=root_path+'river-blindness/yolov5/test/images/'
# test_folder=root_path+'river-blindness/content/test-images/'
# test_imgs=getAllFiles(test_folder,extension='jpg')

# qual=10
# [reduce_quality(test_folder+i, test_storage_folder, quality=qual,add_qulity_to_name=False) for i in test_imgs]

In [69]:
%%time
%%capture --no-stderr outputDetect

# Full detection pipeline. Everything printed here is captured into `outputDetect`
# and written to the log file by the following cell.
import time 
dateStr = time.strftime("%Y-%m-%d--%H-%M-%S")
log_detect_file_name='log-detect-'+dateStr+'.txt'
# Create an empty, time-stamped log file for this detection run.
with open('/home/sagemaker-user/river-blindness/outputs/log/'+log_detect_file_name, 'w') as f: f.write('')

root_path='/home/sagemaker-user/'
yolov5_path='river-blindness/yolov5/'
test_folder='test/images/'
detect_path='runs/detect/'

# Remove the outputs of earlier detection runs.
emptyDetectFolder(detect_path)

# Run detect.py for every image in yolov5/test/images/ with --img 2000.
runDetect(test_folder,imgsz=2000)

# NOTE: collectDetectData() does not read its argument; it lists the images of its own hard-coded folder.
collectDetectData(test_folder)
# Pick the sub-folder of runs/detect/ with the largest ctime.
label_path=getLatestFolder(root_path+yolov5_path+detect_path)
test_imgs=getAllFiles(root_path+yolov5_path+detect_path+label_path,extension='jpg')

df_prediction=createTestPredictionDf(label_path+'/', test_imgs)
# NOTE(review): this cell runs under %%capture, so the display below is presumably swallowed — confirm.
df_prediction

CPU times: user 47.1 s, sys: 4.44 s, total: 51.6 s
Wall time: 29min 41s


In [70]:
# Store what the detection cell printed to stdout in this run's log file.
detect_log_path = '/home/sagemaker-user/river-blindness/outputs/log/' + log_detect_file_name
with open(detect_log_path, 'w') as f:
    f.write(outputDetect.stdout)