# Part - I: Dealing with Old Data (Any data which isn't from HPCL project)

### Basic Common Stuff
* Installing libraries
* Importing libraries
* Defining functions

In [None]:
!pip install XlsxWriter

Collecting XlsxWriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter
Successfully installed XlsxWriter-3.2.0


In [None]:
# Importing libraries
import openpyxl
import xlsxwriter
import pathlib


import pandas as pd
import numpy as np
import json

import os.path
import base64
import io
from io import BytesIO
from urllib.request import urlopen
import re
import glob
from PIL import Image, ImageDraw, ImageOps

# Date folder names processed when no single `date_name` is configured; each
# one is joined onto the main directory path and also used to build a
# worksheet name ("date_<name>") by the driver cell further down.
ALL_DATE_NAMES = ["23.2.6", "22.12.7", "23.2.28", "23.4.9", "23.1.6", "23.5.5"]
# ALL_DATE_NAMES = ['23.2.6']

def add_information_for_specific_table(table_name, table_address, date_name,
                                       table_information_dict, save_file_address,
                                       worksheet_name):
  """
  Collect the images and COCO annotations of one table folder and append one
  row per annotation to a worksheet of the analysis Excel file.

  The table folder is expected to contain the tile images (.jpg / .png) and an
  'output' folder in which `search_for_coco` looks for the COCO file.

  Args:
      table_name (str): The name of the table being processed. It must contain
      the table number (e.g. "table_10"); the first run of digits is used as
      the key into `table_information_dict`.
      table_address (str): The file path to the directory containing the
      table's data.
      date_name (str): The date associated with the table's data.
      table_information_dict (dict): Maps table number -> {"last_row": int}.
      The "last_row" entry of the processed table is updated in place.
      save_file_address (str): Path of the Excel file WITHOUT the ".xlsx"
      extension (the writer appends it).
      worksheet_name (str): The name of the worksheet in the Excel file where
      the table's data will be written.

  Returns:
      None. Errors raised while reading annotations or writing the Excel file
      are printed, not raised.
  """
  all_images = []
  all_image_names = []
  coco_file_found = False
  coco_file = None

  # Get a list of all entries in the table directory
  specific_table_content = list(glob.iglob(os.path.join(table_address, "*")))

  for item_address in specific_table_content:
      item_name = os.path.basename(item_address)
      lowered_name = item_name.lower()

      # Decide on the extension of the entry NAME only. Searching the whole
      # path for "jpg"/"png" would also match parent folders or unrelated
      # files and then fail inside Image.open.
      if lowered_name.endswith((".jpg", ".png")):
          all_image_names.append(item_name)

          # Load the image and apply its EXIF orientation
          image = Image.open(item_address)
          image = ImageOps.exif_transpose(image)
          all_images.append(image)

      # The 'output' folder is where the COCO annotation file is searched for
      elif lowered_name == 'output':
          coco_file_found, coco_file = search_for_coco(item_address)

          if coco_file_found:
              print("COCO file found")
              if not check_categories(coco_file):
                  print("Didn't find correct categories in COCO file")
          else:
              print("COCO file not found")

  # Without annotations nothing can be written; stop here instead of failing
  # later on an unbound `coco_file`.
  if not coco_file_found:
      print(f"Skipping {table_name} ({date_name}): no COCO annotations available")
      return None

  # sorting image names and images with the same permutation
  sorting_order = np.argsort(all_image_names)
  all_image_names = np.array(all_image_names)[sorting_order]
  all_images = sort_images(all_images, sorting_order)

  main_dict = initialise_main_dict()

  try:
      # from all collected images, go into each single image and extract
      # annotations for that image
      for idx, current_image_name in enumerate(all_image_names):
          current_image_id = get_id(coco_file, current_image_name)
          current_bboxes, current_segs, current_classes, current_areas = get_coco_annotations_for_image_id(coco_file, current_image_id)

          # The 'ref' label is required to convert areas; stop collecting
          # further images when it is missing. Rows gathered so far are still
          # written below.
          if "ref" not in [i.lower() for i in current_classes]:
              print(f"No ref label found")
              break

          # pad the bounding boxes, crop the image around every annotation and
          # shrink the crops before they are embedded in the excel file
          current_bboxes = change_bboxes(current_bboxes)
          without_mask_images, with_mask_images, areas = get_final_images_and_area(all_images[idx],
                                                                                    current_bboxes,
                                                                                    current_segs,
                                                                                    current_classes,
                                                                                    current_areas)
          resize_images(with_mask_images)
          resize_images(without_mask_images)

          # one excel row per annotation of this image
          for final_idx, _ in enumerate(current_classes):
              main_dict['date'].append(date_name)
              main_dict['table'].append(table_name)
              main_dict['tile'].append(idx+1)
              main_dict['name'].append(current_classes[final_idx])
              main_dict['without_mask'].append(without_mask_images[final_idx])
              main_dict['with_mask'].append(with_mask_images[final_idx])
              main_dict['area'].append(areas[final_idx])

      table_number = int(re.findall(r"\d+", table_name)[0])

      last_row = table_information_dict[table_number]["last_row"]
      last_row = main_dict_to_excel_openpyxl(main_dict,
                                             last_row,
                                             save_file_address,
                                             worksheet_name)
      table_information_dict[table_number]["last_row"] = last_row

  except Exception as e:
      # Best effort: report the problem and carry on with the next table.
      print("\nERROR:", e, "\n")
  return None

def read_coco_file(coco_file_address):
    """
    Load a JSON annotation file from disk.

    Args:
        coco_file_address (str): Path of the JSON file to read.

    Returns:
        The parsed JSON content as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails.
    with open(coco_file_address) as open_file:
        return json.load(open_file)

def search_for_coco(file_address):
  """
  Search a table's 'output' folder for a COCO annotation file and load it.

  If the folder contains an 'annotation output' sub-folder (case-insensitive),
  that sub-folder is searched instead. A file counts as a COCO file when its
  name contains "coco" (case-insensitive); when several match, the
  alphabetically first one is used so the choice is reproducible.

  Args:
      file_address (str): The file path to the output directory.

  Returns:
      tuple: (coco_file_found, coco_file). `coco_file` is the parsed file
      content, or None when no file was found or it could not be read.
  """
  output_content = list(glob.iglob(os.path.join(file_address, "*")))
  annotation_output = [i for i in output_content
                       if os.path.basename(i).lower() == 'annotation output']
  if annotation_output:
      output_content = list(glob.iglob(os.path.join(annotation_output[0], "*")))

  coco_candidates = sorted(i for i in output_content
                           if "coco" in os.path.basename(i).lower())
  if not coco_candidates:
      return False, None

  try:
      coco_file = read_coco_file(coco_candidates[0])
  except (OSError, ValueError) as error:
      # Unreadable or malformed file (JSON decode errors are ValueErrors):
      # report it and treat the file as missing, but let unrelated errors
      # (e.g. KeyboardInterrupt) propagate.
      print(f"Could not read COCO file {coco_candidates[0]}: {error}")
      return False, None
  return True, coco_file

def initialise_main_dict():
  """
  Build the empty column store that is later written to the excel file.

  Returns:
      dict: One independent empty list per column key, in writing order.
  """
  column_keys = ('date', 'table', 'tile', 'name',
                 'without_mask', 'with_mask', 'area')
  return {column_key: [] for column_key in column_keys}

def write_initial_header(name_of_file, worksheet_name):
  """
  Make sure the worksheet exists in the given excel file and write the
  column titles into its first row.

  If the file already exists it is opened with openpyxl and the worksheet is
  reused when present (creating it again would add a second, suffixed sheet on
  every re-run). Otherwise a new file is created with xlsxwriter.

  Args:
      name_of_file (str): Path of the excel file WITHOUT the ".xlsx" extension.
      worksheet_name (str): The name of the worksheet to write the header to.
  """
  file_path = f"{name_of_file}.xlsx"
  header_titles = ["Date", "Table", "Tile", "Name", "Original", "With Mask", "Area"]

  if os.path.exists(file_path):
      wb = openpyxl.load_workbook(file_path)
      try:
          if worksheet_name in wb.sheetnames:
              ws = wb[worksheet_name]
          else:
              ws = wb.create_sheet(worksheet_name)

          # openpyxl rows/columns are 1-based
          for column_number, title in enumerate(header_titles, start=1):
              ws.cell(row=1, column=column_number).value = title

          wb.save(file_path)
      finally:
          wb.close()
  else:
      # The file does not exist yet: create it with xlsxwriter
      # (rows/columns are 0-based here).
      workbook = xlsxwriter.Workbook(file_path)
      worksheet = workbook.add_worksheet(worksheet_name)

      for column_index, title in enumerate(header_titles):
          worksheet.write(0, column_index, title)

      workbook.close()


def main_dict_to_excel_openpyxl(main_dict, last_row, name_of_file, worksheet_name):
  """
  Append the entries of `main_dict` to a worksheet of an existing excel file.

  Text columns are written as cell values; every other column is treated as
  an image that is embedded at the matching cell.

  Args:
      main_dict (dict): Column key -> list of values, all lists equally long.
      last_row (int): Number of data rows already present in the worksheet.
      name_of_file (str): Path of the excel file WITHOUT the ".xlsx" extension.
      worksheet_name (str): Worksheet to write to (must already exist).

  Returns:
      int: The updated number of data rows in the worksheet.
  """
  file_path = f'{name_of_file}.xlsx'
  wb = openpyxl.load_workbook(file_path)
  ws = wb[worksheet_name]

  text_columns = ["date", 'table', 'tile', 'name', 'area']
  number_entries = len(main_dict["date"])

  for entry_idx in range(number_entries):
      # row 1 holds the header, so data starts at row 2 (+ rows already used)
      sheet_row = entry_idx + 2 + last_row
      for col_number, title in enumerate(main_dict.keys(), start=1):
          value = main_dict[title][entry_idx]
          if title in text_columns:
              ws.cell(row=sheet_row, column=col_number).value = value
              continue

          # Image column: serialise to PNG in memory and embed it.
          png_buffer = io.BytesIO()
          value.save(png_buffer, format='png')
          picture = openpyxl.drawing.image.Image(Image.open(png_buffer))
          picture.height = 20
          picture.width = 64
          # chr(64 + n) turns column number n into its letter (1 -> "A")
          picture.anchor = chr(64+col_number) + str(sheet_row)
          ws.add_image(picture)

  wb.save(file_path)
  return last_row + number_entries

def query_search(query_id):
    """
    List every file/folder directly inside a Google Drive folder.

    Relies on a module-level Drive API client named `service`.

    Args:
        query_id (str): Drive id of the parent folder.

    Returns:
        list[dict]: One dict per child with keys 'id' and 'name'; names are
        lower-cased and stripped of surrounding whitespace.
    """
    files = []
    page_token = None
    while True:
        # `nextPageToken` has to be requested in `fields` and sent back as
        # `pageToken`; otherwise only the first page of results is returned.
        response = service.files().list(q=f"'{query_id}' in parents",
                                        spaces='drive',
                                        fields='nextPageToken, files(id, name)',
                                        pageToken=page_token).execute()
        page_files = response.get('files', [])
        for file in page_files:
            # normalise names so later comparisons are case-insensitive
            file["name"] = file['name'].lower().strip()
        files.extend(page_files)
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break
    return files

def download_gdrive_file(fileId):
    """
    Download a Google Drive file into memory.

    Relies on the module-level names `service` and `MediaIoBaseDownload`.

    Args:
        fileId (str): Drive id of the file to download.

    Returns:
        io.BytesIO: Buffer the downloader wrote the file content into.
    """
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, service.files().get_media(fileId=fileId))
    # Fetch chunk after chunk until the downloader stops reporting `False`.
    while True:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}.")
        if done is not False:
            break
    return buffer

def check_categories(coco_file):
    """
    Tell whether the COCO file declares a 'coral_a' category.

    Args:
        coco_file (dict): Parsed COCO file with a "categories" list whose
        entries have a 'name'.

    Returns:
        bool: True if any category name equals 'coral_a' (case-insensitive).
    """
    lowered_names = [category['name'].lower() for category in coco_file["categories"]]
    return 'coral_a' in lowered_names

def get_id(coco_file, current_image_name):
    """
    Look up the COCO image id that belongs to a file name.

    Args:
        coco_file (dict): Parsed COCO file with an 'images' list.
        current_image_name (str): File name to look for (exact match).

    Returns:
        The "id" of the first image entry whose 'file_name' matches.

    Raises:
        IndexError: If no image entry has that file name.
    """
    matching_images = [
        image_entry
        for image_entry in coco_file['images']
        if image_entry['file_name'] == current_image_name
    ]
    first_match = matching_images[0]
    return first_match["id"]

def get_coco_annotations_for_image_id(coco_file, current_image_id):
    """
    Gather everything the COCO file knows about one image.

    Bounding boxes are recomputed from the segmentation polygons instead of
    being read from the annotations' own 'bbox' field.

    Args:
        coco_file (dict): Parsed COCO file ('annotations' and 'categories').
        current_image_id: Id of the image whose annotations are wanted.

    Returns:
        tuple: (bboxes, segmentations, classes, areas), index-aligned lists
        with one entry per annotation of the image.
    """
    image_annotations = []
    for annotation in coco_file['annotations']:
        if annotation['image_id'] == current_image_id:
            image_annotations.append(annotation)

    segmentations = [annotation['segmentation'] for annotation in image_annotations]
    category_ids = [annotation["category_id"] for annotation in image_annotations]

    # translate every category id into its name (first matching category)
    classes = []
    for category_id in category_ids:
        matching_names = [category["name"]
                          for category in coco_file['categories']
                          if category["id"] == category_id]
        classes.append(matching_names[0])

    bboxes = create_bboxes_from_segs(segmentations)
    areas = [annotation["area"] for annotation in image_annotations]
    return bboxes, segmentations, classes, areas

def create_bboxes_from_segs(segs):
    """
    Compute an axis-aligned bounding box for every segmentation.

    Only the first polygon of each segmentation (`seg[0]`, a flat
    [x0, y0, x1, y1, ...] sequence) is considered.

    Args:
        segs (list): Segmentations, each a list whose first item is the flat
        coordinate sequence.

    Returns:
        list: One [x_min, y_min, width, height] list per segmentation.
    """
    bboxes = []
    for seg in segs:
        outline = seg[0]
        # x values sit at even positions, y values at odd ones; a trailing
        # unpaired value is ignored.
        xs = outline[0:-1:2]
        ys = outline[1::2]

        # The leading sentinels are the starting values of the search.
        x_min = min([10000000, *xs])
        y_min = min([10000000, *ys])
        x_max = max([-1, *xs])
        y_max = max([-1, *ys])

        bboxes.append([x_min, y_min, x_max - x_min, y_max - y_min])
    return bboxes

def change_bboxes(bboxes, padding=100):
    """
    Grow every bounding box by `padding` pixels on each side.

    The input boxes are left untouched; new lists are returned so that calling
    the function again on the same data does not pad the boxes twice.

    Args:
        bboxes (list): Boxes in [x, y, width, height] form.
        padding (int): Margin added on every side. Defaults to 100.

    Returns:
        list: New [x - padding, y - padding, width + 2*padding,
        height + 2*padding] boxes. Coordinates may become negative for boxes
        close to the image border.
    """
    new_bboxes = []
    for bbox in bboxes:
        padded_bbox = list(bbox)  # copy: do not modify the caller's box
        padded_bbox[0] -= padding
        padded_bbox[1] -= padding
        padded_bbox[2] += 2 * padding
        padded_bbox[3] += 2 * padding
        new_bboxes.append(padded_bbox)
    return new_bboxes

def get_final_images_and_area(image, bboxes, segmentations, classes, areas):
    """
    Produce the per-annotation crops (with and without mask overlay) and the
    converted areas for one image.

    Args:
        image: The full image.
        bboxes (list): One [x, y, width, height] box per annotation.
        segmentations (list): Segmentations drawn as the mask overlay.
        classes (list): Class name per annotation.
        areas (list): Area per annotation, as stored in the annotations.

    Returns:
        tuple: (crops without mask, crops with mask, converted areas).
    """
    masked_image = layer_image_with_mask(image, segmentations)

    # the same boxes are cut out of the plain and of the masked image
    plain_crops = [crop_image(image, box) for box in bboxes]
    masked_crops = [crop_image(masked_image, box) for box in bboxes]

    return plain_crops, masked_crops, calculate_areas(classes, areas)

def crop_image(image, bbox):
    """
    Cut the region described by an [x, y, width, height] box out of an image.

    Args:
        image: Object offering ``crop((left, top, right, bottom))``.
        bbox: Four values: left, top, width, height.

    Returns:
        Whatever ``image.crop`` returns for that region.
    """
    left, top, width, height = bbox
    right = left + width
    bottom = top + height
    return image.crop((left, top, right, bottom))

def layer_image_with_mask(image, segmentations):
    """
    Overlay the segmentation polygons on an image.

    Every polygon is filled on a copy of the image, which is then blended
    50/50 with the original.

    Args:
        image: PIL image.
        segmentations (list): Segmentations; only the first polygon of each
        (`seg[0]`, flat [x0, y0, x1, y1, ...]) is drawn.

    Returns:
        The blended RGBA image.
    """
    base = image.convert('RGBA')
    overlay = base.copy()
    painter = ImageDraw.Draw(overlay)

    for seg in segmentations:
        outline = seg[0]
        # pair up x (even positions) and y (odd positions) values
        vertices = list(zip(outline[0:-1:2], outline[1::2]))
        painter.polygon(vertices, fill = (255, 255, 0))

    return Image.blend(base, overlay, 0.5)


def calculate_areas(classes, areas, reference_area_cm2=25):
    """
    Rescale annotation areas using the 'ref' annotation as the unit of measure.

    Each area is multiplied by `reference_area_cm2` and divided by the area of
    the first annotation whose class is 'ref' (case-insensitive).

    Args:
        classes (list[str]): Class name per annotation.
        areas (list): Area per annotation, index-aligned with `classes`.
        reference_area_cm2 (number): Real-world area assigned to the reference
        object. Defaults to 25, the value that was previously hard-coded
        (presumably cm^2 -- TODO confirm the physical size of the reference).

    Returns:
        list: Rescaled areas, index-aligned with the input.

    Raises:
        IndexError: If no annotation is labelled 'ref'.
        ZeroDivisionError: If the reference annotation has area 0.
    """
    ref_idx = [idx for idx, i in enumerate(classes) if i.lower()=='ref'][0]
    ref_area = areas[ref_idx]
    cm_2_areas = [i * reference_area_cm2/ref_area for i in areas]
    return cm_2_areas

def sort_images(list_of_images, order):
    """
    Reorder images with an argsort-style index array.

    `order[k]` is the index (into `list_of_images`) of the image that belongs
    at position k. This is the same convention as ``names[order]``, so the
    returned images stay aligned with names sorted by the same `order`.
    Sorting (order[k], image[k]) pairs instead would apply the inverse
    permutation and mismatch images and names.

    Args:
        list_of_images (list): PIL images.
        order: Sequence of integer indices, e.g. the result of ``np.argsort``.

    Returns:
        list: New PIL images (rebuilt from the pixel arrays) in sorted order.
    """
    # array_images = np.array(array_images, dtype=object)[order]
    # return [Image.fromarray(i.astype(np.uint8)) for i in array_images]
    return [Image.fromarray(np.asarray(list_of_images[i])) for i in order]

def resize_images(images_list):
    """
    Shrink every image in place so it fits into a 256x256 box.

    Args:
        images_list (list): Objects offering ``thumbnail(size)``.

    Returns:
        None. The images themselves are modified.
    """
    target_size = (256, 256)
    for image in images_list:
        image.thumbnail(target_size)

### Defining Custom Variables

In [None]:
# Root folder; the date folder name is joined directly onto this path.
main_dir_address = "/content/drive/MyDrive/Projects/Coral Microfragmentation/Coral Monitoring/22-23 Season/" #@param {type:"string"}
# Single date folder to process. Leave empty to process every entry of
# ALL_DATE_NAMES into one workbook (one worksheet per date).
date_name = "" #@param {type:"string"}
# Optional path component(s) inserted between the date folder and the table folder.
intermediary_folder_path = "" #@param {type:"string"}
# N of the "table_N" folder to analyse.
table_number = 10 #@param {type:"integer"}

### Fixed Variables Being Defined and Running the Main Function

In [None]:
# Build index-aligned lists with one entry per worksheet to write:
# date, table name, table folder and worksheet name.
if date_name:
    date_names = [date_name]
    table_names = ["table_" + str(table_number)]
    table_folder_addresses = [os.path.join(main_dir_address, date_name, intermediary_folder_path, table_names[0])]
    save_file_name = "Analysis-Excel"
    save_file_name_extension = save_file_name + '.xlsx'
    save_file_path = os.path.join(table_folder_addresses[0], "analysis_file", save_file_name)
    worksheet_names = ["Main Analysis"]
else:
    print('Create files across all dates for given table...')
    date_names = list(ALL_DATE_NAMES)
    table_names = ["table_" + str(table_number)] * len(date_names)
    table_folder_addresses = []
    worksheet_names = []
    # The loop variable is deliberately NOT called `date_name`: reusing that
    # name would overwrite the configured value, label every worksheet with
    # the last date, and make a re-run of this cell take the other branch.
    for current_date_name in date_names:
        table_folder_addresses.append(
            os.path.join(main_dir_address, current_date_name, intermediary_folder_path, table_names[0])
        )
        worksheet_names.append(f"date_{current_date_name}")
    save_file_name = f"Analysis-Excel_table_{str(table_number)}"
    save_file_name_extension = save_file_name + '.xlsx'
    save_file_path = os.path.join(main_dir_address, "analysis_files", save_file_name)

# The output folder is the same for every worksheet, so create it once.
pathlib.Path(save_file_path).parent.mkdir(parents=True, exist_ok=True)

for idx, table_name in enumerate(table_names):
    print(f'Progress: {(idx+1)/len(table_names)}')

    table_folder_address = table_folder_addresses[idx]
    worksheet_name = worksheet_names[idx]

    # Every worksheet starts empty, so the row counter restarts at 0.
    table_information_dict = {table_number: {"last_row": 0}}
    write_initial_header(save_file_path, worksheet_name)
    add_information_for_specific_table(table_name, table_folder_address, date_names[idx], table_information_dict, save_file_path, worksheet_name)

Create files across all dates for given table...
Progress: 0.16666666666666666
COCO file found
Progress: 0.3333333333333333
COCO file found
Progress: 0.5
COCO file found
Progress: 0.6666666666666666
COCO file found
Progress: 0.8333333333333334
COCO file found
Progress: 1.0
COCO file found


### Rough

In [None]:
# Initialising the needed files and the per-table row counters.
# NOTE(review): scratch cell. `starting_table_number` / `ending_table_number`
# are not defined in the visible part of the notebook, and
# `write_initial_header` as defined above requires a second `worksheet_name`
# argument, so this call would raise TypeError -- confirm before running.

table_information_dict = dict()
for i in range(starting_table_number, ending_table_number+1):
    table_information_dict[i] = {"last_row":0}
    write_initial_header("Analysis-Excel"+str(i))

In [None]:
# defining custom variables
# Full set of date folders / table folders to look at.
dates_to_check = ["22.12.7", "23.1.6", "23.2.6", "23.2.28", "23.4.9", "23.5.5"]
tables_to_check = ["table_4", "table_5", "table_6", "table_7", "table_8", "table_9", "table_10"]

# These assignments override the full lists above (single date / single table).
dates_to_check = ["23.1.6"]
tables_to_check = ["table_4"]

In [None]:
# Scratch cell: walk the Drive folder tree and list the content of every
# selected table folder. The same code is the first part of the next cell.
# NOTE(review): `query_search` relies on a global `service` that is not
# created in the visible part of the notebook.
monitoring_content = query_search("1fdjJD2nMX9V-8ddP4FjqpzoD-KnCI71P")
for date_folder in monitoring_content:
    if date_folder['name'] in dates_to_check:

        current_date_id = date_folder['id']
        date_content = query_search(current_date_id)

        date_content_names = [i["name"] for i in date_content]
        date_content_ids = [i["id"] for i in date_content]
        # Locate the folder holding the tables. Tried in order:
        #   1. 'new tables'
        #   2. 'coral tables' -> 'new tables'
        #   3. 'post cleaning'
        #   4. fall back to the date folder itself
        # NOTE(review): the bare `except:` clauses also hide unrelated errors.
        try:
            idx_new_tables = date_content_names.index('new tables')
            id_new_tables = date_content_ids[idx_new_tables]
            new_table_content = query_search(id_new_tables)
        except:
            try:
                idx_new_tables = date_content_names.index('coral tables')
                id_new_tables = date_content_ids[idx_new_tables]
                new_table_content = query_search(id_new_tables)
                new_table_content = query_search([i["id"] for i in new_table_content if i['name'] == 'new tables'][0])
            except:
                try:
                    idx_new_tables = date_content_names.index('post cleaning')
                    id_new_tables = date_content_ids[idx_new_tables]
                    new_table_content = query_search(id_new_tables)
                except:
                    new_table_content = date_content
        for table_folder in new_table_content:
            # accept both "table 4" and "table_4" spellings
            if (table_folder["name"] in tables_to_check) or (table_folder["name"].replace(" ", "_") in tables_to_check):
                print(f"---Checking for: Date-{date_folder['name']}, Table-{table_folder['name']}---")
                specific_table_content = query_search(table_folder['id'])

In [None]:
# Scratch cell: Drive-based variant of the analysis. For every selected date
# and table it downloads the images and the COCO file, and fills `main_dict`
# with one entry per annotation. The excel write is commented out below.
# NOTE(review): depends on names not defined in the visible part of the
# notebook (`service`, `MediaIoBaseDownload`, `literal_eval`) and on
# `table_information_dict` from the scratch cell above.
monitoring_content = query_search("1fdjJD2nMX9V-8ddP4FjqpzoD-KnCI71P")
for date_folder in monitoring_content:
    if date_folder['name'] in dates_to_check:

        current_date_id = date_folder['id']
        date_content = query_search(current_date_id)

        date_content_names = [i["name"] for i in date_content]
        date_content_ids = [i["id"] for i in date_content]
        # Locate the folder holding the tables. Tried in order:
        # 'new tables', 'coral tables' -> 'new tables', 'post cleaning',
        # else the date folder itself.
        try:
            idx_new_tables = date_content_names.index('new tables')
            id_new_tables = date_content_ids[idx_new_tables]
            new_table_content = query_search(id_new_tables)
        except:
            try:
                idx_new_tables = date_content_names.index('coral tables')
                id_new_tables = date_content_ids[idx_new_tables]
                new_table_content = query_search(id_new_tables)
                new_table_content = query_search([i["id"] for i in new_table_content if i['name'] == 'new tables'][0])
            except:
                try:
                    idx_new_tables = date_content_names.index('post cleaning')
                    id_new_tables = date_content_ids[idx_new_tables]
                    new_table_content = query_search(id_new_tables)
                except:
                    new_table_content = date_content
        for table_folder in new_table_content:
            # accept both "table 4" and "table_4" spellings
            if (table_folder["name"] in tables_to_check) or (table_folder["name"].replace(" ", "_") in tables_to_check):
                print(f"---Checking for: Date-{date_folder['name']}, Table-{table_folder['name']}---")
                specific_table_content = query_search(table_folder['id'])

                all_images = []
                all_image_names = []
                coco_file_found = False
                for item in specific_table_content:

                    # NOTE(review): the alternation makes this match either
                    # ".*jpg" or a bare "png"; for a png file `image_name`
                    # becomes just "png" -- confirm this is intended.
                    potential_image = re.findall(".*jpg|png", item['name'])
                    if len(potential_image)>0:
                        image_name = potential_image[0]
                        all_image_names.append(image_name)
                        image_id = item['id']

                        #download image
                        file = download_gdrive_file(image_id)
                        image = Image.open(io.BytesIO(file.getvalue()))
                        image = ImageOps.exif_transpose(image)
                        all_images.append(image)

                    elif item['name'] == 'output':
                        output_content = query_search(item["id"])
                        annotation_id = [i["id"] for i in output_content if i["name"]=="annotation output"][0]
                        annotation_content = query_search(annotation_id)
                        try:
                            coco_file_id = [i["id"] for i in annotation_content if "coco" in i["name"]][0]
                        except:
                            print(f"coco file not found for {date_folder['name']}, {table_folder['name']}")
                            # leaves the item loop; remaining items of this table are not visited
                            break


                        # download coco file
                        file = download_gdrive_file(coco_file_id)
                        coco_file = literal_eval(file.getvalue().decode("utf-8"))
                        bool_cat = check_categories(coco_file)
                        coco_file_found = True
                        if not bool_cat:
                            print(("Didn't find correct categories in COCO file of "
                                   f"{date_folder['name']}-{table_folder['name']}"))

                # skip this table when no COCO file could be loaded
                if not coco_file_found:
                    print(f"coco file not found for {date_folder['name']}, {table_folder['name']}")
                    continue

                # bring names and images into the same (name-sorted) order
                sorting_order = np.argsort(all_image_names)
                all_image_names = np.array(all_image_names)[sorting_order]
                all_images = sort_images(all_images, sorting_order)


                main_dict = {
                    'date':[],
                    'table':[],
                    'tile':[],
                    'name':[],
                    'without_mask':[],
                    'with_mask':[],
                    'area':[]
                }
                # going into each image
                print("Looping through all the image names now...")
                try:
                    for idx, current_image_name in enumerate(all_image_names):
                        current_image_id = get_id(coco_file, current_image_name)
                        current_bboxes, current_segs, current_classes, current_areas = get_coco_annotations_for_image_id(coco_file, current_image_id)

                        # a 'ref' annotation is required; stop at the first image without one
                        if not ("ref" in [i.lower() for i in current_classes]):
                            print(f"No ref label found for {date_folder['name']}, {table_folder['name']}")
                            break

                        current_bboxes = change_bboxes(current_bboxes)
                        without_mask_images, with_mask_images, areas = get_final_images_and_area(all_images[idx],
                                                                                                 current_bboxes,
                                                                                                 current_segs,
                                                                                                 current_classes,
                                                                                                 current_areas)
                        resize_images(with_mask_images)
                        resize_images(without_mask_images)
                        # one entry per annotation of this image
                        for final_idx, _ in enumerate(current_classes):
                            main_dict['date'].append(date_folder['name'])
                            main_dict['table'].append(table_folder['name'])
                            main_dict['tile'].append(idx+1)
                            main_dict['name'].append(current_classes[final_idx])
                            main_dict['without_mask'].append(without_mask_images[final_idx])
                            main_dict['with_mask'].append(with_mask_images[final_idx])
                            main_dict['area'].append(areas[final_idx])

                    table_number = int(re.findall("\d+", table_folder["name"])[0])

                    # With the write call commented out, `last_row` is read
                    # and stored back unchanged, so nothing reaches the excel file.
                    last_row = table_information_dict[table_number]["last_row"]
#                     last_row = main_dict_to_excel_openpyxl(main_dict,
#                                                            last_row,
#                                                            "Jeremy_Analysis_"+str(table_number))
                    table_information_dict[table_number]["last_row"] = last_row

                except Exception as e:
                      print("\nERROR:", e, "\n")

# Part - II: When dealing with HPCL data

### Defining Functions and Installing/Importing Libraries

In [1]:
!pip install XlsxWriter
!pip install fiftyone -q

Collecting XlsxWriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter
Successfully installed XlsxWriter-3.2.0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.5/192.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.1/98.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m


In [2]:
import openpyxl
import xlsxwriter
import pathlib

import pandas as pd
import numpy as np
import json

import os.path
import base64
import io
from io import BytesIO
from urllib.request import urlopen
import re
import glob
from PIL import Image, ImageDraw, ImageOps
from tqdm.notebook import tqdm

import fiftyone as fo
import fiftyone.utils.labels as foul
from shapely.geometry import Polygon

In [3]:
def main(parent_dir_address, sps_to_go_through, excel_sheet_column_names):
    """
    Run the HPCL pipeline: collect paths, prepare the data, write the excel file.

    Args:
        parent_dir_address: Main parent directory path which contains all the SPs.
        sps_to_go_through: All the SP names to process.
        excel_sheet_column_names: Passed on to `write_excel_data` together with
        the prepared data. The previous code passed an undefined name
        (`excel_sheet`) here, which raised NameError.
        NOTE(review): confirm that `write_excel_data` accepts
        (prepared_data, column_names).

    Returns:
        None
    """
    all_images, all_annotations = iterate_over_sps(parent_dir_address, sps_to_go_through)
    prepared_excel_data = prepare_excel_data(all_images, all_annotations)
    write_excel_data(prepared_excel_data, excel_sheet_column_names)
    return None

def iterate_over_sps(parent_dir_address, sps_to_go_through):
    """
    Collect the image and annotation paths of every requested SP.

    Args:
        parent_dir_address: Main parent directory path which contains all the SPs
        sps_to_go_through: All the SP names

    Returns:
        A tuple of two lists with one entry per SP, in the order given. Each
        entry is whatever `iterate_over_ars_inside_an_sp` returned for that SP
        (the results are appended, not flattened).
    """
    collected = [
        iterate_over_ars_inside_an_sp(os.path.join(parent_dir_address, sp))
        for sp in tqdm(sps_to_go_through)
    ]
    images_per_sp = [image_paths for image_paths, _ in collected]
    annotations_per_sp = [annotation_paths for _, annotation_paths in collected]
    return images_per_sp, annotations_per_sp


def iterate_over_ars_inside_an_sp(full_sp_path):
    """
    Collect the image and annotation paths of every ARS folder inside one SP.

    Only sub-directories are visited: a stray file in the SP folder would
    otherwise be handed to the next level, which lists it as a directory and
    fails. Entries are visited in sorted order so results are reproducible.

    Args:
        full_sp_path: Path to the SP directory.

    Returns:
        A tuple of two lists with one entry per ARS folder. Each entry is
        whatever `iterate_over_ar_inside_an_ars` returned for that folder
        (the results are appended, not flattened).
    """
    all_image_paths = []
    all_annotation_paths = []

    ars_paths = [os.path.join(full_sp_path, ars) for ars in sorted(os.listdir(full_sp_path))]
    ars_folders = [path for path in ars_paths if os.path.isdir(path)]

    # iterate over ARS folders
    for full_ars_path in tqdm(ars_folders):
        image_paths, annotation_paths = iterate_over_ar_inside_an_ars(full_ars_path)
        all_image_paths.append(image_paths)
        all_annotation_paths.append(annotation_paths)
    return all_image_paths, all_annotation_paths

def iterate_over_ar_inside_an_ars(full_ars_path):
    """
    Collect the image and annotation paths of every AR folder inside one ARS.

    Only sub-directories are visited, in sorted order, so stray files do not
    produce empty entries and results are reproducible.

    Args:
        full_ars_path: Path to the ARS directory.

    Returns:
        A tuple of two lists with one entry per AR folder: the image paths
        and the annotation paths returned by `fetch_data_from_ar`.
    """
    all_image_paths = []
    all_annotation_paths = []

    ar_paths = [os.path.join(full_ars_path, ar) for ar in sorted(os.listdir(full_ars_path))]
    ar_folders = [path for path in ar_paths if os.path.isdir(path)]

    # iterate over ARs
    for full_ar_path in tqdm(ar_folders):
        image_paths, annotation_paths = fetch_data_from_ar(full_ar_path)
        all_image_paths.append(image_paths)
        all_annotation_paths.append(annotation_paths)
    return all_image_paths, all_annotation_paths


def fetch_data_from_ar(full_ar_path):
    """
    List the images and annotation files of one AR folder.

    Images are looked up in '<full_ar_path>/Processed', annotations in
    '<full_ar_path>/Processed/Annotations'. Missing folders simply
    contribute nothing.

    Args:
        full_ar_path: Path to the AR directory.

    Returns:
        A tuple containing two lists: image paths (.jpg/.png/.jpeg, case
        insensitive) and annotation paths (.json files whose name does not
        contain "vgg", case insensitive).
    """
    processed_folder = os.path.join(full_ar_path, 'Processed')
    annotations_folder = os.path.join(processed_folder, 'Annotations')
    image_extensions = ('.jpg', '.png', '.jpeg')

    image_paths = []
    if os.path.exists(processed_folder):
        image_paths = [
            os.path.join(processed_folder, entry)
            for entry in os.listdir(processed_folder)
            if entry.lower().endswith(image_extensions)
        ]

    annotation_paths = []
    if os.path.exists(annotations_folder):
        annotation_paths = [
            os.path.join(annotations_folder, entry)
            for entry in os.listdir(annotations_folder)
            if entry.lower().endswith('.json') and "vgg" not in entry.lower()
        ]

    return image_paths, annotation_paths

In [4]:
def prepare_excel_data(all_images, all_annotations):
    """
    Placeholder: ignores both arguments and returns True.

    NOTE(review): `main` passes the return value on as the data to write --
    the real preparation logic still has to be implemented.
    """
    return True

def crop_image(image, bbox):
    """
    Crop an image to an [x, y, width, height] bounding box.

    Args:
        image: Object offering ``crop((left, top, right, bottom))``.
        bbox: Four values: left, top, width, height.

    Returns:
        The result of ``image.crop`` for the box.
    """
    left, top, width, height = bbox
    box_corners = (left, top, left + width, top + height)
    return image.crop(box_corners)

def layer_image_with_polylines(image, xy_lines):
    """
    Overlay one filled polygon on an image.

    The polygon is filled on a copy of the image, which is then blended
    50/50 with the original.

    Args:
        image: PIL image.
        xy_lines: Polygon points; flattened into [x0, y0, x1, y1, ...]
        before drawing.

    Returns:
        The blended RGBA image.
    """
    base = image.convert('RGBA')
    overlay = base.copy()
    painter = ImageDraw.Draw(overlay)
    flat_coordinates = np.array(xy_lines).flatten().tolist()
    painter.polygon(flat_coordinates, fill = (255, 255, 0))
    return Image.blend(base, overlay, 0.5)

def unnormalise_xy_polylines(xy_lines, image):
    """
    Convert points from relative (0..1) coordinates to pixel coordinates.

    The input is not modified: a shallow `.copy()` of the outer list would
    still share the inner point lists, so scaling them in place would change
    the caller's data and scale it again on a second call.

    Args:
        xy_lines: Sequence of points; the first two items of each point are
        the relative x and y. Any further items are kept as they are.
        image: Object whose `size` is (width, height).

    Returns:
        list: New [x_pixel, y_pixel, ...] lists with integer pixel values
        (truncated towards zero).
    """
    width, height = image.size[0], image.size[1]
    return [
        [int(point[0] * width), int(point[1] * height), *point[2:]]
        for point in xy_lines
    ]

def unnormalise_bbox(bbox, image):
    """
    Convert a relative [x, y, width, height] box to pixel values.

    Args:
        bbox: Four relative values; x and width are scaled by the image
        width, y and height by the image height.
        image: Object whose `size` is (width, height).

    Returns:
        list: Four integers (truncated towards zero).
    """
    image_width, image_height = image.size[0], image.size[1]
    scales = (image_width, image_height, image_width, image_height)
    return [int(bbox[position] * scales[position]) for position in range(4)]

In [8]:
def write_excel_data(prepared_excel_data=None, excel_sheet=None):
    """
    Placeholder for writing the prepared data to an excel sheet.

    Both parameters are optional so that existing zero-argument calls keep
    working, while `main`, which passes two positional arguments, no longer
    fails with a TypeError.

    Args:
        prepared_excel_data: Data to write (currently unused).
        excel_sheet: Target sheet description (currently unused).

    Returns:
        bool: Always True for now.
    """
    return True

def create_new_excel_file(path_of_file):
    """
    Makes sure an excel file with a headed 'Coral Analysis' sheet exists.

    If the file already exists it is opened with openpyxl, the 'Coral Analysis'
    sheet is added when it is missing, and the header row is (re)written; any
    other content already in that sheet is left in place. If the file does not
    exist, a new workbook containing only that sheet and its header row is
    created with xlsxwriter.

    Args:
        path_of_file (str): The path of the excel file to create or update.
    """
    sheet_name = 'Coral Analysis'
    headers = ("UID", "Original", "With Mask", "Area")

    if os.path.exists(path_of_file):
        wb = openpyxl.load_workbook(path_of_file)
        try:
            # create_sheet() with a title that is already taken adds a renamed
            # duplicate ('Coral Analysis1') instead of failing, so reuse the
            # existing sheet when this runs more than once on the same file.
            if sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
            else:
                ws = wb.create_sheet(sheet_name)

            # openpyxl rows and columns are 1-based
            for column, header in enumerate(headers, start=1):
                ws.cell(row=1, column=column).value = header

            wb.save(path_of_file)
        finally:
            wb.close()
    else:
        workbook = xlsxwriter.Workbook(path_of_file)
        worksheet = workbook.add_worksheet(sheet_name)

        # xlsxwriter rows and columns are 0-based
        for column, header in enumerate(headers):
            worksheet.write(0, column, header)

        workbook.close()

def main_dict_to_excel_openpyxl(main_dict, path_of_file):
    """
    This function writes data from a given dictionary to an excel file.

    Every key of the dictionary becomes one column of the 'Coral Analysis'
    sheet, in the dictionary's key order, and every list position becomes one
    row starting at row 2. Values under "uids" and "area" are written as plain
    cell values; values under any other key are treated as images and embedded
    in the sheet, anchored at their cell.

    Args:
        main_dict (dict): Maps column keys to lists. Must contain "uids", whose
        length decides how many rows are written; the other lists need at
        least that many items.
        path_of_file (str): Path of an existing workbook that already has a
        'Coral Analysis' sheet.

    Returns:
        None
    """
    # imported here so the function does not depend on these submodules having
    # been loaded as a side effect of `import openpyxl`
    from openpyxl.drawing.image import Image as ExcelImage
    from openpyxl.utils import get_column_letter

    wb = openpyxl.load_workbook(path_of_file)
    ws = wb['Coral Analysis']

    number_entries = len(main_dict["uids"])
    plain_value_keys = ("uids", "area")

    for entry_idx in tqdm(range(number_entries)):
        row_number = entry_idx + 2  # data starts on row 2
        for col_number, col_key in enumerate(main_dict.keys(), start=1):
            value = main_dict[col_key][entry_idx]
            if col_key in plain_value_keys:
                ws.cell(row=row_number, column=col_number).value = value
                continue

            thumbnail = resize_image(value)
            # Round-trip through PNG bytes before handing the image over,
            # presumably so openpyxl receives an image with a known format
            # -- TODO confirm this is still required.
            png_buffer = io.BytesIO()
            thumbnail.save(png_buffer, format='png')
            excel_img = ExcelImage(Image.open(png_buffer))
            # fixed display size of the embedded picture
            excel_img.height = 20
            excel_img.width = 64
            # get_column_letter stays correct past column Z, unlike chr(64 + n)
            excel_img.anchor = get_column_letter(col_number) + str(row_number)
            ws.add_image(excel_img)
    wb.save(path_of_file)
    return None

def resize_image(image, size=(256, 256)):
    """
    Returns a thumbnail of an image without touching the image passed in.

    ``thumbnail`` shrinks an image in place, so it is applied to a copy;
    otherwise every image handed to this function would be permanently
    downsized for the caller as well.

    Args:
        image: Object with PIL-style ``copy()`` and ``thumbnail(size)`` methods.
        size (tuple): Maximum ``(width, height)`` of the result. Defaults to
        ``(256, 256)``.

    Returns:
        The resized copy of ``image``.
    """
    resized = image.copy()
    resized.thumbnail(size)
    return resized

### Running Data Fetching Functions

These cells are only needed for understanding how the data-fetching functions behave.

In [None]:
# fetching data from a single AR
# Stored under its own name: `output` is what the main cells further down read,
# and they expect the deeper nesting returned by iterate_over_sps().
single_ar_output = fetch_data_from_ar("drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/SP1/ARS1 - 04.12.23/AR1/")

In [None]:
# fetching data from a single ARS
# Stored under its own name: `output` is what the main cells further down read,
# and they expect the deeper nesting returned by iterate_over_sps().
single_ars_output = iterate_over_ar_inside_an_ars("drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/SP1/ARS1 - 04.12.23/")

In [None]:
# fetching data from a single SP
# Stored under its own name: `output` is what the main cells further down read,
# and they expect the deeper nesting returned by iterate_over_sps().
single_sp_output = iterate_over_ars_inside_an_sp("drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/SP1")

### Main Functions to Run

In [5]:
# fetching data from different SPs
# `output` is consumed by the collection cell below as output[0] / output[1]
monitoring_data_root = "drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/"
sp_folder_names = ["SP1", "SP2", "SP3", "SP4", "SP5", "SP6"]
output = iterate_over_sps(monitoring_data_root, sp_folder_names)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# use the above created output variable, collect all images, create data ready
# to be put into an excel sheet

# todo: generalise this function to operate on any list of lists type structure so
# that it can be used with other functions too

# Margin added on every side of an annotation's bounding box before cropping.
# It is applied to the box values before unnormalise_bbox() scales them by the
# image size, i.e. it is a fraction of the image width / height.
BBOX_PADDING = 0.05
# Factor applied to mask_area / ref_area. Presumably the real-world area of the
# 'ref' marker -- TODO confirm the value and its unit.
REF_REAL_AREA = 25

final_dict = {
    "uids":[],
    "wo_mask":[],
    "w_mask":[],
    "area":[]
}

# go through all collected image and annotations pairs
# (output[0] holds the image paths, output[1] the annotation paths, both nested
# three levels deep as iterated below)
for sp_images, sp_annotations in tqdm(zip(output[0], output[1])):
    for ars_images, ars_annotations in zip(sp_images, sp_annotations):
        for ar_images, ar_annotation in zip(ars_images, ars_annotations):
            # both an annotation file and at least one image are needed;
            # ar_images[0] below would fail on an empty image list
            if len(ar_annotation) == 0 or len(ar_images) == 0:
                continue
            try:
                # create a dataset object using fiftyone library
                # NOTE: only the first annotation file of the folder is used
                dataset = fo.Dataset.from_dir(
                    data_path=os.path.dirname(ar_images[0]),
                    dataset_type=fo.types.COCODetectionDataset,
                    labels_path=ar_annotation[0]
                )
            except KeyError:
                # this happens when there is a mismatch between the image names
                # and the image names as mentioned in that folders coco file
                print(f"Skipping {ar_annotation[0]}: image names do not match the annotation file")
                continue

            # creating polylines from fiftyone segmentation format
            foul.instances_to_polylines(dataset, 'segmentations', 'polylines')

            # go through all loaded images
            for _, sample_image in tqdm(enumerate(dataset)):
                sample_image_name = os.path.splitext(sample_image.filename)[0]

                # exif_transpose returns a new image, so the file handle can be
                # released as soon as that copy exists
                with Image.open(sample_image.filepath) as raw_img:
                    img = ImageOps.exif_transpose(raw_img)

                # Everything for this image is gathered locally first and added
                # to final_dict in one go, so the four lists always stay aligned.
                image_uids = []
                image_crops = []
                image_masked_crops = []
                image_mask_areas = []
                # reset for every image: a 'ref' from a previous image must
                # never be used to scale this image's areas
                ref_area = None

                segmentations = sample_image.segmentations
                detections = segmentations.detections if segmentations is not None else []

                for annotation_idx, single_annotation in enumerate(detections):
                    # take the annotation label and lowercase it. Fill any space with -
                    label = single_annotation.label.lower().strip().replace(" ", "-")

                    # 'ref' gets a fixed suffix; any other label contributes
                    # only its last character to the uid
                    # NOTE(review): labels that differ only before the last
                    # character end up with the same uid -- confirm intended
                    if label == "ref":
                        image_uids.append(f"{sample_image_name}_ref")
                    else:
                        image_uids.append(f"{sample_image_name}_{label[-1]}")

                    # read in the bounding box and inflate it on all sides
                    tl_x, tl_y, w, h = single_annotation.bounding_box
                    inflated_bbox = unnormalise_bbox(
                        (tl_x - BBOX_PADDING, tl_y - BBOX_PADDING,
                         w + 2 * BBOX_PADDING, h + 2 * BBOX_PADDING),
                        img)

                    # crop that portion out of the untouched image
                    image_crops.append(crop_image(img, inflated_bbox))

                    # draw the mask on the image and crop the same inflated box
                    # NOTE: only the first point list of the polyline is used
                    xy_lines = sample_image.polylines.polylines[annotation_idx].points[0]
                    unnormalised_xy_lines = unnormalise_xy_polylines(xy_lines, img)
                    img_w_mask = layer_image_with_polylines(img, unnormalised_xy_lines)
                    image_masked_crops.append(crop_image(img_w_mask, inflated_bbox))

                    # area under the mask in pixels
                    mask_area = Polygon(unnormalised_xy_lines).area
                    image_mask_areas.append(mask_area)
                    if label == "ref":
                        ref_area = mask_area

                # after going through all annotations of the image, scale every
                # mask area by REF_REAL_AREA / ref_area
                if ref_area:
                    image_areas = [area / ref_area * REF_REAL_AREA for area in image_mask_areas]
                else:
                    # no 'ref' (or one with zero area): the areas cannot be
                    # scaled, so keep the rows but leave their area empty
                    if image_uids:
                        print(f"No usable 'ref' annotation in {sample_image.filepath}; areas left empty")
                    image_areas = [None] * len(image_mask_areas)

                final_dict["uids"].extend(image_uids)
                final_dict["wo_mask"].extend(image_crops)
                final_dict["w_mask"].extend(image_masked_crops)
                final_dict["area"].extend(image_areas)

In [18]:
# Separate out the Different ARS1, 2, and 3 Entries and store them in dicts

# one independent dict per ARS, each with the same columns as final_dict
ars1_dict, ars2_dict, ars3_dict = (
    {"uids": [], "wo_mask": [], "w_mask": [], "area": []} for _ in range(3)
)

# separate out files on the basis of ARS: the tags are tried in this order and
# the first one found inside the uid decides where the entry goes
ars_buckets = (("ARS1", ars1_dict), ("ARS2", ars2_dict), ("ARS3", ars3_dict))
for idx, image_name in enumerate(final_dict['uids']):
    matching = next((bucket for tag, bucket in ars_buckets if tag in image_name), None)
    if matching is None:
        # uid mentions none of the three tags: the entry is not copied anywhere
        continue
    for column in ("uids", "wo_mask", "w_mask", "area"):
        matching[column].append(final_dict[column][idx])

In [19]:
# add all Data to Excel sheet
# each ARS dict goes to its own workbook: first make sure the file and its
# header row exist, then fill in the rows
excel_targets = [
    ('drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/ars1_analysis.xlsx', ars1_dict),
    ('drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/ars2_analysis.xlsx', ars2_dict),
    ('drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/ars3_analysis.xlsx', ars3_dict),
]

for excel_path, ars_dict in excel_targets:
    create_new_excel_file(excel_path)
    main_dict_to_excel_openpyxl(ars_dict, excel_path)

  0%|          | 0/340 [00:00<?, ?it/s]

  0%|          | 0/296 [00:00<?, ?it/s]

  0%|          | 0/333 [00:00<?, ?it/s]

### Rough

In [12]:
# create a dataset object using fiftyone library
# (scratch cell: loads one hand-picked AR folder and its COCO annotation file)
rough_processed_dir = "drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/SP1/ARS3 - 05.12.23/AR2/Processed"
rough_labels_file = "drive/MyDrive/Projects/Artificial Reefs- HPCL/Data/Monitoring Data/SP1/ARS3 - 05.12.23/AR2/Processed/Annotations/S1_ARS3_2.json"
dataset = fo.Dataset.from_dir(
    data_path=rough_processed_dir,
    dataset_type=fo.types.COCODetectionDataset,
    labels_path=rough_labels_file,
)

 100% |█████████████████████| 4/4 [743.9ms elapsed, 0s remaining, 5.5 samples/s]      


INFO:eta.core.utils: 100% |█████████████████████| 4/4 [743.9ms elapsed, 0s remaining, 5.5 samples/s]      
