# Set working directory

In [3]:
# Load the drive to access the images and annotations
from google.colab import drive
drive.mount('/content/gdrive')

# reduce the path of the drive
def driveSymboLink():
  # this creates a symbolic link so that now the path /content/gdrive/My\ Drive/ is equal to /mydrive
  !ln -s /content/gdrive/My\ Drive/ /mydrive
  # !ls /mydrive

driveSymboLink()

Mounted at /content/gdrive


In [4]:
# Root folder of the project on the mounted Drive. It is appended to sys.path
# (to import utils) and used as the prefix for the dataset/reference file paths.
WDIR='/content/gdrive/MyDrive/MIDS/W210/Animal_Identification'

# Negative Samples
We currently have the data split into train-test. Before adding negative samples to our dataset, we need to do another split for dev-test.

In this notebook we will:
- create the new dev-test split (50/50)
- add negative samples (img and txt files) to our train set and to our dev set.

In [6]:
# Some dependencies
# Upgrade Jinja2 to the newest available release (this call is not version-pinned)
!pip install -U Jinja2

# Headless OpenCV build, pinned to an exact version
!pip install opencv-python-headless==4.1.2.30

# fiftyone is imported in the import cell further down (not version-pinned)
!pip install fiftyone

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Jinja2
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 4.9 MB/s 
Installing collected packages: Jinja2
  Attempting uninstall: Jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires Jinja2<3.0,>=2.10.1, but you have jinja2 3.1.2 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed Jinja2-3.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opencv-python-headless==4.1.2.30
  Downloading opencv_python_hea

In [7]:
# Add the path of the repo to locate utils
import sys
sys.path.append(WDIR)

# Import libraries
import fiftyone as fo
import os
import numpy as np
import shutil
from utils.utils import split_dev_test

# Other libraries
import pandas as pd
import json


NumExpr defaulting to 2 threads.
Migrating database to v0.16.2


In [None]:
# Load the drive to access the images and annotations
from google.colab import drive
drive.mount('/content/gdrive')

# reduce the path of the drive
def driveSymboLink():
  # this creates a symbolic link so that now the path /content/gdrive/My\ Drive/ is equal to /mydrive
  !ln -s /content/gdrive/My\ Drive/ /mydrive
  # !ls /mydrive

driveSymboLink()

As a prep step, extract the tar files below if you have them (taken from LILA BC). We used two different datasets:
- [Hyena](https://lila.science/datasets/hyena-id-2022/) (3GB)
- [GZGC](https://lila.science/datasets/great-zebra-giraffe-id) (10GB)

The GZGC dataset has a mix of zebras and giraffes. We extracted only the giraffes from this dataset. Because the number of giraffe images is limited, the negative samples are made up of two-thirds hyenas and one-third giraffes.

In [8]:
!tar -xzvf "/content/gdrive/MyDrive/MIDS/W210/hyena.coco.tar.gz" -C "/content"     #[run this cell to extract tar.gz files]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
hyena.coco/images/train2022/000000000715.jpg
hyena.coco/images/train2022/._000000002102.jpg
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
hyena.coco/images/train2022/000000002102.jpg
hyena.coco/images/train2022/._000000002116.jpg
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
hyena.coco/images/train2022/000000002116.jpg
hyena.coco/images/train2022/._000000000701.jpg
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
hyena.coco/images/train2022/000000000701.jpg
hyena.coco/images/train2022/._000000001379.jpg
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
hyena.coco/images/train2022/000000001379.jpg
hyena.coco/images/train2022/._000000000067.jpg
tar: Ignoring unknown extended header keyword

In [9]:
!tar -xzvf "/content/gdrive/MyDrive/MIDS/W210/gzgc.coco.tar.gz" -C "/content"     #[run this cell to extract tar.gz files]

gzgc.coco/
gzgc.coco/images/
gzgc.coco/images/test2020/
gzgc.coco/images/train2020/
gzgc.coco/images/train2020/000000004944.jpg
gzgc.coco/images/train2020/000000003668.jpg
gzgc.coco/images/train2020/000000001984.jpg
gzgc.coco/images/train2020/000000002797.jpg
gzgc.coco/images/train2020/000000001510.jpg
gzgc.coco/images/train2020/000000001736.jpg
gzgc.coco/images/train2020/000000000376.jpg
gzgc.coco/images/train2020/000000004204.jpg
gzgc.coco/images/train2020/000000000203.jpg
gzgc.coco/images/train2020/000000002177.jpg
gzgc.coco/images/train2020/000000001871.jpg
gzgc.coco/images/train2020/000000004711.jpg
gzgc.coco/images/train2020/000000000977.jpg
gzgc.coco/images/train2020/000000002484.jpg
gzgc.coco/images/train2020/000000004188.jpg
gzgc.coco/images/train2020/000000003681.jpg
gzgc.coco/images/train2020/000000000481.jpg
gzgc.coco/images/train2020/000000001240.jpg
gzgc.coco/images/train2020/000000004685.jpg
gzgc.coco/images/train2020/000000001720.jpg
gzgc.coco/images/train2020/000000000

# Step 1: Split Dev-Test

In [None]:
# Only needed to run the first time
# split_dev_test(dataPath=WDIR+'/datasets/leopard')

Now we have three different datasets:
- 80% train
- 10% dev
- 10% test

We will add negative samples to the train and dev sets.

# Step 2: Check if there are enough negative samples
The negative samples will be added to train-dev sets. These include images from hyenas and giraffes.

In [31]:
# Folders of the leopard dev/train splits, built from WDIR so the project root
# is defined in a single place.
leopard_path=WDIR+'/datasets/leopard'

dev_images_path=leopard_path+'/dev/images'
dev_labels_path=leopard_path+'/dev/labels'

train_images_path=leopard_path+'/train/images'
train_labels_path=leopard_path+'/train/labels'

# Check the number of negative samples needed
# Files named hyena_* / giraffe_* are negatives added by Step 3; they are left
# out of the count so re-running this cell after Step 3 gives the same number.
neg_prefixes=('hyena_','giraffe_')
num_sample_dev=sum(not name.startswith(neg_prefixes) for name in os.listdir(dev_images_path))
num_sample_train=sum(not name.startswith(neg_prefixes) for name in os.listdir(train_images_path))
neg_sample_split=0.20  # negatives to add, as a fraction of the train+dev image count

total_samples= num_sample_dev + num_sample_train
num_neg_samples = round(total_samples*neg_sample_split)
print(f"Negative samples that will be added: {num_neg_samples}")

Negative samples that will be added: 1221


In [32]:
# Save the path to the hyena and giraffe folders (already untared)
# Both folders live under /content, where the two tar.gz archives were extracted.
hyena_img_path='/content/hyena.coco/images/train2022'
giraffe_img_path='/content/gzgc.coco/images/train2020'

In [33]:
# Check the number of samples in the hyena dataset
# Hidden entries (names starting with "."), such as the "._<name>.jpg" files
# listed in the extraction log, are skipped. Testing the prefix instead of
# looking for "_" anywhere keeps regular images whose name has an underscore.
clean_hyena=[i for i in os.listdir(hyena_img_path) if not i.startswith(".")]

# Hyenas are expected to make up two-thirds of the negative samples
num_sample_hyena=len(clean_hyena)
if (2*num_neg_samples/3)>num_sample_hyena:
  print("There are not enough samples from hyena dataset")
  # Report the shortfall as well, like the giraffe check does
  print(f"{num_sample_hyena} out of {(2*num_neg_samples/3)} required")
else:
  print("Enough hyena samples")
  print(f"There are {num_sample_hyena} out of {(2*num_neg_samples/3)} required.")

Enough hyena samples
There are 3104 out of 814.0 required.


In the code below we extract only the images annotated as giraffes. We got the list of all the giraffes by separating them manually, because many of the annotations were wrong, which made it hard to extract them programmatically.

In [23]:
#########
# No need to run again, this was done to create the curated giraffe reference file for reproducibility
#########

# # get the images of the giraffes
# clean_giraffe=[i for i in os.listdir(WDIR+'/Giraffe') if "_" not in i]

# # save them
# with open(WDIR+'/datasets/negatives/giraffe_reference.txt', 'w') as g:
#   for image_name in clean_giraffe:
#     g.write(image_name+'\n')

Use the `giraffe_reference.txt` file to extract only the giraffe images.

In [34]:
# Load the GZGC train annotation file into `data`.
# The context manager closes the file as soon as the JSON has been parsed
# (the handle was previously left open).
with open("/content/gzgc.coco/annotations/instances_train2020.json") as f:
  data = json.load(f)

In [35]:
# Read the curated giraffe reference file into a list with one entry per line
# (each entry keeps its line ending)
with open(WDIR+'/datasets/negatives/giraffe_reference.txt', 'r') as f:
  giraffes=list(f)

In [None]:
# if data['images'][record]['file_name'] in clean_giraffe:
#   giraffe.append(data['images'][record]['file_name'])

In [36]:
# Compare the size of the giraffe list with the one-third share of negatives it must cover
num_sample_giraffe=len(giraffes)

if num_sample_giraffe>=(num_neg_samples/3):
  print("Enough giraffe samples")
  print(f"There are {num_sample_giraffe} out of {num_neg_samples/3} required.")
else:
  print("There are not enough samples from giraffe")
  print(f"{num_sample_giraffe} out of {num_neg_samples/3} required")

Enough giraffe samples
There are 532 out of 407.0 required.


In [17]:
# giraffes=[]

# # Check the number of samples in the giraffe dataset
# num_sample_giraffe=len(os.listdir(giraffe_img_path))

# for record in range(num_sample_giraffe):
#   if data['annotations'][record]['category_id'] == 0:
#     giraffes.append(data['images'][record]['file_name'])

# if (num_neg_samples/3)>len(giraffes):
#   print("There are not enough samples from giraffe")
#   print(f"{len(giraffes)} out of {num_neg_samples/3} required")
# else:
#   print("Enough giraffe samples")
#   print(f"There are {len(giraffes)} out of {num_neg_samples/3} required.")

Enough giraffe samples
There are 455 out of 407.0 required.


# Step 3: Move the negative files to the correct folder

In [37]:
# Every image moved and every label file created below is recorded in these
# lists so the operation can be documented and reverted afterwards.
neg_annotations=[]
neg_images=[]

np.random.seed(42)
neg_sample_count=0  # not read in this cell; kept so the notebook namespace is unchanged

# Shuffle both pools in place (hyenas first, then giraffes); images are then
# taken from the front of each list.
np.random.shuffle(clean_hyena)
np.random.shuffle(giraffes)

train_share=0.8   # probability that a negative sample goes to train (otherwise dev)
hyena_share=0.66  # probability that a negative sample is a hyena (otherwise a giraffe)

# Take a sample until we collect the num of neg samples
for s in range(num_neg_samples):
  # Two draws per sample, always in this order, so the seeded sequence is stable
  first_split = np.random.random_sample()
  second_split = np.random.random_sample()

  # Check if the image will be moved to the train or dev set
  if first_split < train_share:
    images_path, labels_path = train_images_path, train_labels_path
  else:
    images_path, labels_path = dev_images_path, dev_labels_path

  # Check if the image has to be hyena or giraffe
  if second_split < hyena_share:
    suffix='/hyena_'
    img=clean_hyena.pop(0)
    source=hyena_img_path+'/'+img
  else:
    suffix='/giraffe_'
    # Entries of the giraffe list may still carry their line ending; strip only
    # that, so an entry without a trailing newline does not lose a character.
    img=giraffes.pop(0).rstrip('\r\n')
    source=giraffe_img_path+'/'+img

  # New name: <species prefix> + original file name; the label shares the stem
  new_filename=suffix+img
  destination=images_path+new_filename
  # splitext handles any extension length (e.g. ".jpeg"), unlike slicing 3 chars
  labelPath=labels_path+suffix+os.path.splitext(img)[0]+'.txt'

  # Move the image first and record it only once the move has succeeded, so the
  # reference lists never mention a file that was not actually created.
  shutil.move(source,destination)
  neg_images.append(destination)

  # Create the empty text (label) file for the negative sample
  with open(labelPath, 'w'): pass
  neg_annotations.append(labelPath)

# print(neg_annotations)
# print(neg_images)

Don't forget to document all the added files in case you want to revert the state.

In [38]:
# Persist the recorded label paths and image paths, one path per line,
# in two reference files under datasets/negatives
with open(WDIR+'/datasets/negatives/neg_annotations.txt','w') as annotations_file:
  annotations_file.writelines(path+"\n" for path in neg_annotations)

with open(WDIR+'/datasets/negatives/neg_images.txt','w') as images_file:
  images_file.writelines(path+"\n" for path in neg_images)

In [39]:
# Sanity check: the number of recorded label files equals the number of
# negative samples requested (displays True when they match)
len(neg_annotations) == num_neg_samples

True

# Revert the state
If something went wrong or you'd like to revert the state, run the cells below.

- Load the annotations
- Delete the files

In [None]:
# Delete all images
# Removes every image listed in neg_images.txt. Note that Step 3 moved (not
# copied) these images, so deleting them here does not put them back in the
# extracted hyena/giraffe folders.
filePath=WDIR+'/datasets/negatives/neg_images.txt'
with open(filePath, 'r') as f:
  lines=f.readlines()

for line in lines:
  # Strip only the line ending; a last line without a newline stays intact
  target=line.rstrip('\r\n')
  if not target:
    continue  # ignore blank lines
  try:
    os.remove(target)
  except FileNotFoundError:
    # Lets the cell be re-run after a partial revert instead of crashing
    print(f"Already removed, skipping: {target}")

In [None]:
# Delete all text files
# Removes every empty label file listed in neg_annotations.txt.
filePath=WDIR+'/datasets/negatives/neg_annotations.txt'
with open(filePath, 'r') as f:
  lines=f.readlines()

for line in lines:
  # Strip only the line ending; a last line without a newline stays intact
  target=line.rstrip('\r\n')
  if not target:
    continue  # ignore blank lines
  try:
    os.remove(target)
  except FileNotFoundError:
    # Lets the cell be re-run after a partial revert instead of crashing
    print(f"Already removed, skipping: {target}")