# THIS IS TO BE USED ONLY AS A REFERENCE FOR THE FINAL CODE
### Treat it as a demo file

In [1]:
import pandas as pd
import os
import random
from utils import specimen_train_test_split
from tqdm import tqdm
import shutil

In [2]:
import os

# Define constants
CURR_DIR = os.getcwd()
CSV_DIR = os.path.join(CURR_DIR, 'csv')
IMAGE_STORE = os.path.join(CURR_DIR, 'images/after_padding_cropped/')
DATA_FOLDER = os.path.join(CURR_DIR, 'data')
ORGANIZED_DATA_FOLDER = os.path.join(DATA_FOLDER, 'organized_data')
TRAIN_FOLDER = os.path.join(ORGANIZED_DATA_FOLDER, 'train')
TEST_FOLDER = os.path.join(ORGANIZED_DATA_FOLDER, 'test')

COLUMNS_OF_INTEREST = ['Image_name', 'specimen', 'species', 'source_dataset']

# Check for existence of required directories and create them if necessary
assert os.path.exists(IMAGE_STORE), "IMAGE_STORE does not exist. Run download_data.ipynb first"
if not os.path.exists(DATA_FOLDER):
    os.mkdir(DATA_FOLDER)
if not os.path.exists(ORGANIZED_DATA_FOLDER):
    os.mkdir(ORGANIZED_DATA_FOLDER)
if not os.path.exists(TRAIN_FOLDER):
    os.mkdir(TRAIN_FOLDER)
if not os.path.exists(TEST_FOLDER):
    os.mkdir(TEST_FOLDER)


### Train Test CSV structure:
- index - index of the image
- specimen - specimen name, this is unique for each individual mosquito. Each specimen can have multiple images
- Image_name - name of the image (generally of the form: `specimen + number`)
- species - one hot encoded species category (Anopheles funestus, Culex etc)
- source_dataset - helps direct which folder to select images from in the dataloader

In [3]:
# read data from M7 and M4 
df_M7 = pd.read_csv(os.path.join(CSV_DIR, 'M7_data.csv'))
df_M4 = pd.read_csv(os.path.join(CSV_DIR, 'M4_data.csv'))
df_steph = pd.read_csv(os.path.join(CSV_DIR, 'steph_march.csv')) # colony bred stephensi specimen imaged in March
df_gamb = pd.read_csv(os.path.join(CSV_DIR, 'gambiae_march.csv')) # colony bred gambiae specimen imaged in March

df_map = {}
df_map['M7'] = df_M7
df_map['stephensi_march'] = df_steph
df_map['gambiae_march'] = df_gamb

### Manual data clean-up step -> Assigning correct species labels

Here, I am following a 6 class structure for the species labels. The 6 classes are:
- Anopheles funestus
- Anopheles gambiae
- Anopheles stephensi
- Anopheles other
- Culex
- Mansonia

Classes are 0 indexed, so the class labels are: `0, 1, 2, 3, 4, 5`

In [4]:
def clean_species_specimen_names(df):
    df['morphSpecies'] = df['morphSpecies'].str.lower()
    # remove rows that are nm
    df = df[df['morphSpecies'] != 'nm'].copy()
    # remove extra spaces
    df['morphSpecies'] = df['morphSpecies'].str.strip()

    unique_identifier = 'specimenId_unique' if 'specimenId_unique' in df.columns else 'specimenIdDup'
    df['Image_name'] = df[unique_identifier] + ".jpg"
    df['specimen'] = ""
    for idx, row in df.iterrows():
        #get the string before the last underscore
        if '_' in row['specimenId']:
            df.at[idx,'specimen'] = "".join(row[unique_identifier].split('_')[:-1])
        else:
            df.at[idx,'specimen'] = row[unique_identifier]
    return df

for k in df_map.keys():
    df_map[k]["species"] = -1 # this is to make sure that all species columns are filled 
    df_map[k] = clean_species_specimen_names(df_map[k])



##### This is hard-coded. Be carful

In [5]:
# we know that gambiae and stephensi are the only species in their respective datasets
df_map['stephensi_march']['morphSpecies'] = 'anopheles stephensi'
df_map['gambiae_march']['morphSpecies'] = 'anopheles gambiae'

#### Identify unique morphSpecies name and decide on the class labels

In [6]:
unique_morph_species = []
for k in df_map.keys():
    unique_morph_species.extend(df_map[k]['morphSpecies'].unique())

print(f"Unique morph species: {set(unique_morph_species)}")

MORPHSPECIES_TO_ONEHOT_SPECIES = { ## Modify this mapping as needed
    'anopheles funestus': 0,
    'anopheles gambiae': 1,
    'anopheles stephensi': 2,
    'anopheles other': 3,
    'culex': 4,
    'mansonia': 5,
    'other': 5,
    'ziemanni': 3
}

# assert that all species are in the mapping
for k in unique_morph_species:
    assert k in MORPHSPECIES_TO_ONEHOT_SPECIES.keys(), f"{k} not in mapping, add it to MORPHSPECIES_TO_ONEHOT_SPECIES"

# add onehot species column
for k in df_map.keys():
    df_map[k]['species'] = df_map[k]['morphSpecies'].map(MORPHSPECIES_TO_ONEHOT_SPECIES)

Unique morph species: {'anopheles other', 'anopheles gambiae', 'mansonia', 'ziemanni', 'culex', 'anopheles funestus', 'other', 'anopheles stephensi'}


### Creating folder structure for the images

In [7]:
for k in df_map.keys():
    df_map[k]['source_dataset'] = k
    # create folder k in DATA_FOLDER if it doesn't exist
    if not os.path.exists(os.path.join(DATA_FOLDER, k)):
        os.mkdir(os.path.join(DATA_FOLDER, k))
    print(f"{k} has {len(df_map[k])} rows")

M7 has 3706 rows
stephensi_march has 188 rows
gambiae_march has 109 rows


In [8]:
### Check that all required columns are present
def select_columns(df_map):
    cleaned_df_map = {}
    for k in df_map.keys():
        for col in COLUMNS_OF_INTEREST:
            assert col in df_map[k].columns, f"{col} not in {k}'s columns"
        cleaned_df_map[k] = df_map[k][COLUMNS_OF_INTEREST].copy()
    return cleaned_df_map

In [9]:
cleaned_df_map = select_columns(df_map)

### (This next block of code is optional )Data before M4 is from ODK. Here is some short code to replicate the same here

In [10]:
# make a copy of the df_M4 DataFrame
df_M4 = df_M4.copy()

# rename 'value' and 'mosquito_id' columns
df_M4.rename(columns={'value': 'Image_name', 'mosquito_id': 'specimen'}, inplace=True)

# map 'morph_id_anopheles_species' column values to 'gambiae' and 'stephensi'
species_map = {'gambiae': 1, 'stephensi': 2}
df_M4['species'] = df_M4['morph_id_anopheles_species'].map(species_map)

# select only the rows with non-null species values
df_M4 = df_M4[df_M4['species'].notna()].copy()

# add 'source_dataset' column with value 'M4'
df_M4['source_dataset'] = 'M4'

# print the count of unique species values
print(df_M4['species'].value_counts())

for col in COLUMNS_OF_INTEREST:
    assert col in df_M4.columns, f"{col} not in df_M4's columns"

# add the M4 DataFrame to the cleaned_df_map
cleaned_df_map['M4'] = df_M4[COLUMNS_OF_INTEREST].copy()

# create folder M4 in DATA_FOLDER if it doesn't exist
if not os.path.exists(os.path.join(DATA_FOLDER, 'M4')):
    os.mkdir(os.path.join(DATA_FOLDER, 'M4'))   

2.0    420
1.0    297
Name: species, dtype: int64


### Organize into source dataset folders


In [11]:
def identify_and_move_valid_images(df):
    df["Image_Available"] = True
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        image_name = row['Image_name']
        source_dataset = row['source_dataset']
        species = row['species']
        specimen = row['specimen']
        image_path = os.path.join(IMAGE_STORE, image_name)
        # check if the image exists
        if not os.path.exists(image_path):
            df.at[idx, 'Image_Available'] = False
            continue
        destination_path = os.path.join(DATA_FOLDER, source_dataset)
        # create the destination path if it doesn't exist
        if not os.path.exists(destination_path):
            os.makedirs(destination_path)
        # copy the image to the destination path
        shutil.copy(image_path, destination_path)
    # select only the rows with valid images
    df = df[df['Image_Available']].copy()
    return df

for k in cleaned_df_map.keys():
    print(f"Processing {k} dataframe")
    cleaned_df_map[k] = identify_and_move_valid_images(cleaned_df_map[k])
    print(f"{k} has {len(cleaned_df_map[k])} rows \n")


Processing M7 dataframe


100%|██████████| 3706/3706 [00:03<00:00, 959.73it/s] 


M7 has 3706 rows 

Processing stephensi_march dataframe


100%|██████████| 188/188 [00:00<00:00, 4924.80it/s]


stephensi_march has 188 rows 

Processing gambiae_march dataframe


100%|██████████| 109/109 [00:00<00:00, 4493.16it/s]


gambiae_march has 109 rows 

Processing M4 dataframe


100%|██████████| 717/717 [00:00<00:00, 4282.96it/s]

M4 has 295 rows 






# Create the train test split

In [12]:
train_test_dict = {}
for k in cleaned_df_map.keys():
    print(f"\nProcessing {k} dataframe")
    output = specimen_train_test_split(cleaned_df_map[k]) # output is a tuple of (train_df, test_df)
    train_test_dict[k] = output

# join and concatenate the train and test dataframes
train_df = pd.concat([train_test_dict[k][0] for k in train_test_dict.keys()])
test_df = pd.concat([train_test_dict[k][1] for k in train_test_dict.keys()])


Processing M7 dataframe
train_list length: 1427
test_list length: 357

Processing stephensi_march dataframe
train_list length: 40
test_list length: 10

Processing gambiae_march dataframe
train_list length: 31
test_list length: 8

Processing M4 dataframe
train_list length: 88
test_list length: 22


## Copy the images into the train and test folders (if dataloader uses this structure)

In [13]:
# delete everything in the train and test folders
for folder in [TRAIN_FOLDER, TEST_FOLDER]:
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.mkdir(folder)
    
# copy the images to the train and test folders according to the train and test dataframes and create folders for each species as C_{species}
def copy_images_to_train_test_folders(df, dest_folder):
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Copying images to {dest_folder.split('/')[-1]} folder"):
        image_name = row['Image_name']
        species = row['species']
        image_path = os.path.join(DATA_FOLDER, row['source_dataset'], image_name)
        if not os.path.exists(image_path):
            print(f"{image_path} doesn't exist")
            continue
        destination_path = os.path.join(dest_folder, f"C_{species}")
        # create the destination path if it doesn't exist
        if not os.path.exists(destination_path):
            os.makedirs(destination_path)
        # copy the image to the destination path
        shutil.copy(image_path, destination_path)

copy_images_to_train_test_folders(train_df, TRAIN_FOLDER)
copy_images_to_train_test_folders(test_df, TEST_FOLDER)

Copying images to train folder: 100%|██████████| 3470/3470 [00:02<00:00, 1619.26it/s]
Copying images to test folder: 100%|██████████| 828/828 [00:00<00:00, 1604.29it/s]
