# BiteMe | Data Definition

In this notebook we source and download all available data on insect bites and stings.

N.B. This will be explored in v2 if we decide to progress to that.

In [1]:
import os
import hashlib

import pandas as pd

In [2]:
# Locations used throughout the notebook, all relative to the notebook's folder:
# the project root, its "data" folder, and the "raw" folder inside "data".
base_dir_path = "../"
data_dir_path, data_raw_dir_path = (
    os.path.join(base_dir_path, "data"),
    os.path.join(base_dir_path, "data", "raw"),
)

## Rename each image to its hash

In [3]:
def hash_files(data_raw_dir_path: str) -> None:
    """
    Rename every ``.jpg`` file in the raw data directory to its SHA-1 hash.

    Each sub-directory of ``data_raw_dir_path`` is scanned (one level deep) and
    every regular file whose name ends in ``.jpg`` is renamed in place to
    ``<sha1 of file contents>.jpg``. Entries of the raw data directory that are
    not directories (for example a ``metadata.csv`` written next to the label
    folders) and files without a ``.jpg`` suffix are left untouched.

    Parameters
    ----------
    data_raw_dir_path : str
        Relative path to raw data directory.

    Notes
    -----
    Files with identical contents hash to the same target name, so they end up
    competing for a single file name inside their sub-directory.
    """
    def sha1_file(file_path):
        """
        Return the hexadecimal SHA-1 digest of the file's contents.
        """
        # The context manager closes the handle even if read() raises
        with open(file_path, "rb") as f:
            return hashlib.sha1(f.read()).hexdigest()

    # Walk the entries of the raw data directory
    for label_dir_name in os.listdir(data_raw_dir_path):
        label_dir_path = os.path.join(data_raw_dir_path, label_dir_name)

        # Skip stray files: os.listdir() raises NotADirectoryError on them
        if not os.path.isdir(label_dir_path):
            continue

        # Get full relative file paths for images
        for img_name in os.listdir(label_dir_path):
            img_name_path = os.path.join(label_dir_path, img_name)

            # Only regular .jpg files can be hashed; a directory named
            # "*.jpg" would make open() fail
            if not (img_name_path.endswith(".jpg") and os.path.isfile(img_name_path)):
                continue

            # Rename image file with its hash
            hexh = f"{os.path.join(label_dir_path, sha1_file(img_name_path))}.jpg"
            os.rename(img_name_path, hexh)
            # Report only after the rename has actually succeeded
            print(f"Renamed {img_name_path} to {hexh}")

## Create metadata csv from data directories

In [4]:
def create_metadata(data_raw_dir_path: str) -> pd.DataFrame:
    """
    Parses through raw data directory and sub-directories to create a metadata
    table containing image names, image paths and labels.

    Every sub-directory of ``data_raw_dir_path`` is treated as a label, and
    every file in it whose name ends in ``.jpg`` becomes one row. Entries of
    the raw data directory that are not directories are ignored. Labels and
    image names are visited in sorted order so the result is reproducible.

    Parameters
    ----------
    data_raw_dir_path : str
        Relative path to raw data directory.

    Returns
    -------
    pd.DataFrame
        Dataframe with columns ``img_name``, ``img_path`` and ``label``, one
        row per image found. Empty (but with the same columns) when no image
        is found.
    """
    columns = ["img_name", "img_path", "label"]

    # Collect plain records first and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []

    for label in sorted(os.listdir(data_raw_dir_path)):
        label_dir_path = os.path.join(data_raw_dir_path, label)

        if not os.path.isdir(label_dir_path):
            continue

        for img_name in sorted(os.listdir(label_dir_path)):
            # Test the file name's suffix, not a substring of the whole path,
            # so "x.jpg.bak" or a label containing ".jpg" is not picked up
            if img_name.endswith(".jpg"):
                records.append(
                    {
                        "img_name": img_name,
                        "img_path": os.path.join(label_dir_path, img_name),
                        "label": label,
                    }
                )

    # Passing columns explicitly keeps the schema even when records is empty
    return pd.DataFrame(records, columns=columns)

In [5]:
# Build the metadata table from the raw data directory, then store it as
# metadata.csv inside that same directory
metadata_df = create_metadata(data_raw_dir_path)
metadata_df.to_csv(f"{data_raw_dir_path}/metadata.csv")