In [None]:
import pandas as pd
import pathlib
import numpy as np
from numpy import log as ln
import statistics as stats

In [None]:
# Load the per-screen plate metadata table from the local IDR data directory
data_dir = pathlib.Path("publicly-available-microscopy-data/IDR/data")
metadata_file = data_dir / "plate_details_per_screen.parquet.gzip"

metadata_df = pd.read_parquet(metadata_file)
print(metadata_df.head())

In [None]:
# Define stats

# Shannon Index
def h_index(p):
    """Calculates the Shannon Index of a set of unique attribute instances.

    Parameters
    ----------
    p: dict
        Dictionary of relative frequencies for each unique element in an attribute column.

    Returns
    -------
    h: float
        Shannon Index value, i.e. the negated sum of the per-element terms.

    results: list
        List of each p_i * ln(p_i) term (NOT negated), in the iteration order of
        ``p``. The Normalized Median Evenness statistic negates these itself.
    """
    # By convention 0 * ln(0) is taken as 0 (its limit as p -> 0). Evaluating it
    # numerically gives nan, which would otherwise turn the whole index into nan.
    results = [
        0.0 if entry == 0 else entry * ln(entry)
        for entry in p.values()
    ]

    h = -(sum(results))
    return h, results

# Pielou's Evenness
def pielou(h, s):
    """Calculates Pielou's evenness: the Shannon Index divided by ln(richness).

    Parameters
    ----------
    h: float
        Shannon Index value.
    s: int
        Richness, i.e. the number of unique elements the index was computed over.

    Returns
    -------
    float
        ``h / ln(s)``, or NaN when ``s <= 1``.
    """
    # ln(1) == 0 makes the ratio a division by zero, and ln(s) is undefined or
    # negative below that. Report "undefined" explicitly instead of relying on
    # a floating-point warning to produce the NaN.
    if s <= 1:
        return float("nan")
    return h / ln(s)

# Normalized Median Evenness
def nme(h_list):
    """Calculates the Normalized Median Evenness of a set of per-element terms.

    Parameters
    ----------
    h_list: list
        Per-element p_i * ln(p_i) terms, such as the list returned as the second
        value of ``h_index``. Each term is negated before use.

    Returns
    -------
    float
        Median of the negated terms divided by their maximum. NaN when
        ``h_list`` is empty or when the largest negated term is 0, since the
        ratio is undefined in both cases.
    """
    negated_terms = [-1.0 * h_value for h_value in h_list]

    # Neither a median nor a maximum exists for an empty input.
    if not negated_terms:
        return float("nan")

    largest_term = np.max(negated_terms)
    # A zero maximum (e.g. a single element with p == 1) would be 0 / 0.
    if largest_term == 0:
        return float("nan")

    return stats.median(negated_terms) / largest_term

In [None]:
# Record and count unique entries
attribute_names = metadata_df.columns.to_list()
na_attributes = ["pixel_size_x", "pixel_size_y"]
for attribute in na_attributes:
    # Tolerate a metadata table that lacks one of these columns.
    if attribute in attribute_names:
        attribute_names.remove(attribute)

# Columns that are excluded from the per-attribute statistics below.
skipped_attributes = {"stain", "stain_target"}

# NOTE(review): never populated in this cell; kept in case a later cell uses it.
stat_results = list()
attribute_results = list()
for attribute in attribute_names:
    if attribute in skipped_attributes:
        continue

    # Count every distinct value in a single pass. Missing values are dropped:
    # comparing a column against NaN never matches, so counting them by equality
    # yields a zero frequency that turns the Shannon Index into NaN.
    value_counts = metadata_df[attribute].value_counts(sort=False)
    # Categorical columns also report unused categories with a count of 0.
    value_counts = value_counts[value_counts > 0]
    attribute_elements = value_counts.to_dict()

    # A column with no usable values has no defined statistics.
    if not attribute_elements:
        attribute_results.append([attribute, np.nan, np.nan, np.nan])
        continue

    # Calculate richness
    richness = len(attribute_elements)

    # Calculate Shannon Index
    total_instances = sum(attribute_elements.values())
    p_dict = {
        element: count / total_instances
        for element, count in attribute_elements.items()
    }

    h, pi_list = h_index(p=p_dict)

    # Calculate Normalized Median Evenness
    nme_result = nme(pi_list)
    # Calculate Pielou's evenness
    j = pielou(h=h, s=richness)

    attribute_results.append([attribute, h, nme_result, j])

stat_results_df = pd.DataFrame(data=attribute_results, columns=['Attribute', 'H', 'NME', 'J'])
print(stat_results_df)