# QC how to!

This walkthrough works on the NumPy representation of the tracking data, an array of shape (#frames, #nodes, #coords, #individuals).

In [None]:
import glob
from natsort import natsorted
from bee_tracking import Tracking
import os
from utils.logger import logger
import pickle
from tqdm import tqdm

# Root data directory; each sub-directory name becomes one experiment key.
base_dir = "/Genomics/ayroleslab2/scott/bees/data/"

# Keep only entries of base_dir that are directories (plain files are ignored)
# and give each one its own, initially empty, dict.
experiment_names = [
    entry
    for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
]
experiment_dict = {name: {} for name in experiment_names}

In [None]:
# Report which experiment keys are currently registered.
registered_experiments = list(experiment_dict.keys())
logger.info(f"Loading data for experiments: {registered_experiments!s}")

In [None]:
# Narrow the run to a single experiment: experiment_dict is replaced by a
# fresh dict holding only this key, mapped to an empty dict.
key = "tracking_results_20211101QR"
experiment_dict = {}
experiment_dict[key] = {}

In [None]:
# For every experiment, find its ArUco result CSVs (natural-sorted) and build a
# Tracking object from them. Results are stored on the experiment's dict under
# "result_files" and "track".
for key, entry in tqdm(experiment_dict.items()):
    experiment_dir = os.path.join(base_dir, key)

    # Look one directory level below the experiment directory first ...
    result_files = natsorted(
        glob.glob(
            os.path.join(experiment_dir, "*", "*_aruco_data_with_track_numbers.csv")
        )
    )
    # ... and fall back to files sitting directly inside the experiment directory.
    if not result_files:
        result_files = natsorted(
            glob.glob(
                os.path.join(experiment_dir, "*_aruco_data_with_track_numbers.csv")
            )
        )
    entry["result_files"] = result_files

    if not result_files:
        # Nothing to load: skip this experiment instead of handing the loader an
        # empty list. "track" is left unset for it.
        logger.info("No result files found for experiment: " + key)
        continue

    # Only the first two result files are loaded.
    # NOTE(review): the [0:2] cap looks like a quick-iteration limit — confirm
    # before relying on this for full-length QC.
    entry["track"] = Tracking.fromListOfArucoFiles(result_files[0:2])

In [None]:
# What's in the dictionary anyway? Log the field names stored for the
# experiment currently referenced by `key`.
experiment_fields = experiment_dict[key].keys()
logger.info(experiment_fields)

In [None]:
# Fetch the Tracking object of the current experiment and get its array form.
# Per the notes in this notebook the array is laid out as
# (frames, nodes, coords, individuals).
experiment = experiment_dict[key]
track = experiment["track"]
arr = track.to_numpy()

In [None]:
# Expected layout: (num_frames, num_nodes, num_coords, num_individuals)
arr_shape = arr.shape
logger.info(arr_shape)

In [None]:
import pandas as pd

# The tracking table has multi-level columns; collect the distinct labels of
# the first level (uniq_cols) and of the second level (tags).
track_columns = track._track_dataframe.columns
uniq_cols = track_columns.get_level_values(0).unique()
tags = track_columns.get_level_values(1).unique()

logger.info(f"Unique columns: {uniq_cols!s}")
logger.info(f"Unique tags: {tags!s}")

## Real QC!

### Missingness

In [None]:
# An individual counts as missing in a frame when coordinate 0 (x) of every
# one of its nodes is NaN in that frame.
import numpy as np

x_coords = arr[:, :, 0, :]  # coordinate axis removed; node axis is now axis 1
no_nodes_defined = np.all(np.isnan(x_coords), axis=1)
atleast_one_node_defined = ~no_nodes_defined

# Count the missing frames per individual, then normalise by the number of frames.
missing_ct = no_nodes_defined.sum(axis=0)
missing_freq = missing_ct / arr.shape[0]

# NOTE(review): assumes `tags` lines up one-to-one with the last axis of arr — confirm.
qc_df = pd.DataFrame({"tag": tags, "missing_freq": missing_freq})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import colorcet as cc

# One glasbey colour per entry along the last axis of arr.
n_individuals = arr.shape[3]
palette = sns.color_palette(cc.glasbey, n_colors=n_individuals)

sns.set(style="white")
fig, ax = plt.subplots(figsize=(16, 8))

# Frequencies live in [0, 1]; pin the x-axis before drawing so the plot spans
# that full range.
ax.set_xlim(0, 1)

# NOTE(review): no `hue` is given, so `palette` and `multiple` may have nothing
# to act on here — confirm whether per-tag colouring was intended.
hist_style = dict(
    bins=100,
    multiple="stack",
    palette=palette,
    edgecolor=".3",
    linewidth=0.5,
    legend=False,
)
sns.histplot(data=qc_df, x="missing_freq", ax=ax, **hist_style)

ax.set_title("Missingness distribution")
ax.set_xlabel("Frequency of missing data")

sns.set_style("ticks")
sns.despine(offset=10, trim=True)

### Speed histograms

In [None]:
%%capture
import utils.trx_utils as trx_utils
import importlib

# Reload trx_utils so that edits made to the module since it was first
# imported take effect in this session.
importlib.reload(trx_utils)

# Per-node velocities computed with bounds 0 and arr.shape[0].
# NOTE(review): presumably these are start/end frame indices covering every
# frame — confirm against trx_utils.instance_node_velocities.
first_frame, last_frame = 0, arr.shape[0]
node_vels = trx_utils.instance_node_velocities(arr, first_frame, last_frame)

In [None]:
# Position of each body part along the node axis (axis 1) of node_vels.
# NOTE(review): ordering is taken as given here — confirm it matches the
# node order of the tracking data.
node_names = {"abdomen": 0, "tag": 1, "head": 2, "thorax": 3}

# Treat exactly-zero velocities as missing so they are left out of the
# log-scaled histogram. node_vels is still modified in place, as before.
node_vels[node_vels == 0] = np.nan

# Plot the "tag" node only. This slice used to be computed and discarded while
# the whole multi-dimensional array was passed to histplot, which only accepts
# vectors or 2-D wide-form data.
# NOTE(review): the remaining axes are presumably (frames, individuals), which
# would match the per-individual palette — confirm.
tag_vels = node_vels[:, node_names["tag"], :]

fig, ax = plt.subplots(figsize=(16, 8))
sns.histplot(
    ax=ax,
    data=tag_vels,
    bins=100,
    multiple="stack",
    palette=palette,
    edgecolor=".3",
    linewidth=0.5,
    log_scale=True,
    legend=False,
    stat="density",
)
plt.title("Speed distribution - px/frame")
sns.set_style("ticks")
sns.despine(offset=10, trim=True)