In [None]:
import glob
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import statsmodels.formula.api as smf
import yaml

# Apply matplotlib's "ggplot" style to the figures drawn below.
plt.style.use("ggplot")

# Explore NodeODM processing statistics

## 1. Read NodeODM logs

In [None]:
# Root folder searched for NodeODM logs. Each log is expected at
# <mission>/report/log.json directly below this folder.
base_path = r"/home/notebook/shared-seabee-ns9879k/seabirds/2024"

In [None]:
# Locate every NodeODM log below the base folder. glob returns matches in
# arbitrary order, so sort them to make the row order reproducible.
search_path = os.path.join(base_path, r"*/report/log.json")
flist = sorted(glob.glob(search_path))

# Build one record per mission instead of maintaining parallel lists.
records = []
for fpath in flist:
    # Logs live at <mission>/report/log.json, so the mission name is the
    # folder two levels above the file.
    mission = os.path.basename(os.path.dirname(os.path.dirname(fpath)))
    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(fpath, encoding="utf-8") as f:
        data = json.load(f)

    records.append(
        {
            "mission_name": mission,
            "nfiles": data["images"],
            # assumes "totalTime" is reported in seconds — TODO confirm
            "time_mins": data["totalTime"] / 60,
            "start_date": data["startTime"],
            "end_date": data["endTime"],
        }
    )

# Passing `columns` keeps the expected schema even when no logs are found.
df = pd.DataFrame(
    records,
    columns=["mission_name", "nfiles", "time_mins", "start_date", "end_date"],
)
df.head()

## 2. CDF of image counts

In [None]:
# Empirical CDF of the number of raw images per mission.
# The sample size shown in the title is taken from the plotted frame itself
# rather than from the notebook-level `flist`, so it always matches the data
# being drawn regardless of which other cells have been run.
g = sn.displot(df["nfiles"], kind="ecdf")
g.set(xlabel="Number of files", title=f"Empirical CDF (n = {len(df)})")

## 3. CDF of processing times

In [None]:
# Empirical CDF of processing time, converted from minutes to hours.
# The sample size shown in the title is taken from the plotted frame itself
# rather than from the notebook-level `flist`, so it always matches the data
# being drawn regardless of which other cells have been run.
g = sn.displot(df["time_mins"] / 60, kind="ecdf")
g.set(xlabel="Processing time (hours)", title=f"Empirical CDF (n = {len(df)})")

## 3. Relationship between number of images and processing time

In [None]:
# Regress processing time on image count; the "- 1" term in the formula
# removes the intercept, forcing the fitted line through the origin.
model = smf.ols(formula="time_mins ~ nfiles - 1", data=df)
res = model.fit()
print(res.summary())

# In-sample predictions with their confidence bounds, ordered by image count
# so the fitted line and its band are drawn from left to right.
res_df = (
    res.get_prediction()
    .summary_frame()
    .assign(nfiles=df["nfiles"])
    .sort_values("nfiles")
)

# Observations as red dots, fitted mean as a black line, and the confidence
# interval of the mean as a light red band.
ax = plt.gca()
ax.plot(df["nfiles"], df["time_mins"], "ro")
ax.plot(res_df["nfiles"], res_df["mean"], "k-")
ax.fill_between(
    res_df["nfiles"],
    res_df["mean_ci_lower"],
    res_df["mean_ci_upper"],
    alpha=0.1,
    color="r",
)
ax.set_xlabel("Number of raw images")
ax.set_ylabel("Processing time (mins)")

## 4. Memory and CPU usage

In [None]:
txt_path = r"/home/notebook/shared-seabee-ns9879k/notebook-logs/nodeodm-4missions-5249images-32cpu.txt"

# Parse the whitespace-separated usage log, skipping its first two lines.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# pandas deprecated in version 2.2.
log_df = pd.read_csv(
    txt_path,
    skiprows=2,
    header=None,
    names=["Name", "CPUs", "Memory_GB"],
    sep=r"\s+",
)

# Strip the "m" suffix and divide by 1000 (presumably millicores -> cores).
log_df["CPUs"] = log_df["CPUs"].str.replace("m", "", regex=False).astype(int) / 1000

# NOTE(review): the raw values carry a "Mi" suffix; dividing by 1000 only
# approximates GB (use / 1024 for GiB) — confirm which unit is intended.
log_df["Memory_GB"] = (
    log_df["Memory_GB"].str.replace("Mi", "", regex=False).astype(int) / 1000
)

# The name column is not plotted.
log_df = log_df.drop(columns="Name")

# Presumably each row has a leading timestamp field in addition to the three
# named columns, which pandas then uses as the index — TODO confirm.
log_df.index = pd.to_datetime(log_df.index)
log_df.plot(subplots=True)

## 5. Count of missions per day

In [None]:
# Count the entries matching each day's prefix for Team 1 in the 2023 folder.
# The loop uses its own variable names so that it does not overwrite the
# `search_path` and `flist` variables created by the log-reading cell.
for day in range(1, 16):
    day_pattern = f"/home/notebook/shared-seabee-ns9879k/seabirds/2023/Team1Dag{day}_*"
    day_matches = glob.glob(day_pattern)
    # NOTE(review): the printed label is zero-padded but the glob pattern is
    # not — confirm this matches the folder naming on disk.
    print(f"Team1Dag{day:02d}:", len(day_matches))