In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Column types applied to both code tables when parsing; the three code/text
# columns are read as strings and "page" as an integer.
dtypes = dict(ais_code=str, description=str, parent_code=str, page=int)

# df   : codes table that the rest of the notebook checks and corrects
# src  : extracted codes table, used as the reference to compare df against
# meta : per-page formatting metadata (parsed with pandas' default dtypes)
df = pd.read_csv("../data/ais_codes.csv", dtype=dtypes)
src = pd.read_csv("../data/ais_codes_extracted.csv", dtype=dtypes)
meta = pd.read_csv("../data/formatting_metadata.csv")

In [None]:
# Hierarchy formatting must not have changed the codes from the original extraction.
# 1) per page, the number of input codes equals the number of output codes
assert meta["same_len"].all()
# 2) codes and descriptions match exactly, column by column (codes are compared first)
assert all(src[column].equals(df[column]) for column in ("ais_code", "description"))

In [None]:
# Count the parent codes that do not appear anywhere in the ais_code column.
# 000000.0 indicates top level codes
df["parent_code"][~df["parent_code"].isin(df["ais_code"])].value_counts()

In [None]:
# Flag rows that name themselves as their own parent (self-referential nodes),
# then display them.
self_ref_mask = df["ais_code"].eq(df["parent_code"])
df.loc[self_ref_mask]

In [None]:
# Self-referential rows are handled separately, so leave them out before
# looking for longer cycles.
temp = df.loc[~self_ref_mask]

# Directed graph as an adjacency list: parent code -> list of child codes,
# with children kept in row order.
graph = defaultdict(list)
for parent, child in zip(temp["parent_code"], temp["ais_code"]):
    graph[parent].append(child)

In [None]:
def get_cycle_nodes(adj_list: dict[str, list[str]]) -> list[str]:
    """Return the keys of ``adj_list`` from which a directed cycle can be reached.

    A key is reported when it lies on a cycle (self-loops included) *or* when
    following its outgoing edges eventually leads into one, so nodes that
    merely feed into a cycle are reported as well.

    The traversal uses an explicit stack instead of recursion, so long
    parent/child chains cannot raise ``RecursionError``.

    Args:
        adj_list: Mapping from a node to the list of nodes it points to. Nodes
            that only appear as targets are treated as having no outgoing
            edges. Lookups go through ``.get``, so a ``defaultdict`` is not
            grown by the search.

    Returns:
        The reported keys, in the iteration order of ``adj_list``.
    """
    # Node states. A node absent from `state` has not been seen yet.
    on_path, acyclic, cyclic = "on_path", "acyclic", "cyclic"
    state: dict[str, str] = {}

    def reaches_cycle(start: str) -> bool:
        """Depth-first search from ``start``; True if it can reach a cycle."""
        if start in state:
            # Already resolved by an earlier search.
            return state[start] == cyclic

        state[start] = on_path
        # Each entry is (node, iterator over its not-yet-examined children).
        path = [(start, iter(adj_list.get(start, ())))]
        while path:
            node, pending = path[-1]
            descended = False
            for child in pending:
                child_state = state.get(child)
                if child_state is None:
                    # Unseen child: go one level deeper.
                    state[child] = on_path
                    path.append((child, iter(adj_list.get(child, ()))))
                    descended = True
                    break
                if child_state != acyclic:
                    # Either a back-edge to the current path (a cycle was
                    # just closed) or an edge into a node already known to
                    # reach a cycle. In both cases every node on the current
                    # path can reach a cycle.
                    for member, _ in path:
                        state[member] = cyclic
                    return True
            if not descended:
                # All children examined and none leads to a cycle.
                state[node] = acyclic
                path.pop()
        return False

    return [node for node in adj_list if reaches_cycle(node)]

In [None]:
# Run the cycle search on the parent -> children graph, flag every row whose
# code was reported, and display those rows.
cycle_nodes = get_cycle_nodes(graph)
cycle_mask = df["ais_code"].isin(cycle_nodes)
df.loc[cycle_mask]

In [None]:
# Pages holding a self-referential or cycle-flagged code are expected to have
# "max depth reached" in their reformatted text in the metadata.
cycle_pages = df["page"][self_ref_mask | cycle_mask].drop_duplicates()
assert meta["reformatted"][meta["page"].isin(cycle_pages)].str.contains("max depth reached").all()

In [None]:
# Show every row whose code occurs more than once; keep=False marks all
# occurrences, not just the repeats.
df.loc[df["ais_code"].duplicated(keep=False)]

In [None]:
# manually correct cycles and duplicates
# Each correction rewrites the code of one row, addressed by its index label,
# and then re-points the rows that used the old value as their parent.

# Guard: assigning through .loc with a label that is not in the index does not
# fail -- it silently appends a new row. Stop here instead if the hard-coded
# labels no longer exist (e.g. the input file changed).
assert 952 in df.index and 1065 in df.index, "hard-coded row labels 952/1065 not found in df"

df.loc[952, "ais_code"] = "544899.2"
# only children on page 104 are re-pointed for this code
df.loc[(df["parent_code"] == "544999.2") & (df["page"] == 104), "parent_code"] = "544899.2"

df.loc[1065, "ais_code"] = "620099.9"
df.loc[df["parent_code"] == "620999.9", "parent_code"] = "620099.9"

In [None]:
# Write the corrected table back to disk without the index column.
# NOTE(review): this is the same path `df` was loaded from, so the input file
# is overwritten in place -- confirm the notebook's checks against the
# extracted codes are still expected to pass on a later re-run.
df.to_csv(path_or_buf="../data/ais_codes.csv", index=False)