In [None]:
import polars as pl
import matplotlib.pyplot as plt
import blitzbeaver as bb

In [None]:
# Directory containing the input CSV files, one per year, named "<year>.csv".
csv_path = "../../aptihramy/data/csv_cleaned"

# First and last year to load; both are included (note the `+ 1` below).
start_year = 1835
end_year = 1850

# One DataFrame per year, in ascending year order.
# `infer_schema_length` bounds how many rows polars scans to infer column types.
dataframes = [
    pl.read_csv(f"{csv_path}/{year}.csv", infer_schema_length=10000)
    for year in range(start_year, end_year + 1)
]

In [None]:
# Column holding the first names of the children living with their parents.
# It is read from the CSV files as a single "|"-separated string.
children_column = "enfants_chez_parents_prenom"

# Split the string on "|" and drop the empty strings that the split produces
# (e.g. for an empty cell or a doubled separator), yielding a list of names.
split_children_names = (
    pl.col(children_column)
    .str.split("|")
    .list.eval(pl.element().filter(pl.element() != ""))
    .alias(children_column)
)

# Frames whose column is already a list are kept as they are, so re-running
# this cell is harmless (calling `.str.split` on a list column would raise).
dataframes = [
    df
    if isinstance(df.schema[children_column], pl.List)
    else df.with_columns(split_children_names)
    for df in dataframes
]

In [None]:
# Load a previously saved tracking graph from a .beaver file.
# NOTE(review): `graph` is rebound by the `bb.execute_tracking` cell further
# down, so on a top-to-bottom run the graph loaded here is replaced.
path_graph = "../graph.beaver"

graph = bb.read_beaver(path_graph)

In [None]:
# Schema of the record fields handed to blitzbeaver: each entry pairs a field
# name with its element type.
# NOTE(review): the names are presumably columns of the loaded dataframes —
# confirm against the CSV headers.
record_schema = bb.RecordSchema(
    [
        bb.FieldSchema("nom_rue", bb.ElementType.String),
        bb.FieldSchema("chef_prenom", bb.ElementType.String),
        bb.FieldSchema("chef_nom", bb.ElementType.String),
        bb.FieldSchema("chef_origine", bb.ElementType.String),
        bb.FieldSchema("epouse_nom", bb.ElementType.String),
        bb.FieldSchema("chef_vocation", bb.ElementType.String),
        # the only multi-valued field: this column is split into a list of
        # names by the preprocessing cell above
        bb.FieldSchema("enfants_chez_parents_prenom", bb.ElementType.MultiStrings),
    ]
)

In [None]:
# String distance metric configuration. It is shared by the tracking config,
# the multi-string memory config and (further down) the normalization config.
# NOTE(review): the exact meaning of `caching_threshold`, `use_sigmoid` and
# `lv_substring_weight` is defined by blitzbeaver — check its documentation.
distance_metric_config = bb.DistanceMetricConfig(
    metric="lv_opti",
    caching_threshold=4,
    use_sigmoid=False,
    lv_substring_weight=0.5,
)
# Memory configuration passed below as `memory_config`.
normal_memory_config = bb.MemoryConfig(
    memory_strategy="median",
)
# Memory configuration passed below as `multistring_memory_config`; it reuses
# the distance metric defined above as its multiword distance metric.
multi_memory_config = bb.MemoryConfig(
    memory_strategy="mw-median",
    multiword_threshold_match=0.6,
    multiword_distance_metric=distance_metric_config,
)

# Full tracking configuration, consumed by `bb.execute_tracking`.
config = bb.config(
    record_schema=record_schema,
    distance_metric_config=distance_metric_config,
    record_scorer_config=bb.RecordScorerConfig(
        record_scorer="weighted-average",
        # 7 weights for the 7 fields of `record_schema`; presumably matched by
        # position (nom_rue, chef_prenom, chef_nom, ...) — TODO confirm.
        # NOTE(review): the weights sum to 1.05, not 1.0 — confirm that the
        # scorer normalizes them.
        weights=[
            0.15,
            0.25,
            0.25,
            0.1,
            0.1,
            0.1,
            0.1,
        ],
        min_weight_ratio=0.7,
    ),
    resolver_config=bb.ResolverConfig(
        resolving_strategy="best-match",
    ),
    memory_config=normal_memory_config,
    multistring_memory_config=multi_memory_config,
    interest_threshold=0.6,
    limit_no_match_streak=3,
    # NOTE(review): hard-coded thread count — adjust to the machine running this
    num_threads=17,
)

In [None]:
# Compute the tracking graph from the per-year dataframes.
# This rebinds `graph`, replacing the one loaded from the .beaver file above.
# NOTE(review): "debug" is presumably the log level — confirm against the
# `bb.execute_tracking` signature.
graph = bb.execute_tracking(config, record_schema, dataframes, "debug")

In [None]:
def aggregate_histograms(histograms: list[list[int]]) -> list[int]:
    """
    Aggregates a list of histograms into a single histogram.

    The histograms may have different lengths: a bin missing from a shorter
    histogram contributes 0 to the sum.

    Args:
        histograms: Histograms to sum, each one a list of per-bin counts.

    Returns:
        The bin-wise sum of all histograms, as long as the longest input.
        An empty list if `histograms` is empty.
    """
    # `default=0` avoids the ValueError max() raises on an empty sequence
    max_len = max((len(h) for h in histograms), default=0)
    result = [0] * max_len
    for h in histograms:
        for i, v in enumerate(h):
            result[i] += v
    return result

def summary_graph(graph: bb.TrackingGraph):
    """
    Prints summary statistics of a tracking graph and plots three histograms.

    Printed statistics:
        - number of chains (one per entry of `graph.trackers_ids`)
        - average ratio of records matched with an existing tracker
        - average ratio of trackers matched with a record
        - ratio of divergences (a tracker matching more than one record)
        - ratio of conflicts (a record matching more than one tracker)

    Plotted histograms: tracking chain lengths, number of matches per record
    and number of matches per tracker.

    A statistic that has nothing to be computed from (e.g. no match at all)
    is reported as NaN instead of raising ZeroDivisionError.

    Args:
        graph: Tracking graph to summarize. Its `diagnostics.resolvings`,
            `trackers_ids` and `_raw` attributes are read.
    """

    def _ratio(numerator: float, denominator: float) -> float:
        # NaN rather than ZeroDivisionError when there is nothing to divide by
        return numerator / denominator if denominator else float("nan")

    resolvings = graph.diagnostics.resolvings
    # compute the sum of the histograms for all frames
    histogram_records = aggregate_histograms([r.histogram_record_matchs for r in resolvings])
    histogram_trackers = aggregate_histograms([r.histogram_tracker_matchs for r in resolvings])

    chain_metrics = bb.evaluate_tracking_chain_length(graph._raw)
    graph_metrics = bb.evaluate_tracking_graph_properties(graph._raw)

    # NOTE(review): the first ratio (and, for trackers, also the last one) is
    # left out of the average — presumably because those frames have nothing
    # to match against; TODO confirm.
    records_match_ratios = graph_metrics.records_match_ratios[1:]
    trackers_match_ratios = graph_metrics.trackers_match_ratios[1:-1]
    avg_records_match = _ratio(sum(records_match_ratios), len(records_match_ratios))
    avg_trackers_match = _ratio(sum(trackers_match_ratios), len(trackers_match_ratios))

    # The ratios are computed on the full histograms (bin i = number of items
    # with i matches), so that they agree with the definitions printed below;
    # the histograms are only shortened for display.
    per_divergence = _ratio(sum(histogram_trackers[2:]), sum(histogram_trackers[1:]))
    per_conflict = _ratio(sum(histogram_records[2:]), sum(histogram_records[1:]))

    # total number of trackers created
    print(f"Number of chains: {len(graph.trackers_ids)}")
    # average percentage of records that have been matched with an existing tracker
    print(f"Percentage of matching records: {avg_records_match*100:.2f}%")
    # average percentage of trackers that have matched with a record of the current frame
    print(f"Percentage of matching trackers: {avg_trackers_match*100:.2f}%")
    # number of times a tracker matched with more than one record
    # divided by the number of times a tracker matched with a record
    print(f"Percentage of divergences: {per_divergence*100:.2f}%")
    # number of times a record matched with multiple trackers
    # divided by the number of times a record matched with a tracker
    print(f"Percentage of conflicts: {per_conflict*100:.2f}%")

    # do not show beyond 10 elements as the counts are very low
    max_bins_shown = 10
    shown_records = histogram_records[:max_bins_shown]
    shown_trackers = histogram_trackers[:max_bins_shown]

    # bin 0 of the chain length histogram is not shown
    plt.bar(range(1, len(chain_metrics.histogram)), chain_metrics.histogram[1:])
    plt.title("Histogram of tracking chain lengths")
    plt.show()

    plt.bar(range(len(shown_records)), shown_records)
    plt.title("Histogram of # matchs per record")
    plt.show()

    plt.bar(range(len(shown_trackers)), shown_trackers)
    plt.title("Histogram of # matchs per tracker")
    plt.show()

    

In [None]:
# Print the statistics and show the histograms of the current tracking graph.
summary_graph(graph)

In [None]:
# Save the current tracking graph to disk.
# This is the same path the `bb.read_beaver` cell above loads from, so any
# existing file there is presumably overwritten — TODO confirm.
path_graph = "../graph.beaver"

bb.save_beaver(path_graph, graph)

In [None]:
# Configuration of the normalization step; it reuses the distance metric of
# the tracking configuration.
# NOTE(review): presumably values are grouped into clusters when their
# similarity reaches `threshold_cluster_match`, and clusters smaller than
# `min_cluster_size` are ignored — confirm against the blitzbeaver docs.
normalization_config = bb.NormalizationConfig(
    threshold_cluster_match=0.5,
    min_cluster_size=2,
    distance_metric=distance_metric_config,
)

In [None]:
# Run the normalization over the dataframes, using the tracking graph.
# NOTE(review): the result is presumably one normalized dataframe per input
# dataframe; it is passed as `normalized_dataframes` when materializing a
# chain below.
normalized_dfs = bb.execute_normalization(
    normalization_config,
    record_schema,
    graph,
    dataframes,
)

In [None]:
def find_chain_with_length(graph: bb.TrackingGraph, start_idx: int, length: int):
    """
    Finds the first tracker whose tracking chain has at least `length` elements.

    Args:
        graph: Tracking graph to search.
        start_idx: Index in `graph.trackers_ids` at which the search starts.
        length: Minimum length of the tracking chain.

    Returns:
        The id of the first matching tracker (an element of
        `graph.trackers_ids`), or None if no tracker at or after `start_idx`
        has a chain that long.
    """
    # read once instead of on every iteration
    trackers_ids = graph.trackers_ids
    for idx in range(start_idx, len(trackers_ids)):
        tracker_id = trackers_ids[idx]
        if len(graph._raw.get_tracking_chain(tracker_id)) >= length:
            return tracker_id
    return None

In [None]:
# Pick the first tracker, searching from index 2000 of `graph.trackers_ids`,
# whose tracking chain has at least 15 elements.
tracker_id = find_chain_with_length(graph, 2000, 15)
if tracker_id is None:
    # fail here with a clear message instead of passing None further down
    raise ValueError("no tracking chain of length >= 15 found from index 2000")
chain = graph.materialize_tracking_chain(tracker_id, dataframes, record_schema, normalized_dataframes=normalized_dfs)
chain.as_dataframe()

In [None]:
# Display the same chain again, this time with `normalized=True`.
chain.as_dataframe(normalized=True)

In [None]:
# Keep the (non-normalized) chain dataframe for the experiments below.
df = chain.as_dataframe()

In [None]:
def get_col_as_list(df: pl.DataFrame, col: str) -> list[str]:
    """
    Collects the values of column `col` of `df` into a list, leaving out the
    null (None) entries.
    """
    values = []
    for value in df[col]:
        if value is None:
            continue
        values.append(value)
    return values

In [None]:
# Hand-made sample: four close spelling variants of one long word, one
# unrelated string and two short near-identical words.
words = [
    "magimelien",
    "mazimilien",
    "mazirelien",
    "marinelien",
    "hgdfzs",
    "bob",
    "boob",
]
# Display the median word blitzbeaver computes for the sample.
bb.compute_median_word(words)


In [None]:
# Display the clusters blitzbeaver builds from the sample words, using the
# shared distance metric and a match threshold of 0.6.
bb.compute_words_clusters(
    words,
    distance_metric_config,
    threshold_match=0.6,
)

In [None]:
# Display the non-null "nom_rue" values of the selected chain.
get_col_as_list(df, "nom_rue")

In [None]:
# Display the result of normalizing the non-null "nom_rue" values of the
# selected chain.
# NOTE(review): `threshold_match` here (0.6) differs from the
# `threshold_cluster_match` of `normalization_config` (0.5) — confirm this is
# intended.
bb.normalize_words(
    get_col_as_list(df, "nom_rue"),
    distance_metric_config,
    threshold_match=0.6,
    min_cluster_size=2,
)