In [None]:
import matplotlib.pyplot as plt
import polars as pl

import blitzbeaver as bb

In [None]:
# Directory containing the cleaned CSV files (named 1805.csv … 1810.csv).
csv_path = "../../aptihramy/data/csv_cleaned"

# One frame per file, in ascending order 1805 through 1810 inclusive.
dataframes = [
    pl.read_csv(f"{csv_path}/{year}.csv", infer_schema_length=10000)
    for year in range(1805, 1811)
]

In [None]:
# Load a previously saved tracking graph from a .beaver file.
# NOTE(review): a later cell recomputes `graph` and saves it to this same path,
# so this cell presumably only succeeds once that file already exists — confirm
# the intended run order on a fresh kernel.
path_graph = "../graph.beaver"

graph = bb.TrackingGraph.load(path_graph)

In [None]:
# Schema of the records handed to the tracking engine: six columns, each
# declared with the String element type, in the order listed below.
record_schema = bb.RecordSchema(
    [
        bb.FieldSchema(column, bb.ElementType.String)
        for column in (
            "nom_rue_norm",
            "chef_prenom_norm",
            "chef_nom_norm",
            "chef_origine",
            "epouse_nom",
            "chef_vocation",
        )
    ]
)

In [None]:
# Tracking engine configuration; it is validated immediately after being built.
config = bb.TrackingConfig(
    # NOTE(review): hard-coded thread count — confirm it suits the target machine.
    num_threads=17,
    tracker=bb.TrackerConfig(
        interest_threshold=0.4,
        memory_strategy="median",
        record_scorer=bb.RecordScorerConfig(
            record_scorer="weighted-quadratic",
            # Six weights, the same count as the fields declared in record_schema;
            # presumably one per field in declaration order — TODO confirm.
            # NOTE(review): the values sum to 0.95, not 1.0 — confirm this is intended.
            weights=[
                0.15,
                0.25,
                0.25,
                0.1,
                0.1,
                0.1,
            ],
        )
    ),
    distance_metric=bb.DistanceMetricConfig(
        metric="lvopti",
        caching_threshold=4,
    ),
    resolver=bb.ResolverConfig(
        resolving_strategy="best-match",
    ),
)
# Check the configuration before it is passed to the engine.
bb.validate_tracking_config(config)

In [None]:
# Run the tracking engine over the loaded frames and wrap its result in a
# TrackingGraph in one step, so `graph` is only ever bound to the wrapper.
graph = bb.TrackingGraph(
    bb.test_tracking_engine(config, record_schema, dataframes, "debug")
)

In [None]:
# Save the current graph to disk, at the same path the earlier load cell reads.
path_graph = "../graph.beaver"

graph.save(path_graph)

In [None]:
# Pick a single chain to inspect: element 0 of entry 678 in `root.outs`.
# NOTE(review): 678 is a hard-coded index and `_raw` is an underscore-prefixed
# attribute — confirm both are intended for use here.
chain_id = graph._raw.root.outs[678][0]

# Last expression of the cell, so its result is what the cell displays.
graph.materialize_tracking_chain(chain_id, dataframes)

In [None]:
# Report the number of entries in `root.outs` as the tracking chain count.
print(f"Tracking chains: {len(graph._raw.root.outs)}")

# Chain-length metrics first, then whole-graph properties; each result is
# printed right after it is computed.
chain_metrics = bb.evaluate_tracking_chain_length(graph._raw)
print(chain_metrics)

graph_metrics = bb.evaluate_tracking_graph_properties(graph._raw)
print(graph_metrics)

In [None]:
# Histogram taken from the chain-length metrics computed in the previous cell.
histogram: list[int] = chain_metrics.histogram

# Bar chart with one bar per histogram entry. Uses the explicit Figure/Axes
# interface and gives the figure a title and axis labels so it can be read on
# its own. `plt` comes from the notebook's top import cell.
# NOTE(review): the x axis is the position within `histogram`; presumably that
# position corresponds to a chain length — confirm against the metrics docs.
fig, ax = plt.subplots()
ax.bar(range(len(histogram)), histogram)
ax.set_title("Tracking chain length histogram")
ax.set_xlabel("Histogram index")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Four near-identical spellings used as sample input for the median-word helper.
words = [
    "magimelien",
    "mazimilien",
    "mazirelien",
    "marinelien",
]
# Last expression of the cell, so the computed result is displayed.
bb.compute_median_word(words)
