# Big Data Grab

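This notebook grabs all of the Dakar 2025 data available to date through the API client, caches the raw responses, and loads the resulting tables into a SQLite database.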

In [1]:
from dakar_rallydj.getter import DakarAPIClient
from sqlite_utils import Database
import pandas as pd

def custom_key_fn(request, ignored_parameters=None, match_headers=False, serializer=None, **request_kwargs):
    """Build the cache key for ``request`` from its URL.

    The key is the part of ``request.url`` that follows the last ``"api/"``;
    a URL that does not contain ``"api/"`` is used unchanged. The remaining
    parameters are accepted but not used.
    """
    _, _, url_tail = request.url.rpartition("api/")
    return url_tail

# Client options: cache responses in a SQLite-backed store named
# "dakar_cache_2025", keyed by custom_key_fn.
cache_options = {
    "use_cache": True,
    "backend": "sqlite",
    "cache_name": "dakar_cache_2025",
    "expire_after": -1,  # Never expire
    "key_fn": custom_key_fn,
}

dakar = DakarAPIClient(**cache_options)

In [2]:
# Call every endpoint once. The return values are discarded here; presumably
# the point is to get each response stored in the client's cache.
all_clazz = ["A", "F", "K", "M"]
all_stages = range(1, 13)

# Event-wide lookups.
dakar.get_category()
dakar.get_groups()
dakar.get_clazz(category=all_clazz)
dakar.get_withdrawals(category=["A", "K", "M"])

# Stage-level data, one category at a time.
for category_code in ("A", "M"):
    dakar.get_stages(category=category_code)
    for stage_number in all_stages:
        dakar.get_waypoints(category=category_code, stage=stage_number)
        dakar.get_scores(category=category_code, stage=stage_number)

In [3]:
# Open the SQLite file written by the API client's cache
# (cache_name='dakar_cache_2025', backend='sqlite').
db = Database("dakar_cache_2025.sqlite")

# List the tables the cache created.
db.table_names()

['responses', 'redirects']

In [4]:
# Peek at the first dozen cached responses. Each row's 'key' is the plain URL
# tail produced by custom_key_fn (e.g. 'category-2025'), not a hash; 'value'
# is a binary blob.
for position, row in enumerate(db["responses"].rows, start=1):
    print(row)
    if position >= 12:
        break

{'key': 'category-2025', 'value': b'\x80\x04\x95P\x0e\x00\x00\x00\x00\x00\x00}\x94(\x8c\x08_content\x94Bs\n\x00\x00[{"refueling":0,"label":"F","shortLabel":"cat.name.F","updatedAt":"2025-01-06T12:12:00+01:00","kmGoal":1000,"mapDisplay":true,"liveDisplay":false,"categoryLangs":[{"locale":"en","text":"DAKAR FUTURE MISSION 1000","variable":"cat.name.F"},{"variable":"cat.name.F","text":"DAKAR FUTURE MISSION 1000","locale":"fr"},{"locale":"es","variable":"cat.name.F","text":"DAKAR FUTURE MISSION 1000"},{"variable":"cat.name.F","locale":"ar","text":"DAKAR FUTURE MISSION 1000"}],"reference":"2025-F","lastStage":12,"promotionalDisplay":true,"position":8,"_bind":"category-2025","_origin":"category-2025","_id":"0a0fb4dfc03bcb6b0e9c940b5a09ae05","_key":"label","_updatedAt":1737366461403,"_parent":"millesime:28a3b2a601a2028494b004bb95233853","isPointCat":true},{"shortLabel":"cat.name.A","label":"A","mapDisplay":true,"liveDisplay":true,"updatedAt":"2025-01-06T12:11:14+01:00","promotionalDisplay":tr

## Adding the Data to a Database

A naive way of adding the data to a database is to aggregate the dataframes generated by each API call and then write them to the database using the *pandas* `.to_sql()` method. If a table does not already exist, `.to_sql()` creates it.

In [5]:
def _stack_frames(frames):
    """Concatenate a list of DataFrames into one frame with a fresh 0..n-1 index.

    Returns an empty DataFrame when the list is empty, because ``pd.concat``
    raises ``ValueError`` when given nothing to concatenate.
    """
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


all_clazz = ["A", "F", "K", "M"]
all_stages = range(1, 13)

# Event-wide tables: one call each.
category_df = dakar.get_category()
groups_df = dakar.get_groups()
clazz_df = dakar.get_clazz(category=["A", "K", "M", "F"])
withdrawals_df, withdrawn_competitors_df, withdrawn_teams_df = dakar.get_withdrawals(
    category=["A", "K", "M"])

# Per-category / per-stage tables: collect the frame returned by every call in
# a list and concatenate once afterwards, instead of re-concatenating onto a
# growing frame inside the loop.
(stages_parts, sectors_parts, stage_surfaces_parts, section_surfaces_parts,
 surfaces_parts, waypoints_parts, long_results_parts, long_results2_parts,
 results_teams_parts, results_competitors_parts) = ([] for _ in range(10))

for c in ["A", "M"]:
    (_stages, _sectors, _stage_surfaces,
     _section_surfaces, _surfaces) = dakar.get_stages(category=c)
    stages_parts.append(_stages)
    sectors_parts.append(_sectors)
    stage_surfaces_parts.append(_stage_surfaces)
    section_surfaces_parts.append(_section_surfaces)
    surfaces_parts.append(_surfaces)

    for s in all_stages:
        waypoints_parts.append(dakar.get_waypoints(category=c, stage=s))

        (_long_results, _long_results2,
         _results_teams, _results_competitors) = dakar.get_scores(category=c, stage=s)
        long_results_parts.append(_long_results)
        # Previously this frame was concatenated with itself, so only the last
        # category/stage survived (duplicated). It is now accumulated like the others.
        long_results2_parts.append(_long_results2)
        results_teams_parts.append(_results_teams)
        results_competitors_parts.append(_results_competitors)

stages_df = _stack_frames(stages_parts)
sectors_df = _stack_frames(sectors_parts)
stage_surfaces_df = _stack_frames(stage_surfaces_parts)
section_surfaces_df = _stack_frames(section_surfaces_parts)
surfaces_df = _stack_frames(surfaces_parts)
waypoints_df = _stack_frames(waypoints_parts)
long_results_df = _stack_frames(long_results_parts)
long_results2_df = _stack_frames(long_results2_parts)
results_teams_df = _stack_frames(results_teams_parts)
results_competitors_df = _stack_frames(results_competitors_parts)

In [6]:
from dakar_rallydj.enrichers import derive_clazz_metadata

# Drop rows that are exact duplicates of an earlier row, table by table.
# Event-wide tables first.
category_df = category_df.drop_duplicates()
groups_df = groups_df.drop_duplicates()
clazz_df = clazz_df.drop_duplicates()
withdrawals_df = withdrawals_df.drop_duplicates()
withdrawn_competitors_df = withdrawn_competitors_df.drop_duplicates()
withdrawn_teams_df = withdrawn_teams_df.drop_duplicates()

# Then the tables assembled across categories and stages.
stages_df = stages_df.drop_duplicates()
sectors_df = sectors_df.drop_duplicates()
stage_surfaces_df = stage_surfaces_df.drop_duplicates()
section_surfaces_df = section_surfaces_df.drop_duplicates()
surfaces_df = surfaces_df.drop_duplicates()
waypoints_df = waypoints_df.drop_duplicates()
long_results_df = long_results_df.drop_duplicates()
long_results2_df = long_results2_df.drop_duplicates()
results_teams_df = results_teams_df.drop_duplicates()
results_competitors_df = results_competitors_df.drop_duplicates()

# Pass the two team tables through derive_clazz_metadata together with the
# class and group lookup tables.
# NOTE(review): presumably this adds class/group columns to each team row;
# confirm against dakar_rallydj.enrichers.
# The results are bound back to the same names, so re-running this cell feeds
# already-derived frames into derive_clazz_metadata again.
withdrawn_teams_df = derive_clazz_metadata(
    withdrawn_teams_df, clazz_df, groups_df)
results_teams_df = derive_clazz_metadata(results_teams_df, clazz_df, groups_df)

In [7]:
# Replace "." in column names with "_" before the frames are written to SQLite.
# regex=False is explicit so the dot is always a literal character: the default
# for `regex` differs between pandas versions, and as a regex "." matches any
# character.
for df in [category_df, groups_df, clazz_df, withdrawals_df, withdrawn_competitors_df, withdrawn_teams_df, stages_df, sectors_df, stage_surfaces_df, section_surfaces_df, surfaces_df, waypoints_df, long_results_df, long_results2_df, results_teams_df, results_competitors_df]:
    df.columns = df.columns.str.replace('.', '_', regex=False)

In [8]:
db = Database("dakar_results_2025.sqlite")

# Table name -> DataFrame, listed in the order the tables are written.
tables_to_write = {
    'category': category_df,
    'groups': groups_df,
    'clazz': clazz_df,
    'withdrawals': withdrawals_df,
    'withdrawn_competitors': withdrawn_competitors_df,
    'withdrawn_teams': withdrawn_teams_df,
    'stages': stages_df,
    'sectors': sectors_df,
    'stage_surfaces': stage_surfaces_df,
    'section_surfaces': section_surfaces_df,
    'surfaces': surfaces_df,
    'waypoints': waypoints_df,
    'long_results': long_results_df,
    'long_results2': long_results2_df,
    'results_teams': results_teams_df,
    'results_competitors': results_competitors_df,
}

# Insert each DataFrame into SQLite, replacing the table if it already exists.
for table_name, frame in tables_to_write.items():
    frame.to_sql(table_name, db.conn, if_exists='replace', index=False)

In [9]:
# Show the tables (and their columns) now present in dakar_results_2025.sqlite.
db.tables

[<Table category (refueling, label, shortLabel, updatedAt, kmGoal, mapDisplay, liveDisplay, reference, lastStage, promotionalDisplay, position, _bind, _origin, _id, _key, _updatedAt, _parent, isPointCat, ar, en, es, fr)>,
 <Table groups (shortLabel, promotionalDisplay, tinyLabel, label, reference, position, _bind, _origin, _id, _parent, color, ar, en, es, fr)>,
 <Table clazz (label, promotionalDisplay, refueling, shortLabel, position, reference, _bind, _id, _parent, $group, tinyLabel, color, ar, en, es, fr, category, categoryClazz)>,
 <Table withdrawals (stage, bib, reason, _category)>,
 <Table withdrawn_competitors (bib, name, firstName, lastName, role, gender, nationality, profil, profil_sm, podium, aid)>,
 <Table withdrawn_teams (team_bib, reference, categoryClazz, clazz_label, tinyLabel, label, color, group_label)>,
 <Table stages (stage_code, stage, date, startDate, endDate, isCancelled, generalDisplay, isDelayed, marathon, length, type, timezone, stageWithBonus, mapCategoryDispla

In [10]:
# Print the CREATE TABLE statements for the results database.
print(db.schema)

CREATE TABLE "category" (
"refueling" INTEGER,
  "label" TEXT,
  "shortLabel" TEXT,
  "updatedAt" TEXT,
  "kmGoal" REAL,
  "mapDisplay" INTEGER,
  "liveDisplay" INTEGER,
  "reference" TEXT,
  "lastStage" INTEGER,
  "promotionalDisplay" INTEGER,
  "position" INTEGER,
  "_bind" TEXT,
  "_origin" TEXT,
  "_id" TEXT,
  "_key" TEXT,
  "_updatedAt" INTEGER,
  "_parent" TEXT,
  "isPointCat" REAL,
  "ar" TEXT,
  "en" TEXT,
  "es" TEXT,
  "fr" TEXT
);
CREATE TABLE "groups" (
"shortLabel" TEXT,
  "promotionalDisplay" INTEGER,
  "tinyLabel" TEXT,
  "label" TEXT,
  "reference" TEXT,
  "position" INTEGER,
  "_bind" TEXT,
  "_origin" TEXT,
  "_id" TEXT,
  "_parent" TEXT,
  "color" TEXT,
  "ar" TEXT,
  "en" TEXT,
  "es" TEXT,
  "fr" TEXT
);
CREATE TABLE "clazz" (
"label" TEXT,
  "promotionalDisplay" INTEGER,
  "refueling" INTEGER,
  "shortLabel" TEXT,
  "position" INTEGER,
  "reference" TEXT,
  "_bind" TEXT,
  "_id" TEXT,
  "_parent" TEXT,
  "$group" TEXT,
  "tinyLabel" TEXT,
  "color" TEXT,
  "ar" T