Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)

In [None]:
# Municipalities data
from src.processors.imports import get_municipalities

# Load the municipality lookup once; later cells reuse it for cleaning and
# validation (presumably keyed by municipality code — confirm in
# src.processors.imports).
municipalities = get_municipalities()
display(municipalities)

In [None]:
# Vehicles data
from src.processors.imports import get_vehicles

# Load the raw vehicle registry data that the following cells clean and export.
vehicles = get_vehicles()

# Quick inspection of the raw frame: column dtypes, (rows, columns), first rows.
for overview in (vehicles.dtypes, vehicles.shape, vehicles.head(10)):
    display(overview)

In [None]:
import copy
from src.processors.preprocesses import clean

# Work on copies so this cell can be re-run without reloading the raw inputs.
df = vehicles.copy(deep=True)
mun = copy.deepcopy(municipalities)

df = clean(df, mun)

# Check which high count Other labeled makers are missing from mapping.
# value_counts() already covers every non-null maker_text among the "Other"
# rows, so the ten largest counts can be taken from it directly; filtering
# the rows by those same labels and counting again yields the same result.
other_makers = df[df["maker"] == "Other"]
maker_counts = other_makers["maker_text"].value_counts()
top_other_makers = maker_counts.nlargest(10)
display(top_other_makers)

# Renumber rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

# NA check: rows with a missing value in any column.
na_rows = df[df.isna().any(axis=1)]

# Municipality check: codes present in the data but absent from the lookup.
df_mun_codes = set(df["municipality"].unique())
mun_codes = set(mun.keys())
missing = df_mun_codes - mun_codes
display(f"Municipalities mismatch with: {list(missing)}")

# Sanity checks
display(df.dtypes)
display(df.shape)  # Should be the same size as before cleaning
display(f"Min year: {df['registration_year'].min()}", f"Max year: {df['registration_year'].max()}")
display(f"Min mileage: {df['mileage'].min()}", f"Max mileage: {df['mileage'].max()}")
display(na_rows)  # Should have none

In [None]:
# Final data
import json
import os
from src.processors.utils import get_date
from src.processors.postprocesses import generate
from src.processors.validations import validate

# Build the final structure from the cleaned data and validate it
# against the municipality lookup before anything is written to disk.
date = get_date()
final = generate(df, mun, date)
valid = validate(final, mun)

# Only persist output that passed validation.
if valid:
    path = os.path.join(os.getcwd(), "data.json")
    # Serialize first so a serialization error cannot leave a truncated
    # data.json behind.
    payload = json.dumps(final, indent=2, ensure_ascii=False)
    # Mode "w" truncates an existing file, so no explicit delete is needed.
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)

# Quick look at the generated structure (shown whether or not it was saved).
display(final.keys())
display(final["municipalities"][0])