Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)
- [Geographic boundaries from MAPOG](https://gisdata.mapog.com/finland/administrative_boundaries_level8_polygon)

In [1]:
import io
import json
import os
import pandas as pd
import requests
import zipfile

In [2]:
# Vehicles data
# Download the Traficom vehicle register archive and unpack it into the
# current working directory.
url = "https://opendata.traficom.fi/Content/Ajoneuvorekisteri.zip"
# (connect, read) timeouts in seconds: without them a stalled connection would
# block the kernel indefinitely. The read timeout applies between received
# chunks, not to the whole download.
response = requests.get(url, timeout=(10, 300))
response.raise_for_status()
# The whole archive is held in memory before extraction.
zip_bytes = io.BytesIO(response.content)
with zipfile.ZipFile(zip_bytes) as z:
    z.extractall(os.getcwd())

In [3]:
# Municipalities data
# Fetch the kunta_1_20240101 municipality classification as JSON.
url = "https://data.stat.fi/api/classifications/v2/classifications/kunta_1_20240101/classificationItems?content=data&meta=max&lang=en&format=json"
# Fail fast instead of hanging forever if the API does not answer.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Municipality code -> first listed name of the item
# (presumably the English name, since the request asks for lang=en — confirm).
municipalities = {item["code"]: item["classificationItemNames"][0]["name"] for item in response.json()}

In [4]:
# Raw file
# Snapshot date of the data; exported with the results and used as the
# fallback year when deriving registration years.
data_date = "2024-03-31"
input_filename = "Ajoneuvojen_avoin_data_5_23.csv"
input_file = os.path.join(os.getcwd(), input_filename)

# Source column name -> column name used in the rest of the notebook.
columns = {
    "ensirekisterointipvm": "registration_date",
    "kayttoonottopvm": "intro_date",
    "ajoneuvoluokka": "classification",
    "vari": "color",
    "kayttovoima": "driving_force",
    "sahkohybridi": "is_hybrid",
    "merkkiSelvakielinen": "maker",
    "kunta": "municipality",
    "matkamittarilukema": "odometer",
}

df = pd.read_csv(
    input_file,
    sep=";",
    quotechar="'",
    encoding="latin",
    low_memory=False,
    memory_map=True,
    usecols=list(columns),
    # Code-like columns are read as text so they are never coerced to numbers.
    # "kayttoonottopvm" is included because later cells slice it and call
    # .startswith() on it, which only works on strings.
    dtype={
        "vari": str,
        "kayttovoima": str,
        "kunta": str,
        "sahkohybridi": str,
        "kayttoonottopvm": str,
    },
).rename(columns=columns)

# Filter to only M1 vehicle class
df = df[df["classification"] == "M1"].drop(columns="classification")

# Filter invalid municipalities (codes missing from the fetched classification)
df = df[df["municipality"].isin(list(municipalities))].reset_index(drop=True)

df.head(10)

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker,municipality,odometer
0,1984-07-09,19840000,1,1,,Ford,740,
1,1990-05-08,19900508,9,1,,Citroen,91,
2,2003-10-02,20031002,6,1,,Honda,837,284104.0
3,2006-03-17,20060317,Y,1,,Toyota,989,155944.0
4,2007-01-05,20070105,2,2,,Toyota,694,2692651.0
5,1996-03-14,19960314,5,1,,Nissan,777,262227.0
6,2003-07-01,20030701,8,1,,Honda,851,262915.0
7,2000-03-24,20000324,6,1,,Nissan,250,185968.0
8,2005-10-25,20051025,8,1,,BMW,755,205732.0
9,,19630000,9,1,,Ford,179,63519.0


In [5]:
# Clean up dates
# Strict YYYY-MM-DD parsing; values that do not match become NaT.
df["registration_date"] = pd.to_datetime(
    df["registration_date"],
    format="%Y-%m-%d",
    exact=True,
    yearfirst=True,
    cache=True,
    errors="coerce",
)

# Registration year
def get_registration_year(row):
    """Return the registration year for one vehicle row.

    The year of ``registration_date`` is used when that value is a valid
    timestamp. Otherwise the first four characters of ``intro_date`` are
    used, unless that string starts with "0", in which case the year of
    the module-level ``data_date`` is returned. When both fields are
    missing the function returns ``None``.
    """
    registered = row["registration_date"]
    if pd.notna(registered) and isinstance(registered, pd.Timestamp):
        return registered.year

    introduced = row["intro_date"]
    if pd.isna(introduced):
        return None

    # A leading zero means there is no usable year; fall back to the data date.
    year_text = data_date[:4] if introduced.startswith("0") else introduced[:4]
    return int(year_text)

# Derive the year row by row (it can come from either of two columns) and
# store it as a nullable 16-bit integer.
df["registration_year"] = df.apply(get_registration_year, axis=1).astype("Int16")

# Driving power grouping
# True only where the raw value is the string "true"; everything else,
# including missing values, becomes False.
# NOTE: not idempotent — once the column holds booleans, running this again
# turns every value into False.
df["is_hybrid"] = df["is_hybrid"].eq("true")

def group_driving_force(row):
    """Map one vehicle row to a numeric driving-force group.

    Returns 3 (hybrid) whenever ``is_hybrid`` is truthy, regardless of the
    fuel code. Otherwise the ``driving_force`` code decides: "01" -> 1
    (petrol), "02" -> 2 (diesel), "04" -> 4 (electricity), anything
    else -> 5 (other).
    """
    if row["is_hybrid"]:
        return 3  # Hybrid

    group_by_code = {
        "01": 1,  # Petrol
        "02": 2,  # Diesel
        "04": 4,  # Electricity
    }
    return group_by_code.get(row["driving_force"], 5)  # 5 = Other
    
# Overwrite the raw fuel code with its numeric group. Done row by row because
# the grouping also reads the hybrid flag of the same row.
# NOTE: not idempotent — on a second run the column already holds the numeric
# groups, none of which equal the raw string codes, so every non-hybrid row
# would be mapped to 5.
df["driving_force"] = df.apply(group_driving_force, axis=1)

# Color grouping
def group_color(row):
    """Translate the raw ``color`` code of a row into a colour-group name.

    Codes without an entry in the lookup table (including missing values)
    are reported as "other".
    """
    name_by_code = {
        "0": "black",
        "1": "brown",
        "2": "red",
        "5": "green",
        "6": "blue",
        "8": "grey",
        "9": "white",
        "Y": "silver",
    }
    return name_by_code.get(row["color"], "other")
    
# Overwrite the raw colour code with its colour-group name.
# NOTE: not idempotent — on a second run the column already holds group names,
# none of which equal the raw codes, so every row would become "other".
df["color"] = df.apply(group_color, axis=1)

# Odometer
# Nullable 32-bit integers; missing readings are replaced with 0.
df["odometer"] = df["odometer"].astype("Int32").fillna(0)

# NA check
# Collect rows that lack a value in any column the aggregations depend on.
check_columns = ["municipality", "color", "driving_force", "registration_year"]
na_rows = df[df[check_columns].isna().any(axis=1)]

display(df.dtypes)
display(df.head(10))
# Single quotes inside the f-strings: reusing the outer quote character is a
# SyntaxError on Python versions before 3.12.
display(f"Min year: {df['registration_year'].min()}", f"Max year: {df['registration_year'].max()}")
display(na_rows)

registration_date    datetime64[ns]
intro_date                   object
color                        object
driving_force                 int64
is_hybrid                      bool
maker                        object
municipality                 object
odometer                      Int32
registration_year             Int16
dtype: object

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker,municipality,odometer,registration_year
0,1984-07-09,19840000,brown,1,False,Ford,740,0,1984
1,1990-05-08,19900508,white,1,False,Citroen,91,0,1990
2,2003-10-02,20031002,blue,1,False,Honda,837,284104,2003
3,2006-03-17,20060317,silver,1,False,Toyota,989,155944,2006
4,2007-01-05,20070105,red,2,False,Toyota,694,2692651,2007
5,1996-03-14,19960314,green,1,False,Nissan,777,262227,1996
6,2003-07-01,20030701,grey,1,False,Honda,851,262915,2003
7,2000-03-24,20000324,blue,1,False,Nissan,250,185968,2000
8,2005-10-25,20051025,grey,1,False,BMW,755,205732,2005
9,NaT,19630000,white,1,False,Ford,179,63519,1963


'Min year: 1918'

'Max year: 2024'

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker,municipality,odometer,registration_year


In [6]:
# Vehicle counts per (driving force, municipality) pair.
grouped_driving = df.groupby(["driving_force", "municipality"]).size().reset_index(name="count")
# Total number of vehicles; also the denominator for the shares in later cells.
total = grouped_driving["count"].sum()
# Percentage of all vehicles, computed on the whole column at once.
grouped_driving["share"] = grouped_driving["count"] / total * 100

# Collapse the municipalities to get one row per driving force.
driving_grouped = grouped_driving.groupby(["driving_force"]).agg({"count": "sum", "share": "sum"}).reset_index()
driving_totals = pd.DataFrame({
    "driving_force": ["total"],
    "count": [driving_grouped["count"].sum()],
    "share":  [driving_grouped["share"].sum()]
})
# Append the totals row and renumber the index in one step.
driving = pd.concat([driving_grouped, driving_totals], ignore_index=True)

def driving_force_text(x):
    """Return the label for a numeric driving-force group.

    Groups 1-5 map to "petrol", "diesel", "hybrid", "electricity" and
    "other". Any other value (for example the "total" row label) is
    returned unchanged.
    """
    labels = {
        1: "petrol",
        2: "diesel",
        3: "hybrid",
        4: "electricity",
        5: "other",
    }
    return labels.get(x, x)
    
def format_with_whitespace(value):
    """Format a number with spaces as thousands separators.

    Floats that hold a whole number are shown without a fractional part.
    """
    is_whole_float = isinstance(value, float) and value.is_integer()
    number = int(value) if is_whole_float else value
    return format(number, ",").replace(",", " ")
    
# Styled table: text labels for the groups, space-separated counts and
# shares rounded to two decimals.
disp = (
    driving.style
    .format({
        "driving_force": driving_force_text,
        "count": format_with_whitespace,
        "share": lambda x: f"{round(x, 2)}%",
    })
    .set_caption("Driving forces")
)
# Last expression of the cell: render the table without the index column.
disp.hide(axis="index")

driving_force,count,share
petrol,1 630 466,61.99%
diesel,615 591,23.4%
hybrid,272 500,10.36%
electricity,90 662,3.45%
other,20 967,0.8%
total,2 630 186,100.0%


In [7]:
# Vehicle counts per (colour, municipality) pair.
grouped_color = df.groupby(["color", "municipality"]).size().reset_index(name="count")
# Percentage of all vehicles; `total` comes from the driving-force cell.
grouped_color["share"] = grouped_color["count"] / total * 100

# Collapse the municipalities to get one row per colour.
color_grouped = grouped_color.groupby(["color"]).agg({"count": "sum", "share": "sum"}).reset_index()
color_totals = pd.DataFrame({
    "color": ["total"],
    "count": [color_grouped["count"].sum()],
    "share":  [color_grouped["share"].sum()]
})
# Append the totals row and renumber the index in one step.
color = pd.concat([color_grouped, color_totals], ignore_index=True)

disp = color.style.format({
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Colors")
disp.hide(axis="index")

color,count,share
black,397 497,15.11%
blue,344 991,13.12%
brown,179 216,6.81%
green,93 356,3.55%
grey,569 324,21.65%
other,63 793,2.43%
red,357 474,13.59%
silver,201 406,7.66%
white,423 129,16.09%
total,2 630 186,100.0%


In [8]:
# Vehicle counts per (registration year, municipality) pair.
grouped_year = df.groupby(["registration_year", "municipality"]).size().reset_index(name="count")
# Percentage of all vehicles; `total` comes from the driving-force cell.
grouped_year["share"] = grouped_year["count"] / total * 100

# Collapse the municipalities to get one row per registration year.
year_grouped = grouped_year.groupby(["registration_year"]).agg({"count": "sum", "share": "sum"}).reset_index()
year_totals = pd.DataFrame({
    "registration_year": ["total"],
    "count": [year_grouped["count"].sum()],
    "share":  [year_grouped["share"].sum()]
})
# Append the totals row and renumber the index in one step.
registration_year = pd.concat([year_grouped, year_totals], ignore_index=True)

# Show only the first five and last five rows (the last one is the totals row).
disp = pd.concat([registration_year.head(5), registration_year.tail(5)]).style.format({
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Registration years")
disp.hide(axis="index")

registration_year,count,share
1918,1,0.0%
1922,1,0.0%
1923,2,0.0%
1924,1,0.0%
1925,1,0.0%
2021,122 965,4.68%
2022,107 673,4.09%
2023,113 912,4.33%
2024,25 415,0.97%
total,2 630 186,100.0%


In [9]:
# Every driving-force group and colour seen anywhere in the data, used to
# zero-fill municipalities where a category does not occur.
driving_forces = set(grouped_driving["driving_force"])
colors = set(grouped_color["color"])
# NOTE(review): `years` is not used anywhere in the visible cells.
years = set(grouped_year["registration_year"])

final = []
for municipality_code, group in grouped_driving.groupby("municipality"):
    # Driving forces
    driving_force_counts = dict(zip(group["driving_force"], group["count"]))
    # Loop variables are named so they do not overwrite the `color` summary
    # table built in the colour cell.
    for force_code in driving_forces:
        driving_force_counts.setdefault(force_code, 0)

    # Colors: zero-fill, then order the keys alphabetically.
    color_group = grouped_color[grouped_color["municipality"] == municipality_code]
    color_counts = dict(zip(color_group["color"], color_group["count"]))
    for color_name in colors:
        color_counts.setdefault(color_name, 0)
    color_counts = dict(sorted(color_counts.items()))

    # Registration years: only years present in this municipality are
    # written. Keys are converted to strings before serialisation.
    year_group = grouped_year[grouped_year["municipality"] == municipality_code]
    year_counts = dict(zip(year_group["registration_year"], year_group["count"]))
    year_counts_str = {str(year): count for year, count in year_counts.items()}

    final.append({
        "code": municipality_code,
        "name": municipalities[municipality_code],
        "countByDrivingForce": driving_force_counts,
        "countByColor": color_counts,
        "countByRegistrationYear": year_counts_str,
    })

# Output goes to ../src/assets relative to the working directory; the
# directory is not created here and must already exist.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src", "assets"))

data = {
    "date": data_date,
    "municipalities": final
}

with open(os.path.join(parent_dir, "data.json"), "w", encoding="utf-8") as jf:
    jf.write(json.dumps(data, indent=4, ensure_ascii=False))