Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)
- [Geographic from MAPOG](https://gisdata.mapog.com/finland/administrative_boundaries_level8_polygon)

In [1]:
import io
import json
import numpy as np
import os
import pandas as pd
import requests
import zipfile

In [2]:
# Vehicles data
url = "https://opendata.traficom.fi/Content/Ajoneuvorekisteri.zip"
response = requests.get(url)
response.raise_for_status()
zip_bytes = io.BytesIO(response.content)
with zipfile.ZipFile(zip_bytes) as z:
    z.extractall(os.getcwd())

In [3]:
# Municipalities data
url = "https://data.stat.fi/api/classifications/v2/classifications/kunta_1_20240101/classificationItems?content=data&meta=max&lang=en&format=json"
response = requests.get(url)
response.raise_for_status()
municipalities = {item["code"]: item["classificationItemNames"][0]["name"] for item in response.json()}

In [4]:
# Raw file
data_date = "2024-03-31"
input_filename = "Ajoneuvojen_avoin_data_5_23.csv"
input_file = os.path.join(os.getcwd(), input_filename)

column_map = {
    "ensirekisterointipvm": "registration_date",
    "kayttoonottopvm": "intro_date",
    "ajoneuvoluokka": "classification",
    "vari": "color",
    "kayttovoima": "driving_force",
    "sahkohybridi": "is_hybrid",
    "merkkiSelvakielinen": "maker",
    "kunta": "municipality",
    "matkamittarilukema": "odometer",
}

csv = pd.read_csv(
    input_file,
    sep=";",
    quotechar="'",
    encoding="latin",
    low_memory=False,
    memory_map=True,
    usecols=column_map.keys(),
    dtype={"vari": str, "kayttovoima": str, "kunta": str, "sahkohybridi": str, "merkkiSelvakielinen": str},
)

csv.rename(mapper=column_map, axis=1, inplace=True)

# Filter to only M1 vehicle class
csv = csv[csv["classification"] == "M1"]
csv.drop(labels="classification", axis=1, inplace=True)

csv.reset_index(inplace=True, drop=True)

display(csv.dtypes)
display(csv.shape)
display(csv.head(10))

registration_date    object
intro_date           object
color                object
driving_force        object
is_hybrid            object
maker                object
municipality         object
odometer             object
dtype: object

(2638554, 8)

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker,municipality,odometer
0,1984-07-09,19840000,1,1,,Ford,740,
1,1990-05-08,19900508,9,1,,Citroen,91,
2,2003-10-02,20031002,6,1,,Honda,837,284104.0
3,2006-03-17,20060317,Y,1,,Toyota,989,155944.0
4,2007-01-05,20070105,2,2,,Toyota,694,2692651.0
5,1996-03-14,19960314,5,1,,Nissan,777,262227.0
6,2003-07-01,20030701,8,1,,Honda,851,262915.0
7,2000-03-24,20000324,6,1,,Nissan,250,185968.0
8,2005-10-25,20051025,8,1,,BMW,755,205732.0
9,,19630000,9,1,,Ford,179,63519.0


In [5]:
df = csv.copy(deep=True)
# Clean up dates
df["registration_date"] = pd.to_datetime(df["registration_date"], format="%Y-%m-%d",
                                         cache=True, exact=True, yearfirst=True, errors="coerce")

# Registration year
def get_registration_year(row):
    if pd.notna(row["registration_date"]) and isinstance(row["registration_date"], pd.Timestamp):
        return row["registration_date"].year
    
    if pd.notna(row["intro_date"]):
        if row["intro_date"].startswith("0"):
            return int(data_date[:4])
        return int(row["intro_date"][:4])

df["registration_year"] = df.apply(get_registration_year, axis=1).astype("Int16")

# Municipalities, map unmatched to 999 Unknown
municipalities["999"] = "Unknown"
df["municipality"] = np.where(df["municipality"].isin(municipalities.keys()), df["municipality"], "999")

# Filter really old
df = df[df["registration_year"] >= 1980]

# Driving force grouping
df["is_hybrid"] = df["is_hybrid"].map(lambda x: x == "true")

def group_driving_force(row):
    if row["is_hybrid"]:
        return 3 # Hybrid
    
    if row["driving_force"] == "01":
        return 1 # Petrol
    elif row["driving_force"] == "02":
        return 2 # Diesel
    elif row["driving_force"] == "04":
        return 4 # Electricity
    else:
        return 5 # Other
    
df["driving_force"] = df.apply(group_driving_force, axis=1).astype(str)

# Color grouping
def group_color(row):
    color_map = {
        "0": "black",
        "1": "brown",
        "2": "red",
        "5": "green",
        "6": "blue",
        "8": "grey",
        "9": "white",
        "Y": "silver"
    }
    return color_map.get(row["color"], "other")
    
df["color"] = df.apply(group_color, axis=1)

# Odometer
df["odometer"] = df["odometer"].astype("Int32", errors="ignore").fillna(0)

# Makers more unique
df["maker"] = df["maker"].str.lower().fillna("")

# Makers grouping
def group_maker(row):
    maker_map = {
        "alfa": "Alfa Romeo",
        "alfa romeo": "Alfa Romeo",
        "aston martin": "Aston Martin",
        "audi": "Audi",
        "bmw": "BMW",
        "chevrolet": "Chevrolet",
        "chrysler": "Chrysler",
        "citroen": "Citroën",
        "dodge": "Dodge",
        "ferrari": "Ferrari",
        "fiat": "Fiat",
        "ford": "Ford",
        "honda": "Honda",
        "hyundai": "Hyundai",
        "jaguar": "Jaguar",
        "jeep": "Jeep",
        "kia": "Kia",
        "lada": "Lada",
        "lamborghini": "Lamborghini",
        "land drover": "Land Drover",
        "Maserati": "Maserati",
        "mazda": "Mazda",
        "mercedes": "Mercedes-Benz",
        "mitsubishi": "Mitsubishi",
        "nissan": "Nissan",
        "opel": "Opel",
        "peugeot": "Peugeot",
        "porsche": "Porsche",
        "polestar": "Polestar",
        "renault": "Renault",
        "saab": "Saab",
        "seat": "Seat",
        "skoda": "Škoda",
        "subaru": "Subaru",
        "suzuki": "Suzuki",
        "tesla": "Tesla",
        "toyota": "Toyota",
        "volkswagen": "Volkswagen",
        "vw": "Volkswagen",
        "volvo": "Volvo"
    }
    for prefix, target in maker_map.items():
        if row["maker"].startswith(prefix):
            return target
    return "Other"

df["maker_new"] = df.apply(group_maker, axis=1)

# other_makers = df[df["maker_new"] == "Other"]
# display(other_makers)

# Drop obsolete columns
df.drop(labels=["registration_date", "intro_date", "maker"], axis=1, inplace=True)
df.rename(columns={"maker_new": "maker"}, inplace=True)
df.reset_index(inplace=True, drop=True)

# NA check
check_columns = ["municipality", "color", "driving_force", "registration_year", "maker"]
na_rows = df[df[check_columns].isna().any(axis=1)]

# Sanity checks
display(df.dtypes)
display(df.shape)
display(f"Min year: {df["registration_year"].min()}", f"Max year: {df["registration_year"].max()}")
display(na_rows)

color                object
driving_force        object
is_hybrid              bool
municipality         object
odometer             object
registration_year     Int16
maker                object
dtype: object

(2626855, 7)

'Min year: 1980'

'Max year: 2024'

Unnamed: 0,color,driving_force,is_hybrid,municipality,odometer,registration_year,maker


In [6]:
grouped_driving = df.groupby(["driving_force", "municipality"]).size().reset_index(name="count")
total = grouped_driving["count"].sum()
grouped_driving["share"] = grouped_driving.apply(lambda row: row["count"] / total * 100, axis=1)

driving_grouped = grouped_driving.groupby(["driving_force"]).agg({"count": "sum", "share": "sum"}).reset_index()
driving_totals = pd.DataFrame({
    "driving_force": ["total"],
    "count": [driving_grouped["count"].sum()],
    "share":  [driving_grouped["share"].sum()]
})
driving = pd.concat([driving_grouped, driving_totals], ignore_index=False).reset_index(drop=True)

def driving_force_text(x):
    driving_force_map = {
        1: "petrol",
        2: "diesel",
        3: "hybrid",
        4: "electricity",
        5: "other"
    }
    return driving_force_map.get(x, x)
    
def format_with_whitespace(value):
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    formatted_value = f"{value:,}".replace(',', ' ')
    return formatted_value
    
disp = driving.style.format({
    "driving_force": driving_force_text,
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Driving forces")
disp.hide(axis="index")

driving_force,count,share
1,1 624 524,61.84%
2,617 486,23.51%
3,273 001,10.39%
4,90 822,3.46%
5,21 022,0.8%
total,2 626 855,100.0%


In [7]:
grouped_color = df.groupby(["color", "municipality"]).size().reset_index(name="count")
grouped_color["share"] = grouped_color.apply(lambda row: row["count"] / total * 100, axis=1)

color_grouped = grouped_color.groupby(["color"]).agg({"count": "sum", "share": "sum"}).reset_index()
color_totals = pd.DataFrame({
    "color": ["total"],
    "count": [color_grouped["count"].sum()],
    "share":  [color_grouped["share"].sum()]
})
color = pd.concat([color_grouped, color_totals], ignore_index=False).reset_index(drop=True)

disp = color.style.format({
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Colors")
disp.hide(axis="index")

color,count,share
black,398 201,15.16%
blue,344 127,13.1%
brown,178 522,6.8%
green,92 335,3.52%
grey,570 186,21.71%
other,62 492,2.38%
red,356 261,13.56%
silver,201 845,7.68%
white,422 886,16.1%
total,2 626 855,100.0%


In [8]:
grouped_year = df.groupby(["registration_year", "municipality"]).size().reset_index(name="count")
grouped_year["share"] = grouped_year.apply(lambda row: row["count"] / total * 100, axis=1)

year_grouped = grouped_year.groupby(["registration_year"]).agg({"count": "sum", "share": "sum"}).reset_index()
year_totals = pd.DataFrame({
    "registration_year": ["total"],
    "count": [year_grouped["count"].sum()],
    "share":  [year_grouped["share"].sum()]
})
registration_year = pd.concat([year_grouped, year_totals], ignore_index=False).reset_index(drop=True)

disp = pd.concat([registration_year.head(5), registration_year.tail(5)]).style.format({
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Registration years")
disp.hide(axis="index")

registration_year,count,share
1980,1 161,0.04%
1981,1 416,0.05%
1982,2 528,0.1%
1983,3 370,0.13%
1984,4 605,0.18%
2021,123 224,4.69%
2022,107 917,4.11%
2023,114 138,4.35%
2024,25 476,0.97%
total,2 626 855,100.0%


In [9]:
grouped_maker = df.groupby(["maker", "municipality"]).size().reset_index(name="count")
grouped_maker["share"] = grouped_maker.apply(lambda row: row["count"] / total * 100, axis=1)

maker_grouped = grouped_maker.groupby(["maker"]).agg({"count": "sum", "share": "sum"}).reset_index()
maker_totals = pd.DataFrame({
    "maker": ["total"],
    "count": [maker_grouped["count"].sum()],
    "share":  [maker_grouped["share"].sum()]
})
maker = pd.concat([maker_grouped, maker_totals], ignore_index=False).reset_index(drop=True)

disp = pd.concat([maker.head(5), maker.tail(5)]).style.format({
    "count": format_with_whitespace,
    "share": lambda x: f"{round(x, 2)}%"
})
disp.set_caption("Makers")
disp.hide(axis="index")

maker,count,share
Alfa Romeo,3 430,0.13%
Aston Martin,32,0.0%
Audi,116 879,4.45%
BMW,119 959,4.57%
Chevrolet,10 629,0.4%
Toyota,388 255,14.78%
Volkswagen,284 428,10.83%
Volvo,208 859,7.95%
Škoda,162 572,6.19%
total,2 626 855,100.0%


In [10]:
# Counts for municipalities
driving_forces = set(grouped_driving["driving_force"])
colors = set(grouped_color["color"])
years = set(grouped_year["registration_year"])
makers = set(grouped_maker["maker"])

final = []
for municipality_code, group in grouped_driving.groupby("municipality"):
    # Driving forces
    driving_force_counts = dict(zip(group["driving_force"], group["count"]))
    for driving_force in driving_forces:
        if driving_force not in driving_force_counts:
            driving_force_counts[driving_force] = 0

    # Colors
    color_group = grouped_color[grouped_color["municipality"] == municipality_code]
    color_counts = dict(zip(color_group["color"], color_group["count"]))
    for color in colors:
        if color not in color_counts:
            color_counts[color] = 0

    # Registration years
    year_group = grouped_year[grouped_year["municipality"] == municipality_code]
    year_counts = dict(zip(year_group["registration_year"], year_group["count"]))
    year_counts_str = {str(year): count for year, count in year_counts.items()}

    # Makers
    maker_group = grouped_maker[grouped_maker["municipality"] == municipality_code]
    maker_counts = dict(zip(maker_group["maker"], maker_group["count"]))

    final.append({
        "code": municipality_code,
        "name": municipalities[municipality_code],
        "countByDrivingForce": driving_force_counts,
        "countByColor": color_counts,
        "countByRegistrationYear": year_counts_str,
        "countByMaker": maker_counts,
    })

for municipality in final:
    municipality["countByDrivingForce"] = dict(sorted(municipality["countByDrivingForce"].items()))
    municipality["countByColor"] = dict(sorted(municipality["countByColor"].items()))
    municipality["countByMaker"] = dict(sorted(municipality["countByMaker"].items()))

In [11]:
# Totals
total_driving_force_counts = {driving_force: 0 for driving_force in driving_forces}
total_color_counts = {color: 0 for color in colors}
total_year_counts = {str(year): 0 for year in years}
total_maker_counts = {maker: 0 for maker in makers}

for municipality in final:
    for driving_force, count in municipality["countByDrivingForce"].items():
        total_driving_force_counts[driving_force] += count

    for color, count in municipality["countByColor"].items():
        total_color_counts[color] += count

    for year, count in municipality["countByRegistrationYear"].items():
        total_year_counts[year] += count

    for maker, count in municipality["countByMaker"].items():
        total_maker_counts[maker] += count

final.append({
    "code": "000",
    "name": "Finland",
    "countByDrivingForce": total_driving_force_counts,
    "countByColor": dict(sorted(total_color_counts.items())),
    "countByRegistrationYear": total_year_counts,
    "countByMaker": dict(sorted(total_maker_counts.items())),
})

final.sort(key=lambda x: x["name"])

data = {
    "date": data_date,
    "municipalities": final
}

In [12]:
# Final data file
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src", "assets"))
with open(os.path.join(parent_dir, "data.json"), "w", encoding="utf-8") as fh:
    fh.write(json.dumps(data, indent=4, ensure_ascii=False))