Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)
- [Geographic boundaries from MAPOG](https://gisdata.mapog.com/finland/administrative_boundaries_level8_polygon)

In [1]:
import io
import json
import os
import pandas as pd
import requests
import zipfile

In [2]:
# Vehicles data
# Download the vehicle register archive from Traficom's open-data site and
# unpack every member into the current working directory.
url = "https://opendata.traficom.fi/Content/Ajoneuvorekisteri.zip"
# Without a timeout, requests waits indefinitely on a stalled connection.
# Tuple form is (connect, read) seconds; the read value bounds each wait for
# data from the server, not the total download time.
response = requests.get(url, timeout=(10, 300))
response.raise_for_status()
# The whole archive is held in memory and opened from there, so no temporary
# zip file is written to disk.
zip_bytes = io.BytesIO(response.content)
with zipfile.ZipFile(zip_bytes) as z:
    z.extractall(os.getcwd())

In [3]:
# Municipalities data
# Fetch the `kunta_1_20240101` municipality classification from Statistics
# Finland and build a lookup of municipality code -> name.
url = "https://data.stat.fi/api/classifications/v2/classifications/kunta_1_20240101/classificationItems?content=data&meta=max&lang=en&format=json"
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Each item carries a list of names; the first entry is used
# (presumably the one matching `lang=en` in the query — TODO confirm).
municipalities = {
    item["code"]: item["classificationItemNames"][0]["name"]
    for item in response.json()
}

In [4]:
# CSV expected in the working directory (presumably the file unpacked from the
# downloaded archive — confirm the name still matches the archive contents).
input_filename = "Ajoneuvojen_avoin_data_5_23.csv"
input_file = os.path.join(os.getcwd(), input_filename)

# Source column name -> column name used in this notebook.
# Only the columns listed here are read from the CSV.
columns = {
    "ensirekisterointipvm": "registration_date",
    "ajoneuvoluokka": "classification",
    "vari": "color",
    "kayttovoima": "driving_force",
    "sahkohybridi": "is_hybrid",
    "merkkiSelvakielinen": "maker",
    "kunta": "municipality",
    "matkamittarilukema": "odometer",
}

# Read, rename and filter in one pipeline so no intermediate frame is mutated.
df = (
    pd.read_csv(
        input_file,
        sep=";",
        quotechar="'",
        encoding="latin",
        low_memory=False,
        memory_map=True,
        usecols=columns.keys(),
        # Code-like columns are read as strings: later cells compare them
        # against string codes such as "01" and "Y".
        dtype={"vari": str, "kayttovoima": str, "kunta": str, "sahkohybridi": str},
    )
    .rename(columns=columns)
    # Filter to only M1 class; the column is constant afterwards, so drop it.
    .loc[lambda frame: frame["classification"] == "M1"]
    .drop(columns="classification")
    # Filter invalid municipalities (codes missing from the classification lookup).
    .loc[lambda frame: frame["municipality"].isin(municipalities.keys())]
    .reset_index(drop=True)
)

df = df.assign(
    # Clean up dates: values that do not match YYYY-MM-DD become NaT.
    registration_date=pd.to_datetime(
        df["registration_date"],
        format="%Y-%m-%d",
        cache=True,
        exact=True,
        yearfirst=True,
        errors="coerce",
    ),
    # Hybrid flag as a real boolean: only the literal "true" counts as True,
    # anything else (including missing values) is False.
    is_hybrid=df["is_hybrid"].eq("true"),
)

def group_driving_force(row):
    """Return the numeric driving-force group for a single vehicle row.

    Groups: 1 = petrol, 2 = diesel, 3 = hybrid, 4 = electricity, 5 = other.

    Args:
        row: Mapping-like row exposing ``"is_hybrid"`` and ``"driving_force"``.

    Returns:
        int: The group number. A truthy hybrid flag always yields 3, whatever
        the fuel code is.
    """
    # The hybrid flag is checked first, so the fuel code is only read for
    # non-hybrid rows.
    if row["is_hybrid"]:
        return 3  # Hybrid

    # Fuel code -> group; every code not listed here falls into "other" (5).
    group_by_code = {
        "01": 1,  # Petrol
        "02": 2,  # Diesel
        "04": 4,  # Electricity
    }
    return group_by_code.get(row["driving_force"], 5)
    
# Replace the raw fuel code with its numeric group. This runs row by row
# because the grouping reads both the hybrid flag and the fuel code.
df = df.assign(driving_force=df.apply(group_driving_force, axis=1))

# Color grouping
def group_color(row):
    """Return the color name for a single vehicle row.

    Args:
        row: Mapping-like row exposing the raw color code under ``"color"``.

    Returns:
        str: One of ``black``, ``brown``, ``red``, ``green``, ``blue``,
        ``grey``, ``white``, ``silver``; any other code (including a missing
        value) maps to ``other``.
    """
    # Codes are compared as strings; codes absent from this table are
    # deliberately lumped together as "other".
    name_by_code = {
        "0": "black",
        "1": "brown",
        "2": "red",
        "5": "green",
        "6": "blue",
        "8": "grey",
        "9": "white",
        "Y": "silver",
    }
    return name_by_code.get(row["color"], "other")
    
df = df.assign(
    # Replace the raw color code with its color name.
    color=df.apply(group_color, axis=1),
    # Odometer to a numeric dtype. Readings that cannot be parsed become NaN,
    # so the column is float rather than integer whenever any value is missing.
    odometer=pd.to_numeric(df["odometer"], errors="coerce"),
)

# Preview the cleaned frame.
df.head()

Unnamed: 0,registration_date,color,driving_force,is_hybrid,maker,municipality,odometer
0,1984-07-09,brown,1,False,Ford,740,
1,1990-05-08,white,1,False,Citroen,91,
2,2003-10-02,blue,1,False,Honda,837,284104.0
3,2006-03-17,silver,1,False,Toyota,989,155944.0
4,2007-01-05,red,2,False,Toyota,694,2692651.0


In [5]:
# Vehicle count per (driving force, municipality) pair. The municipality split
# is kept in `grouped_driving` because the export cell reads it per municipality.
grouped_driving = (
    df.groupby(["driving_force", "municipality"])
    .size()
    .reset_index(name="count")
)

# Grand total of vehicles; also reused by the color summary cell.
total = grouped_driving["count"].sum()

# Each pair's share of all vehicles, in percent.
grouped_driving["share"] = grouped_driving["count"] / total * 100

# Collapse municipalities to get one row per driving force.
driving_grouped = grouped_driving.groupby("driving_force", as_index=False)[
    ["count", "share"]
].sum()

# Single summary row appended beneath the per-group rows.
driving_totals = pd.DataFrame(
    {
        "driving_force": ["total"],
        "count": [driving_grouped["count"].sum()],
        "share": [driving_grouped["share"].sum()],
    }
)
driving = pd.concat([driving_grouped, driving_totals], ignore_index=True)

def driving_force_text(x):
    """Return the display label for a numeric driving-force group.

    Args:
        x: Group number (1-5), or any other value such as the ``"total"``
            marker of the summary row.

    Returns:
        The label for groups 1-5; any other value is returned unchanged.
    """
    labels = {
        1: "petrol",
        2: "diesel",
        3: "hybrid",
        4: "electricity",
        5: "other",
    }
    # Unknown values pass through untouched so the "total" row keeps its text.
    return labels.get(x, x)
    
def format_with_whitespace(value):
    """Format a number with spaces as thousands separators.

    Args:
        value: An int or float. Floats holding a whole number are shown
            without a fractional part.

    Returns:
        str: e.g. ``1630466`` -> ``"1 630 466"``.
    """
    # Whole-number floats are converted to int first so no trailing ".0" appears.
    is_whole_float = isinstance(value, float) and value.is_integer()
    number = int(value) if is_whole_float else value
    # Format with comma grouping, then swap the commas for spaces.
    return format(number, ",").replace(",", " ")
    
# Render the driving-force summary as a styled table.
disp = driving.style.format({
    "driving_force": driving_force_text,
    "count": format_with_whitespace,
    # Fixed two decimals so every row lines up ("23.40%", not "23.4%").
    "share": "{:.2f}%",
})
disp.set_caption("Passenger car driving forces")
# Last expression of the cell: the returned Styler is what gets displayed.
disp.hide(axis="index")

driving_force,count,share
petrol,1 630 466,61.99%
diesel,615 591,23.4%
hybrid,272 500,10.36%
electricity,90 662,3.45%
other,20 967,0.8%
total,2 630 186,100.0%


In [6]:
# Vehicle count per (color, municipality) pair. The municipality split is kept
# in `grouped_color` because the export cell reads it per municipality.
grouped_color = (
    df.groupby(["color", "municipality"])
    .size()
    .reset_index(name="count")
)

# Share of all vehicles in percent; `total` comes from the driving-force cell.
grouped_color["share"] = grouped_color["count"] / total * 100

# Collapse municipalities to get one row per color.
color_grouped = grouped_color.groupby("color", as_index=False)[
    ["count", "share"]
].sum()

# Single summary row appended beneath the per-color rows.
color_totals = pd.DataFrame(
    {
        "color": ["total"],
        "count": [color_grouped["count"].sum()],
        "share": [color_grouped["share"].sum()],
    }
)
color = pd.concat([color_grouped, color_totals], ignore_index=True)

# Render the color summary as a styled table.
disp = color.style.format({
    "count": format_with_whitespace,
    # Fixed two decimals so every row lines up ("21.65%", "100.00%").
    "share": "{:.2f}%",
})
disp.set_caption("Passenger car colors")
# Last expression of the cell: the returned Styler is what gets displayed.
disp.hide(axis="index")

color,count,share
black,397 497,15.11%
blue,344 991,13.12%
brown,179 216,6.81%
green,93 356,3.55%
grey,569 324,21.65%
other,63 793,2.43%
red,357 474,13.59%
silver,201 406,7.66%
white,423 129,16.09%
total,2 630 186,100.0%


In [7]:
# Every category seen anywhere in the data, so each municipality reports the
# same set of keys (zero-filled where it has no vehicles of that kind).
driving_forces = set(grouped_driving["driving_force"])
colors = set(grouped_color["color"])


def _zero_filled_counts(keys, counts, all_keys):
    """Map every key in ``all_keys`` to its count, using 0 for absent keys.

    Keys are emitted in sorted order. Set iteration order for strings is
    hash-randomized per process, so relying on it would reorder the exported
    JSON between runs.

    Args:
        keys: Iterable of category keys present for one municipality.
        counts: Iterable of counts aligned with ``keys``.
        all_keys: Collection of every category key that must appear.

    Returns:
        dict: ``{key: count}`` covering all of ``all_keys``.
    """
    present = dict(zip(keys, counts))
    return {key: present.get(key, 0) for key in sorted(all_keys)}


final = []
# Loop variables deliberately avoid the names `df` and `color`: those are
# DataFrames defined by earlier cells and must survive this cell so the
# earlier cells can be re-run.
for municipality_code, driving_group in grouped_driving.groupby("municipality"):
    color_group = grouped_color[grouped_color["municipality"] == municipality_code]

    final.append({
        "code": municipality_code,
        "name": municipalities[municipality_code],
        "countByDrivingForce": _zero_filled_counts(
            driving_group["driving_force"], driving_group["count"], driving_forces
        ),
        "countByColor": _zero_filled_counts(
            color_group["color"], color_group["count"], colors
        ),
    })

# Output directory: <working directory>/../src/assets. It must already exist.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src", "assets"))

data = {
    # Hard-coded date stamp (presumably the reference date of the vehicle data —
    # TODO confirm and update whenever the source file changes).
    "date": "2024-03-31",
    "municipalities": final
}

# ensure_ascii=False keeps non-ASCII characters in names readable in the file.
with open(os.path.join(parent_dir, "data.json"), "w", encoding="utf-8") as jf:
    json.dump(data, jf, indent=4, ensure_ascii=False)