Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)
- [Geographic from MAPOG](https://gisdata.mapog.com/finland/administrative_boundaries_level8_polygon)

In [1]:
# Municipalities data
from src.data_import import get_municipalities
municipalities = get_municipalities()

display(municipalities)

{'020': 'Akaa',
 '005': 'Alajärvi',
 '009': 'Alavieska',
 '010': 'Alavus',
 '016': 'Asikkala',
 '018': 'Askola',
 '019': 'Aura',
 '035': 'Brändö',
 '043': 'Eckerö',
 '046': 'Enonkoski',
 '047': 'Enontekiö',
 '049': 'Espoo',
 '050': 'Eura',
 '051': 'Eurajoki',
 '052': 'Evijärvi',
 '060': 'Finström',
 '061': 'Forssa',
 '062': 'Föglö',
 '065': 'Geta',
 '069': 'Haapajärvi',
 '071': 'Haapavesi',
 '072': 'Hailuoto',
 '074': 'Halsua',
 '075': 'Hamina',
 '076': 'Hammarland',
 '077': 'Hankasalmi',
 '078': 'Hanko',
 '079': 'Harjavalta',
 '081': 'Hartola',
 '082': 'Hattula',
 '086': 'Hausjärvi',
 '111': 'Heinola',
 '090': 'Heinävesi',
 '091': 'Helsinki',
 '097': 'Hirvensalmi',
 '098': 'Hollola',
 '102': 'Huittinen',
 '103': 'Humppila',
 '105': 'Hyrynsalmi',
 '106': 'Hyvinkää',
 '108': 'Hämeenkyrö',
 '109': 'Hämeenlinna',
 '139': 'Ii',
 '140': 'Iisalmi',
 '142': 'Iitti',
 '143': 'Ikaalinen',
 '145': 'Ilmajoki',
 '146': 'Ilomantsi',
 '153': 'Imatra',
 '148': 'Inari',
 '149': 'Ingå',
 '151': 'Isojok

In [2]:
# Vehicles data
from src.data_import import get_vehicles
vehicles = get_vehicles()

display(vehicles.dtypes)
display(vehicles.shape)
display(vehicles.head(10))

registration_date    object
intro_date           object
color                object
driving_force        object
is_hybrid            object
maker_text           object
municipality         object
odometer             object
dtype: object

(2789147, 8)

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker_text,municipality,odometer
0,09.07.1984,19840000,1,1,,Ford,740,
1,08.05.1990,19900508,9,1,,Citroen,91,
2,02.10.2003,20031002,6,1,,Honda,837,287106.0
3,17.03.2006,20060317,Y,1,,Toyota,989,155944.0
4,05.01.2007,20070105,2,2,,Toyota,694,297904.0
5,14.03.1996,19960314,5,1,,Nissan,777,262907.0
6,01.07.2003,20030701,8,1,,Honda,851,270594.0
7,24.03.2000,20000324,6,1,,Nissan,250,186269.0
8,25.10.2005,20051025,8,1,,BMW,755,218813.0
9,,19630000,9,1,,Ford,580,63519.0


In [3]:
import copy
import importlib
import src.data_cleaning

importlib.reload(src.data_cleaning)

df = vehicles.copy(deep=True)
mun = copy.deepcopy(municipalities)

df = src.data_cleaning.clean(df, mun)

# Check which high count Other labeled makers are missing from mapping
other_makers = df[df["maker"] == "Other"]
maker_counts = other_makers["maker_text"].value_counts()
top_maker_rows = other_makers[other_makers["maker_text"].isin(maker_counts.index)]
top_maker_counts_in_top_rows = top_maker_rows["maker_text"].value_counts().nlargest(10)
display(top_maker_counts_in_top_rows)

df.reset_index(inplace=True, drop=True)

# NA check
na_rows = df[df[df.columns].isna().any(axis=1)]

# Sanity checks
display(df.dtypes)
display(df.shape) # Same size as before
display(f"Min year: {df["registration_year"].min()}", f"Max year: {df["registration_year"].max()}")
display(na_rows) # Should have none

maker_text
mg           1741
capron       1446
adria        1446
byd           840
pontiac       822
buick         681
hymer         675
weinsberg     642
dethleffs     638
gm daewoo     599
Name: count, dtype: int64

color                object
driving_force        object
maker_text           object
municipality         object
odometer              Int32
registration_year     Int16
maker                object
dtype: object

(2789147, 7)

'Min year: 1979'

'Max year: 2024'

Unnamed: 0,color,driving_force,maker_text,municipality,odometer,registration_year,maker


In [4]:
# Final data
import json
import importlib
import os
import src.data_cleaning
import src.data_import
import src.data_validation


importlib.reload(src.data_cleaning)
importlib.reload(src.data_validation)

date = src.data_import.get_date()
final = src.data_cleaning.generate(df, mun, date)
valid = src.data_validation.validate(final, mun)

if valid:
    path = os.path.join(os.path.join(os.getcwd(), "data.json"))
    file = json.dumps(final, indent=2, ensure_ascii=False)
    if os.path.isfile(path):
        os.remove(path)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(file)

display(final.keys())
display(final["municipalities"][0])

dict_keys(['date', 'municipalities'])

{'code': '020',
 'name': 'Akaa',
 'drivingForceCount': {'diesel': 2630,
  'electricity': 334,
  'hybrid': 761,
  'other': 73,
  'petrol': 6071},
 'colorCount': {'black': 1373,
  'blue': 1380,
  'brown': 739,
  'green': 404,
  'grey': 2141,
  'other': 276,
  'red': 1399,
  'silver': 679,
  'white': 1478},
 'registrationYearCount': {'1979': 185,
  '1980': 9,
  '1981': 7,
  '1982': 15,
  '1983': 26,
  '1984': 31,
  '1985': 47,
  '1986': 57,
  '1987': 76,
  '1988': 89,
  '1989': 94,
  '1990': 91,
  '1991': 42,
  '1992': 40,
  '1993': 46,
  '1994': 44,
  '1995': 52,
  '1996': 85,
  '1997': 69,
  '1998': 112,
  '1999': 163,
  '2000': 196,
  '2001': 179,
  '2002': 250,
  '2003': 323,
  '2004': 346,
  '2005': 430,
  '2006': 442,
  '2007': 403,
  '2008': 475,
  '2009': 291,
  '2010': 405,
  '2011': 468,
  '2012': 363,
  '2013': 387,
  '2014': 392,
  '2015': 451,
  '2016': 454,
  '2017': 419,
  '2018': 414,
  '2019': 356,
  '2020': 300,
  '2021': 252,
  '2022': 222,
  '2023': 164,
  '2024': 107}