Data sources:
- [Vehicles from Traficom](https://tieto.traficom.fi/en/datatraficom/open-data?toggle=Open%20data%20for%20vehicles)
- [Municipalities from Statistics Finland](https://stat.fi/en/luokitukset/kunta/)

In [1]:
# Municipalities data
from src.processors.imports import get_municipalities

municipalities = get_municipalities()

display(municipalities)

{'020': 'Akaa',
 '005': 'Alajärvi',
 '009': 'Alavieska',
 '010': 'Alavus',
 '016': 'Asikkala',
 '018': 'Askola',
 '019': 'Aura',
 '035': 'Brändö',
 '043': 'Eckerö',
 '046': 'Enonkoski',
 '047': 'Enontekiö',
 '049': 'Espoo',
 '050': 'Eura',
 '051': 'Eurajoki',
 '052': 'Evijärvi',
 '060': 'Finström',
 '061': 'Forssa',
 '062': 'Föglö',
 '065': 'Geta',
 '069': 'Haapajärvi',
 '071': 'Haapavesi',
 '072': 'Hailuoto',
 '074': 'Halsua',
 '075': 'Hamina',
 '076': 'Hammarland',
 '077': 'Hankasalmi',
 '078': 'Hanko',
 '079': 'Harjavalta',
 '081': 'Hartola',
 '082': 'Hattula',
 '086': 'Hausjärvi',
 '111': 'Heinola',
 '090': 'Heinävesi',
 '091': 'Helsinki',
 '097': 'Hirvensalmi',
 '098': 'Hollola',
 '102': 'Huittinen',
 '103': 'Humppila',
 '105': 'Hyrynsalmi',
 '106': 'Hyvinkää',
 '108': 'Hämeenkyrö',
 '109': 'Hämeenlinna',
 '139': 'Ii',
 '140': 'Iisalmi',
 '142': 'Iitti',
 '143': 'Ikaalinen',
 '145': 'Ilmajoki',
 '146': 'Ilomantsi',
 '153': 'Imatra',
 '148': 'Inari',
 '149': 'Ingå',
 '151': 'Isojok

In [2]:
# Vehicles data
from src.processors.imports import get_vehicles

vehicles = get_vehicles()

display(vehicles.dtypes)
display(vehicles.shape)
display(vehicles.head(10))

registration_date    object
intro_date           object
color                object
driving_force        object
is_hybrid            object
maker_text           object
municipality         object
mileage              object
dtype: object

(2760535, 8)

Unnamed: 0,registration_date,intro_date,color,driving_force,is_hybrid,maker_text,municipality,mileage
0,02.10.2003,20031002,6,1,,Honda,837,287106
1,17.03.2006,20060317,Y,1,,Toyota,989,161261
2,05.01.2007,20070105,2,2,,Toyota,694,305646
3,14.03.1996,19960314,5,1,,Nissan,777,272000
4,01.07.2003,20030701,8,1,,Honda,851,270594
5,24.03.2000,20000324,6,1,,Nissan,250,189993
6,25.10.2005,20051025,8,1,,BMW,755,218813
7,,19630000,9,1,,Ford,200,63519
8,21.06.1999,19990621,6,1,,Opel,423,457611
9,16.11.1993,19931116,2,1,,Toyota,91,167672


In [3]:
import copy
from src.processors.preprocesses import clean

df = vehicles.copy(deep=True)
mun = copy.deepcopy(municipalities)

df = clean(df, mun)

# Check which high count Other labeled makers are missing from mapping
other_makers = df[df["maker"] == "Other"]
maker_counts = other_makers["maker_text"].value_counts()
top_maker_rows = other_makers[other_makers["maker_text"].isin(maker_counts.index)]
top_maker_counts_in_top_rows = top_maker_rows["maker_text"].value_counts().nlargest(10)
display(top_maker_counts_in_top_rows)

df.reset_index(inplace=True, drop=True)

# NA check
na_rows = df[df[df.columns].isna().any(axis=1)]

# Municipality check
df_mun_codes = set(df["municipality"].unique())
mun_codes = set(mun.keys())
missing = df_mun_codes - mun_codes
display(f"Municipalities mismatch with: {list(missing)}")

# Sanity checks
display(df.dtypes)
display(df.shape)  # Same size as before
display(f"Min year: {df['registration_year'].min()}", f"Max year: {df['registration_year'].max()}")
display(f"Min mileage: {df['mileage'].min()}", f"Max mileage: {df['mileage'].max()}")
display(na_rows)  # Should have none

maker_text
capron       2878
adria        2630
mg           2271
pontiac      1322
dethleffs    1291
byd          1183
hymer        1079
weinsberg     995
buick         916
plymouth      877
Name: count, dtype: int64

'Municipalities mismatch with: []'

color                object
driving_force        object
maker_text           object
municipality         object
mileage               Int32
registration_year     Int16
maker                object
dtype: object

(2760535, 7)

'Min year: 1979'

'Max year: 2025'

'Min mileage: -1'

'Max mileage: 5000000'

Unnamed: 0,color,driving_force,maker_text,municipality,mileage,registration_year,maker


In [4]:
# Final data
import json
import os
from src.processors.utils import get_date
from src.processors.postprocesses import generate
from src.processors.validations import validate

date = get_date()
final = generate(df, mun, date)
valid = validate(final, mun)

if valid:
    path = os.path.join(os.path.join(os.getcwd(), "data.json"))
    file = json.dumps(final, indent=2, ensure_ascii=False)
    if os.path.isfile(path):
        os.remove(path)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(file)

display(final.keys())
display(final["municipalities"][0])

dict_keys(['date', 'municipalities'])

{'code': '020',
 'name': 'Akaa',
 'mileageCount': {'na': 580,
  'under50k': 569,
  '50kto100k': 1257,
  '100kto150k': 1265,
  '150kto200k': 1362,
  '200kto250k': 1469,
  '250kto300k': 1132,
  '300kto350k': 795,
  '350kto400k': 498,
  '400kto450k': 264,
  '450kto500k': 147,
  '500kto550k': 77,
  '550kto600k': 33,
  'over600k': 44},
 'drivingForceCount': {'diesel': 2622,
  'electricity': 391,
  'hybrid': 865,
  'other': 76,
  'petrol': 5538},
 'colorCount': {'black': 1403,
  'blue': 1272,
  'brown': 644,
  'green': 343,
  'grey': 2070,
  'other': 285,
  'red': 1273,
  'silver': 696,
  'white': 1506},
 'registrationYearCount': {'1979': 234,
  '1980': 7,
  '1981': 4,
  '1982': 12,
  '1983': 4,
  '1984': 10,
  '1985': 10,
  '1986': 12,
  '1987': 15,
  '1988': 25,
  '1989': 14,
  '1990': 33,
  '1991': 25,
  '1992': 17,
  '1993': 30,
  '1994': 34,
  '1995': 34,
  '1996': 56,
  '1997': 63,
  '1998': 104,
  '1999': 156,
  '2000': 182,
  '2001': 162,
  '2002': 238,
  '2003': 307,
  '2004': 349,
