# Data Project

Install the DST API data reader (`dstapi`) and `pandas-datareader`.

In [4]:
# The DST API wrapper
%pip install git+https://github.com/alemartinello/dstapi

# A wrapper for multiple APIs with a pandas interface
%pip install pandas-datareader

Collecting git+https://github.com/alemartinello/dstapi
  Cloning https://github.com/alemartinello/dstapi to /private/var/folders/rp/8b9m7ytn6zv_75m2zj88jl7m0000gn/T/pip-req-build-1755b6f_
  Running command git clone --filter=blob:none --quiet https://github.com/alemartinello/dstapi /private/var/folders/rp/8b9m7ytn6zv_75m2zj88jl7m0000gn/T/pip-req-build-1755b6f_
  Resolved https://github.com/alemartinello/dstapi to commit d9eeb5a82cbc70b7d63b2ff44d92632fd77123a4
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Imports and set magics:

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dstapi import DstApi # install with `pip install git+https://github.com/alemartinello/dstapi`
import pandas_datareader # install with `pip install pandas-datareader`

# Read and clean data

Consider the following dictionary definitions:

In [18]:
# Map the raw DST column codes to the readable column names used for the PKM1 data.
columns_dict = {
    'TRANSMID': 'vehicle',
    'TID': 'year',
    'INDHOLD': 'value',
}

We will download all data from tables PKM1 and FT using DstApi. First, we read and clean the data from table PKM1.

In [19]:
pkm1_api = DstApi('PKM1') # create the API object for the table whose name is written inside the parentheses
params = pkm1_api._define_base_params(language='en') # base parameters: we add no restrictions, only that we want the table in English

pkm1 = pkm1_api.get_data(params=params) # fetch the data; params defines what we want to include from the table
pkm1.head()

Unnamed: 0,TRANSMID,TID,INDHOLD
0,VEHICLES ON THE ROAD TOTAL,1981,..
1,Bicycles/Mopeds max. 30 km/h,1981,..
2,Motor vehicles total,1981,46168
3,Private cars and vans under 2.001 kg.,1981,36854
4,Vans over 2.000 kg.,1981,3795


Rename the columns using `columns_dict`: 'TRANSMID' to 'vehicle', 'TID' to 'year' and 'INDHOLD' to 'value'.

In [20]:
# Rename the raw DST column codes to readable names via columns_dict.
# Reassign instead of using inplace=True so the cell does not mutate a frame
# that an earlier cell has already displayed.
pkm1 = pkm1.rename(columns=columns_dict)
pkm1.head(14)

Unnamed: 0,vehicle,year,value
0,VEHICLES ON THE ROAD TOTAL,1981,..
1,Bicycles/Mopeds max. 30 km/h,1981,..
2,Motor vehicles total,1981,46168
3,Private cars and vans under 2.001 kg.,1981,36854
4,Vans over 2.000 kg.,1981,3795
5,Taxis,1981,441
6,Motorcycles,1981,282
7,Mopeds max. 45 km/h,1981,0
8,Buses and coaches total,1981,4797
9,Scheduled buses,1981,2418


The dataset contains the following vehicle categories, which do not fit into our analysis; therefore they are dropped.

In [28]:
# Vehicle categories that do not fit into the analysis and are dropped.
excluded_vehicles = [
    'VEHICLES ON THE ROAD TOTAL',
    'Motor vehicles total',
    'Vans over 2.000 kg.',
    'Scheduled buses',
    'Coaches and other buses',
]

# Build up a logical index I that is True for every row to drop.
# regex=False matches each label literally, so the dots in
# 'Vans over 2.000 kg.' are not interpreted as regex wildcards.
I = pd.Series(False, index=pkm1.index)
for label in excluded_vehicles:
    I |= pkm1.vehicle.str.contains(label, regex=False)

# Keep everything else and drop the old index in the same step.
pkm1 = pkm1.loc[~I].reset_index(drop=True)
pkm1.head(9)

Unnamed: 0,vehicle,year,value
0,Bicycles/Mopeds max. 30 km/h,1980,..
1,Private cars and vans under 2.001 kg.,1980,38027
2,Buses and coaches total,1980,4611
3,Train,1980,4503
4,Aeroplane,1980,..
5,Mopeds max. 45 km/h,1980,0
6,Ship,1980,..
7,Taxis,1980,458
8,Motorcycles,1980,275


Furthermore, we may want to drop the years without any information (values reported as `..`).

Sort by vehicle and then by year.

In [33]:
# Sort by vehicle first and then by year, so the rows for each vehicle appear together in year order.
# Reassign rather than sort in place: pkm1 was produced by a row filter, and
# reassignment avoids mutating a frame that earlier cells have already displayed.
pkm1 = pkm1.sort_values(by=['vehicle', 'year'])
pkm1.head(43)

Unnamed: 0,vehicle,year,value
4,Aeroplane,1980,..
14,Aeroplane,1981,..
23,Aeroplane,1982,..
27,Aeroplane,1983,..
37,Aeroplane,1984,..
52,Aeroplane,1985,..
60,Aeroplane,1986,..
68,Aeroplane,1987,..
77,Aeroplane,1988,..
81,Aeroplane,1989,..


# Merge with population data from Statistics Denmark


We want the value (million person-km) from the PKM1 table in per capita terms. To get that, we need to download population data from Statistics Denmark:

In [34]:
# Set up the request for table FT with English labels.
FT_api = DstApi('FT')
params = FT_api._define_base_params(language='en')

# Restrict the first variable to '000', the code for all of Denmark.
# The available codes can be listed with: FT_api.variable_levels('HOVEDDELE', language='en')
params['variables'][0]['values'] = ['000']

# Fetch the data, give the columns readable names and keep only year and population.
pop = (
    FT_api.get_data(params=params)
    .rename(columns={'TID':'year','INDHOLD':'population'})
    .loc[:, ['year','population']]
)
pop.head()

Unnamed: 0,year,population
0,2010,5534738
1,1769,797584
2,1840,1289075
3,1860,1608362
4,1901,2449540


In [36]:
# Attach the population of each year to the PKM1 rows; the left join keeps every PKM1 row.
merged = pkm1.merge(pop, on='year', how='left')
merged.head(43)

Unnamed: 0,vehicle,year,value,population
0,Aeroplane,1980,..,5122065
1,Aeroplane,1981,..,5123989
2,Aeroplane,1982,..,5119155
3,Aeroplane,1983,..,5116464
4,Aeroplane,1984,..,5112130
5,Aeroplane,1985,..,5111108
6,Aeroplane,1986,..,5116273
7,Aeroplane,1987,..,5124794
8,Aeroplane,1988,..,5129254
9,Aeroplane,1989,..,5129778
