# Data project

**Variables** 

Udlån 

Materialeudgifter (1.000 kr.)

Imports and Magic settings 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

import datetime

import pandas_datareader # install with `pip install pandas-datareader`
import pydst # install with `pip install git+https://github.com/elben10/pydst`


# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject
plt.rcParams.update({"axes.grid":True,"grid.color":"black","grid.alpha":"0.25","grid.linestyle":"--"})
plt.rcParams.update({'font.size': 14})

from pandasql import sqldf

**Importing data**

**Importing data for libraries from DST.** 

In [19]:
#Object for interacting with DST 
Dst = pydst.Dst(lang='en') # setup data loader with the langauge 'english'


#Values of the variables  
bib_vars  = Dst.get_variables(table_id = 'BIB1')

for id in [ 'BNØGLE', 'Tid']:
    print(id)
    values = bib_vars.loc[bib_vars.id ==id, ['values']].values[0,0]
    for value in values:
        print(f'id = {value["id"]}, text = {value["text"]}')

BNØGLE
id = 15110, text = Loan. All materials
id = 15120, text = Loan. Books
id = 15130, text = Loan. Audio books
id = 15140, text = Loan. Music recordings
id = 15150, text = Loan. Moving pictures
id = 15160, text = Loan. Multimedia
id = 15170, text = Loan. Other materials
id = 15175, text = Loan. Serial publications
id = 15180, text = Stock. All materials
id = 15190, text = Stock. Books
id = 15200, text = Stock. Audio books
id = 15210, text = Stock. Music recordings
id = 15220, text = Stock. Moving pictures
id = 15230, text = Stock. Multimedia
id = 15240, text = Stock. Other materials
id = 15245, text = Stock. Serial publications (subscribers)
id = 15250, text = Uses of electronic ressources (downloads)
id = 15260, text = Expenditure, materials (DKK 1,000)
Tid
id = 2009, text = 2009
id = 2010, text = 2010
id = 2011, text = 2011
id = 2012, text = 2012
id = 2013, text = 2013
id = 2014, text = 2014
id = 2015, text = 2015
id = 2018, text = 2018
id = 2019, text = 2019
id = 2020, text = 202

In [25]:
#Object for interacting with DST 
Dst = pydst.Dst(lang='en') # setup data loader with the langauge 'english'

#Dictionary of variables of the disired data  (Loan. All materials)
var_pick = {'OMRÅDE': ['*'], 'BNØGLE': [ '15120' ,'15260'], 'Tid': ['*']}

#Raw dataset 
bib_api = Dst.get_data(table_id = 'BIB1', variables = var_pick)

bib_api.head()

Unnamed: 0,OMRÅDE,BNØGLE,TID,INDHOLD
0,All Denmark,Loan. Books,2009,32256695
1,All Denmark,"Expenditure, materials (DKK 1,000)",2009,387005
2,Region Nordjylland,Loan. Books,2009,3137993
3,Region Nordjylland,"Expenditure, materials (DKK 1,000)",2009,32779
4,Region Midtjylland,Loan. Books,2009,7021388


**Importing data for population by munisipality** 

In [7]:
#Loading the data 
var_pick = {'OMRÅDE': ['*'], 'Tid':['*']}
bef_api = Dst.get_data(table_id='FOLK1A', variables=var_pick)
bef_api.sort_values(by=['OMRÅDE'], inplace=True)

# Keeping only the 1 quarter 
I = bef_api['TID'].str.endswith('Q1')
bef_api_q1 = bef_api[I]

#Renaming the time variable such that it does not include the quarter 
bef_api_q1 = bef_api_q1.rename(columns={'TID': 'tid_temp'})
bef_api_q1['tid_temp'] = bef_api_q1['tid_temp'].str.slice(stop=4)
bef_api_q1 = bef_api_q1.rename(columns={'tid_temp': 'TID'})

#Variable TID must be an object for merging 
bib_api['TID'] = bib_api['TID'].astype('object')

bef_api_q1.head()

Unnamed: 0,OMRÅDE,TID,KØN,ALDER,CIVILSTAND,INDHOLD
3452,Aabenraa,2013,Total,Total,Total,59208
3157,Aabenraa,2015,Total,Total,Total,58904
3767,Aabenraa,2011,Total,Total,Total,59795
870,Aabenraa,2022,Total,Total,Total,58693
2465,Aabenraa,2023,Total,Total,Total,59002


**Merging dataset**

Comparing datasets

In [39]:
#The datasets
print(f'Library dataset contains data from: {np.min(bib_api.TID.unique())} to {np.max(bib_api.TID)}')
print(f'Municipalities in Library data = {len(bib_api.OMRÅDE.unique())}')

print(f'Bef dataset contains data from: {np.min(bef_api_q1.TID.unique())} to {np.max(bef_api_q1.TID)}')
print(f'Municipalities in Library data = {len(bef_api_q1.OMRÅDE.unique())}')

#Differenses in the datasets
diff_mun = [m for m in bef_api_q1.OMRÅDE.unique() if m not in bib_api.OMRÅDE.unique()]
print (f' Munisipalities in bef and not in Lib: {diff_mun}')


Library dataset contains data from: 2009 to 2021
Municipalities in Library data = 103
Bef dataset contains data from: 2008 to 2023
Municipalities in Library data = 105
 Munisipalities in bef and not in Lib: ['Christiansø', 'Fanø']


Merging using SQL

In [None]:
case 
    when bib.BNØGLE = 'Loan. Books' then bib.INDHOLD                            = 'loan'
    when bib.BNØGLE = 'Expenditure, materials (DKK 1,000)' then bib.INDHOLD     = 'Expenditure'
end as var_navn,

In [50]:

#bef.INDHOLD as n, 

q = """
SELECT 
bef.OMRÅDE, 
bef.TID, 
bef.INDHOLD as n, 
bib.INDHOLD as loan_n,
bib.BNØGLE
FROM bef_api_q1 as bef
INNER JOIN bib_api as bib
ON bib.OMRÅDE = bef.OMRÅDE AND bib.TID = bef.TID
where bib.BNØGLE = 'Loan. Books'
"""

# Apply query using pandasql
df = sqldf(q)

print(f'Merged dataset contains data from: {np.min(df.TID.unique())} to {np.max(bef_api_q1.TID)}')
print(f'Merged dataset contains data form Municipalities = {len(df.OMRÅDE.unique())}')
print(f'Merged dataset contains data form Municipalities = {len(df.OMRÅDE.BNØGLE())}')


df.head()


Merged dataset contains data from: 2009 to 2023
Merged dataset contains data form Municipalities = 103


Unnamed: 0,OMRÅDE,TID,n,loan_n,BNØGLE
0,All Denmark,2009,5511451,32256695,Loan. Books
1,Region Nordjylland,2009,580515,3137993,Loan. Books
2,Region Midtjylland,2009,1247732,7021388,Loan. Books
3,Region Syddanmark,2009,1199667,6795849,Loan. Books
4,Region Hovedstaden,2009,1662285,10641050,Loan. Books


In [46]:
q = """
SELECT 
bef.OMRÅDE, 
bef.TID, 
bef.INDHOLD as n, 
case 
    when bib.BNØGLE = 'Loan. Books' then 'loan'
    when bib.BNØGLE = 'Expenditure, materials (DKK 1,000)' then 'Expenditure'
end as var_navn

FROM bef_api_q1 as bef
INNER JOIN bib_api as bib
ON bib.OMRÅDE = bef.OMRÅDE AND bib.TID = bef.TID
"""
# Apply query using pandasql
df = sqldf(q)

In [48]:
df.head()

Unnamed: 0,OMRÅDE,TID,n,var_navn
0,Aabenraa,2013,59208,Expenditure
1,Aabenraa,2013,59208,loan
2,Aabenraa,2015,58904,Expenditure
3,Aabenraa,2015,58904,loan
4,Aabenraa,2011,59795,Expenditure
