# Data project - Group: De Slemme Programmeringsrotter

Imports and set magics:

In [1]:
# uncomment below if you need to install 
#%pip install git+https://github.com/alemartinello/dstapi
#%pip install pandas-datareader

# Import modules
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6; the old
# names were deprecated there and removed in later releases. Pick whichever name this
# installation provides so the notebook runs on both old and new matplotlib versions.
_whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next((name for name in _whitegrid_candidates if name in plt.style.available), 'default'))

import pandas_datareader # install with `pip install pandas-datareader`
from dstapi import DstApi # install with `pip install git+https://github.com/alemartinello/dstapi`

# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import Oliver

  plt.style.use('seaborn-whitegrid')


# Read and clean data

In [2]:
# Open the DST table 'FODIE' and fetch its English summary (one row per variable).
ind = DstApi('FODIE')
tabsum = ind.tablesummary(language='en')

# Overview: for every variable, print its name and show the levels it can take.
for variable in tabsum['variable name']:
    print(f'{variable}:')
    display(ind.variable_levels(variable, language='en'))

Table FODIE: Live births by region, mothers ancestry, mothers country of origin, mothers citizenship, age of mother, sex of child and time
Last update: 2023-02-10T08:00:00
OMRÅDE:


Unnamed: 0,id,text
0,000,All Denmark
1,084,Region Hovedstaden
2,101,Copenhagen
3,147,Frederiksberg
4,155,Dragør
...,...,...
100,773,Morsø
101,840,Rebild
102,787,Thisted
103,820,Vesthimmerlands


MOHERK:


Unnamed: 0,id,text
0,5,Persons of Danish origin
1,4,Immigrants
2,3,Descendant
3,0,Unknown origin


MOOPRIND:


Unnamed: 0,id,text
0,5100,Denmark
1,5122,Albania
2,5124,Andorra
3,5706,Belarus
4,5126,Belgium
...,...,...
236,5275,Vanuatu
237,5532,East Timor
238,5599,Pacific Islands
239,5103,Stateless


MOSTAT:


Unnamed: 0,id,text
0,5100,Denmark
1,5122,Albania
2,5124,Andorra
3,5706,Belarus
4,5126,Belgium
...,...,...
236,5275,Vanuatu
237,5532,East Timor
238,5599,Pacific Islands
239,5103,Stateless


MODERSALDER:


Unnamed: 0,id,text
0,10,10 years
1,11,11 years
2,12,12 years
3,13,13 years
4,14,14 years
5,15,15 years
6,16,16 years
7,17,17 years
8,18,18 years
9,19,19 years


BARNKON:


Unnamed: 0,id,text
0,D,Boys
1,P,Girls


Tid:


Unnamed: 0,id,text
0,2007,2007
1,2008,2008
2,2009,2009
3,2010,2010
4,2011,2011
5,2012,2012
6,2013,2013
7,2014,2014
8,2015,2015
9,2016,2016


We notice that 'OMRÅDE' can take the value 'All Denmark' as well as the individual municipalities. We keep this in mind in the analysis below so that births are not double-counted across categories (each birth is registered both in a municipality and in 'All Denmark').

In [3]:
# Build the request template for the table: the returned dict (shown below) lists
# every variable with the wildcard value '*', i.e. it selects all available data.
# NOTE(review): _define_base_params is underscore-prefixed (private by convention) in
# dstapi, so it may change between versions - confirm when upgrading the package.
params = ind._define_base_params(language='en')
# Display the template so the reader can see the query that is sent.
params

{'table': 'fodie',
 'format': 'BULK',
 'lang': 'en',
 'variables': [{'code': 'OMRÅDE', 'values': ['*']},
  {'code': 'MOHERK', 'values': ['*']},
  {'code': 'MOOPRIND', 'values': ['*']},
  {'code': 'MOSTAT', 'values': ['*']},
  {'code': 'MODERSALDER', 'values': ['*']},
  {'code': 'BARNKON', 'values': ['*']},
  {'code': 'Tid', 'values': ['*']}]}

In [4]:
# Download the full table and order the rows by year, area, mother's age,
# mother's country of origin and sex of the child.
sort_columns = ['TID', 'OMRÅDE', 'MODERSALDER', 'MOOPRIND', 'BARNKON']
inc_api = ind.get_data(params=params).sort_values(by=sort_columns)

## Explore each data set

First examine how large the dataframe is:

In [5]:
# Print the (rows, columns) dimensions of the downloaded frame.
print(inc_api.shape)

(412674, 8)


There seem to be 412674 rows (observations) and 8 columns (variables). Seven of the columns are of course the variables listed in the section 'Read and clean data'. But what is the 8th?

First we would simply like to look at the data as a dataframe and see what a row looks like:

In [6]:
# First five rows, in the sort order applied right after the download.
inc_api.head(5)

Unnamed: 0,OMRÅDE,MOHERK,MOOPRIND,MOSTAT,MODERSALDER,BARNKON,TID,INDHOLD
329917,Aabenraa,Descendant,Germany,Denmark,17 years,Boys,2007,1
305207,Aabenraa,Immigrants,Armenia,Armenia,18 years,Girls,2007,1
301436,Aabenraa,Persons of Danish origin,Denmark,Denmark,18 years,Boys,2007,2
324174,Aabenraa,Persons of Danish origin,Denmark,Denmark,18 years,Girls,2007,2
329923,Aabenraa,Immigrants,Iraq,Iraq,18 years,Boys,2007,1


In [7]:
# Last five rows, in the sort order applied right after the download.
inc_api.tail(5)

Unnamed: 0,OMRÅDE,MOHERK,MOOPRIND,MOSTAT,MODERSALDER,BARNKON,TID,INDHOLD
389354,Ærø,Immigrants,Germany,Germany,36 years,Girls,2022,1
389369,Ærø,Persons of Danish origin,Denmark,Denmark,38 years,Girls,2022,1
389370,Ærø,Persons of Danish origin,Denmark,Denmark,41 years,Girls,2022,2
389371,Ærø,Persons of Danish origin,Denmark,Denmark,42 years,Boys,2022,1
389372,Ærø,Persons of Danish origin,Denmark,Denmark,44 years,Girls,2022,1


It seems that the 8th column (variable) is 'INDHOLD'. 'INDHOLD' appears to show the number of births in the specific category defined by the other 7 variables.

If we sort by 'INDHOLD' to examine the rows where 'INDHOLD' is large, we would expect to see the category 'Persons of Danish origin' and mothers aged roughly in their late twenties. We examine this:

In [8]:
# Show the five rows with the largest birth counts.
# The sort is done on a temporary copy instead of in place, so `inc_api` keeps the
# order established after the download and the earlier head()/tail() cells still
# reproduce their output when re-run.
inc_api.sort_values(by=['INDHOLD']).tail(5)

Unnamed: 0,OMRÅDE,MOHERK,MOOPRIND,MOSTAT,MODERSALDER,BARNKON,TID,INDHOLD
360699,All Denmark,Persons of Danish origin,Denmark,Denmark,29 years,Boys,2007,2359
351557,All Denmark,Persons of Danish origin,Denmark,Denmark,31 years,Girls,2007,2361
227982,All Denmark,Persons of Danish origin,Denmark,Denmark,29 years,Boys,2021,2371
333961,All Denmark,Persons of Danish origin,Denmark,Denmark,31 years,Boys,2007,2400
333962,All Denmark,Persons of Danish origin,Denmark,Denmark,30 years,Boys,2007,2410


As seen above that is what we found. Thus it seems that 'INDHOLD' is the birth count for the category.

We now have a dataset we can analyze.

# Analysis

Now we would like to analyze the data. First, let us investigate the number of children born in a specific year, their gender, and the origin of the mother. Let us use the year 2019 as an example. See the code below:

In [9]:
# Select only the category 'All Denmark' from the area in which the child was born.
# One should choose either 'All Denmark' or all of the individual municipalities when
# analyzing the data to get the levels correct, as a child is registered in both categories.
all_denmark_data = inc_api[inc_api['OMRÅDE'] == 'All Denmark']

# Year used for this example, defined once so that the comment, the filter and the
# reported result cannot drift apart.
example_year = 2019

# Select the data for the example year with a boolean mask on the 'TID' column.
year_data = all_denmark_data[all_denmark_data['TID'] == example_year]

# Sum the birth counts by gender of child and origin of mother.
grouped_data = year_data.groupby(['BARNKON', 'MOHERK'])['INDHOLD'].sum()

# check data out
print(grouped_data)

BARNKON  MOHERK                  
Boys     Descendant                   1156
         Immigrants                   5781
         Persons of Danish origin    24582
Girls    Descendant                   1037
         Immigrants                   5430
         Persons of Danish origin    23181
Name: INDHOLD, dtype: int64


Let's check whether this is realistic. Summing over all groups gives the total number of live births in Denmark in 2019:

In [10]:
# Total number of live births across all gender/origin groups.
# Stored as `total_births` rather than `sum`, so that the built-in sum() is not
# shadowed for the remainder of the notebook session.
total_births = grouped_data.sum()
print("The total amount of live births in Denmark in 2019 was: " + str(total_births))

The total amount of live births in Denmark in 2019 was: 61167


A quick check on the [DST website](https://www.dst.dk/en/Statistik/emner/borgere/befolkning/foedsler) shows that the number of live births in Denmark in 2019 was 61167. The website uses the same table as we do, so this is not an independent check, but it does show that we have aggregated the categories correctly. Next, we build an interactive figure of the number of children born in a selected year, by gender of the child and origin of the mother. See the code below:

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Restrict the data to the nationwide 'All Denmark' aggregate of the birth area.
is_all_denmark = inc_api['OMRÅDE'] == 'All Denmark'
all_denmark_data = inc_api[is_all_denmark]

# Define a function to update the plot based on the selected year
def update_plot(year):
    """Draw a stacked bar chart of live births in 'All Denmark' for one year.

    The x-axis has one bar per gender of the child; each bar is stacked by the
    origin of the mother, and every non-empty segment is labelled with its count.

    Parameters
    ----------
    year : int
        Year to plot; compared against the 'TID' column of the module-level
        frame `all_denmark_data`.

    Returns
    -------
    None
        The figure is drawn as a side effect. If there are no rows for `year`,
        a message is printed and nothing is drawn.
    """
    # Select the data for the selected year
    year_data = all_denmark_data.loc[all_denmark_data['TID'] == year]

    # Plotting an empty frame raises, so stop early with a readable message instead.
    if year_data.empty:
        print(f'No data available for {year}.')
        return

    # Rows = gender of child, columns = origin of mother. fill_value=0 keeps a
    # gender/origin combination with no births as 0 rather than NaN, which would
    # otherwise break the integer bar labels below.
    stacked_data = (
        year_data
        .groupby(['BARNKON', 'MOHERK'])['INDHOLD']
        .sum()
        .unstack(level='MOHERK', fill_value=0)
    )

    # Create the stacked bar chart
    ax = stacked_data.plot(kind='bar', stacked=True)

    # Move the legend to the right of the plot; the stacked segments are the mother origins.
    ax.legend(title='Mother Origin', loc='center left', bbox_to_anchor=(1.0, 0.5))

    # Add chart labels and title. The x-axis holds the gender of the child, not the
    # origin of the mother (that is the stacking dimension shown in the legend).
    ax.set_title(f'Number of Children Born in Denmark in {year} by Mother Origin and Gender')
    ax.set_xlabel('Gender of Child')
    ax.set_ylabel('Number of Children Born')

    # Add labels to the stacked bars
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            # Nothing to label for an empty segment.
            if not height > 0:
                continue
            # A stacked segment starts at bar.get_y(), not at zero, so its vertical
            # centre is its bottom plus half its height.
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_y() + height / 2,
                str(int(height)),
                ha='center', va='center', color='black', fontsize=12,
            )

# Year selector; the plot is redrawn only when the slider is released.
year_slider = widgets.IntSlider(
    value=2019,
    min=2008,
    max=2021,
    step=1,
    description='Year',
    continuous_update=False,
)
widgets.interact(update_plot, year=year_slider)

interactive(children=(IntSlider(value=2019, continuous_update=False, description='Year', max=2021, min=2008), …

<function __main__.update_plot(year)>

# Merge data sets

# Conclusion