# Football in Denmark: Where are we playing?

Imports and set magics:

In [None]:
# %pip install git+https://github.com/alemartinello/dstapi #Installing the API (only need to do once)

In [None]:
# %pip install pandas-datareader # Installing the data reader (only need to do once)

In [None]:
# %pip install geopandas 
# Installing the geopandas reader (only need to do once)

In [None]:
import pandas as pd
from IPython.display import Image
import numpy as np
from matplotlib.ticker import FuncFormatter
import geopandas as gpd
import matplotlib.pyplot as plt
import ipywidgets as widgets
# Global matplotlib defaults for every figure in the notebook:
# dashed, semi-transparent black grid lines and a larger base font.
plt.rcParams.update({
    "axes.grid": True,
    "grid.color": "black",
    "grid.alpha": "0.25",
    "grid.linestyle": "--",
    "font.size": 14,
})
import ipywidgets as widgets
# from matplotlib_venn import venn2
from dstapi import DstApi # install with `pip install git+https://github.com/alemartinello/dstapi`

# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


# Read data

Creating a dictionary for the columns names:

In [None]:
# Maps the raw column codes of the downloaded tables to readable English names
columns_dict = {
    'BLSTKOM': 'region',
    'AKTIVITET': 'activity',
    'KON': 'sex',
    'ALDER1': 'age',
    'TID': 'year',
    'POP': 'population',
    'INDHOLD': 'players',
}

**Step 1:** Downloading all of the football variables from the IDRAKT01 table

In [None]:
# Handle for the IDRAKT01 table on the DST API server
idrakt_api = DstApi('IDRAKT01')

# Base query parameters, with the language set to English
params = idrakt_api._define_base_params(language='en')

# Restrict the variable at position 1 to football, whose activity ID is A22
# (approach taken from https://alemartinello.com/2022/02/24/dstapi/)
params['variables'][1]['values'] = ['A22']

# `variables` is the same list object that is stored inside `params`
variables = params['variables']
print(variables)

In [None]:
# Download the dataset using the parameter dictionary currently bound to `params`;
# the result is used as a DataFrame in the cleaning cells below
idrakt = idrakt_api.get_data(params=params)

**Step 1A:** Downloading population data from FOLK1A

In [None]:
# Handle for the FOLK1A table on the DST API server
folk_api = DstApi('FOLK1A')

# Base query parameters, with the language set to English.
# NOTE: this rebinds `params` (and `variables` below), which the IDRAKT01 download cell also
# reads, so that download must not be re-run after this cell without rebuilding its parameters.
params = folk_api._define_base_params(language='en')

# Restrict the variables at positions 2 and 3 to a single value each
# (presumably the totals for age and marital status; confirm against the printed list)
params['variables'][2]['values'] = ['IALT']
params['variables'][3]['values'] = ['TOT']

# `variables` is the same list object that is stored inside `params`
variables = params['variables']
print(variables)

In [None]:
# Download the dataset using the parameter dictionary currently bound to `params`;
# the result is used as a DataFrame in the cleaning cells below
folk = folk_api.get_data(params=params)

# Clean data

**Cleaning population data (FOLK1A)**

In [None]:
# Drop the ALDER and CIVILSTAND columns, which are not used in the rest of the notebook.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell re-runnable:
# executing it a second time no longer raises KeyError for the already-removed columns.
folk = folk.drop(columns=['ALDER', 'CIVILSTAND'], errors='ignore')

In [None]:
# Keep only first-quarter observations. The .copy() makes folk_q1 an independent frame, so the
# column assignments below do not write to a slice of `folk` (avoids SettingWithCopyWarning).
folk_q1 = folk.loc[folk['TID'].str.contains('Q1', regex=False)].copy()

# Relabel values so they line up with the labels used for the football data.
# regex=False: the patterns are literal strings, independent of the pandas default.
folk_q1['TID'] = folk_q1['TID'].str.replace('Q1', '', regex=False)
folk_q1['KØN'] = folk_q1['KØN'].str.replace('Total', 'Sex, total', regex=False)
folk_q1['OMRÅDE'] = folk_q1['OMRÅDE'].str.replace('Landsdel', 'Province', regex=False)

# Define a list of the regions in the dataset (also used when filtering the football data)
regions = ['All Denmark', 'Region Nordjylland', 'Region Midtjylland', 'Region Syddanmark', 'Region Sjælland', 'Region Hovedstaden']

# Keep only rows where 'OMRÅDE' matches one of the regions; copy again so that later
# column assignments on folk_q1 operate on an independent frame
folk_q1 = folk_q1[folk_q1['OMRÅDE'].isin(regions)].copy()
folk_q1.head()

**Cleaning football data (IDRAKT01)**

**Step 2:** Only keep rows where the age variable is `Age, total`, and afterwards delete the column.

In [None]:
# Keep only the rows with age = 'Age, total', then remove the age column
idrakt = idrakt.loc[idrakt['ALDER1'] == 'Age, total'].drop(columns=['ALDER1'])

Summarizing provinces to regions:

In [None]:
# Lookup table from province label to the region it belongs to
_PROVINCE_TO_REGION = {
    'Province Nordjylland': 'Region Nordjylland',
    'Province Vestjylland': 'Region Midtjylland',
    'Province Østjylland': 'Region Midtjylland',
    'Province Fyn': 'Region Syddanmark',
    'Province Sydjylland': 'Region Syddanmark',
    'Province Østsjælland': 'Region Sjælland',
    'Province Vest- og Sydsjælland': 'Region Sjælland',
    'Province Bornholm': 'Region Hovedstaden',
    'Province Byen København': 'Region Hovedstaden',
    'Province Københavns omegn': 'Region Hovedstaden',
    'Province Nordsjælland': 'Region Hovedstaden',
}


def map_regions(BLSTKOM):
    """Return the region for a province label.

    Any label that is not a known province (including 'All Denmark') is
    returned unchanged.
    """
    return _PROVINCE_TO_REGION.get(BLSTKOM, BLSTKOM)

# Replace each province label in 'BLSTKOM' with its region label
idrakt = idrakt.assign(BLSTKOM=idrakt['BLSTKOM'].apply(map_regions))

Only keeping regions and all of Denmark

In [None]:
# Boolean mask: True where 'BLSTKOM' is one of the entries in `regions`
is_region_row = idrakt['BLSTKOM'].isin(regions)
idrakt = idrakt.loc[is_region_row]

# Preview the first rows to verify the filtering
idrakt.head()

## Merging the two datasets and sorting:

In [None]:
# The merge keys must share a dtype, so the period column is cast to string on both sides.
# .assign() builds new frames instead of writing into the previously filtered ones.
idrakt = idrakt.assign(TID=idrakt['TID'].astype(str))
folk_q1 = folk_q1.assign(TID=folk_q1['TID'].astype(str))

consolidated_df = pd.merge(
    idrakt,
    folk_q1.rename(columns={'INDHOLD': 'POP'}),  # Rename INDHOLD to POP in folk_q1 before merging
    how='left',  # Keep all rows from idrakt
    left_on=['BLSTKOM', 'KON', 'TID'],  # Columns to match in idrakt
    right_on=['OMRÅDE', 'KØN', 'TID'],  # Corresponding columns to match in folk_q1
)
# Drop the right-hand key columns, which duplicate BLSTKOM and KON after the merge
consolidated_df = consolidated_df.drop(columns=['OMRÅDE', 'KØN'])


def first_value(series):
    """Return the first element of a group (used for the repeated population figure)."""
    return series.iloc[0]


# Several province rows now share the same region label, so players are summed within each
# region/sex/year group, while the population (identical on every row of a group, since it
# was merged on the region label) is taken once.
# The columns are renamed BEFORE grouping: grouping on 'region', 'sex' and 'year' while the
# frame still carried the raw names (BLSTKOM, KON, TID, INDHOLD, POP) raised a KeyError.
merged_df = (
    consolidated_df
    .rename(columns=columns_dict)
    .groupby(['region', 'sex', 'year'])
    .agg({'players': 'sum', 'population': first_value})
    .reset_index()
)

Sort the dataset by region, year and sex

In [None]:
# Sort by region, then year, then sex, with sex following a custom (non-alphabetical) order
sort_order = ['Sex, total', 'Men', 'Women']

# An ordered categorical makes sort_values respect the custom order of `sort_order`
merged_df = (
    merged_df
    .assign(sex=pd.Categorical(merged_df['sex'], categories=sort_order, ordered=True))
    .sort_values(by=['region', 'year', 'sex'])
    .reset_index(drop=True)
)
merged_df.head()

# Analysis across genders

Here we calculate the total number of Danish football players split by sex. The graph is produced by the function `total_players_dk` in the py-file.

In [None]:
# total_players_dk comes from the user-written dataproject module
from dataproject import total_players_dk
total_players_dk(merged_df)
# Display the saved plot in the notebook
# NOTE(review): presumably total_players_dk writes 'stacked_bar_chart.png' to the working directory; confirm in dataproject.py
Image(filename='stacked_bar_chart.png')

It is hard to see the relative development of each sex and of the total. Hence, we index the values and display them here.

In [None]:
# index_players_dk comes from the user-written dataproject module and is called on merged_df
from dataproject import index_players_dk
index_players_dk(merged_df)
# NOTE(review): presumably index_players_dk writes 'indexed_players_chart.png' to the working directory; confirm in dataproject.py
Image(filename='indexed_players_chart.png')

From 2018 to 2019 the number of women players suddenly increases a lot. The lockdown in 2020 is visible for both sexes. However, afterwards the number of women footballers continues to increase at a much higher rate than the number of men. In fact, we see that the increase in football players does not at all correspond to the increase in population. That could be because we look at the total population, and not the population in the age group 6-60 years (the age group where most football players are).

**Annual growth contributions**

In [None]:
# plot_growth_contributions_all_denmark comes from the user-written dataproject module
from dataproject import plot_growth_contributions_all_denmark
plot_growth_contributions_all_denmark(merged_df)
# Display the saved plot
# NOTE(review): presumably the function above writes 'growth_contributions_all_denmark.png' to the working directory; confirm in dataproject.py
Image(filename='growth_contributions_all_denmark.png')

The figure above shows that even though the number of women players increases a lot, the contributions to the overall growth are largely dominated by men because of the larger base.

# Analysis across geography

Plotting the number of football players by region in an interactive graph. 

Click the names in the legend to hide/show the data for a specific region or All Denmark.

In [None]:
# process_data and plot_data come from the user-written dataproject module
from dataproject import process_data, plot_data

# Process the data (input: the merged and sorted frame built above)
grouped_df = process_data(merged_df)

# Plot the data; the surrounding text describes this graph as interactive
plot_data(grouped_df)

Comment: The graph shows that just two regions have increased in 2022 compared to 2014, i.e. Region Hovedstaden and Region Nordjylland. The other regions are decreasing or stagnating. However, the tendency across regions is the same as … *(TODO: complete this sentence)*

**Making a plot of the share of players in each region compared to the population in the given region.**

This plot is interactive as well. Click the names in the legend to hide/show the data for a specific region or All Denmark.

In [None]:
# calculate_player_share and plot_share_data come from the user-written dataproject module
from dataproject import calculate_player_share, plot_share_data

# Calculate the share of players in population (input: the merged and sorted frame built above)
processed_df = calculate_player_share(merged_df)

# Plot the data; the surrounding text describes this plot as interactive
plot_share_data(processed_df)

Making a plot of the share of football players across counties

In [None]:
from dataproject import calculate_geographic_shares

# Shares computed by the dataproject helper from the cleaned football data in `idrakt`
geographic_shares = calculate_geographic_shares(idrakt)

import matplotlib.pyplot as plt

# Bar plot of the 'share' column against the 'county' column
ax = geographic_shares.plot(kind='bar', x='county', y='share')
ax.set_title('Share of Football Players by Geography')
ax.set_xlabel('Geography')
ax.set_ylabel('Share')
plt.show()


# Conclusion

ADD CONCISE CONCLUSION.

We see that...