# YOUR PROJECT TITLE

> **Note the following:** 
> 1. This is *not* meant to be an example of an actual **data analysis project**, just an example of how to structure such a project.
> 1. Remember the general advice on structuring and commenting your code
> 1. The `dataproject.py` file includes a function which can be used multiple times in this notebook.

In [None]:
# One-time setup: uncomment and run the lines below if a package is missing.

# The DST API wrapper (provides the `dstapi` module imported below)
#%pip install git+https://github.com/alemartinello/dstapi

# A wrapper for multiple APIs with a pandas interface
#%pip install pandas-datareader

Imports and set magics:

In [None]:
import pandas as pd
import numpy as np
import ipywidgets as widgets

import datetime
import pandas_datareader 
from dstapi import DstApi

import matplotlib.pyplot as plt
# Figure defaults for the whole notebook: dashed, semi-transparent black grid
# lines and a larger base font size.
plt.rcParams.update({
    "axes.grid": True,
    "grid.color": "black",
    "grid.alpha": "0.25",
    "grid.linestyle": "--",
    "font.size": 14,
})

# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


# Read and clean data

Importing data from DST through an API

In [None]:
# Handle for the DST table LONS50; all requests for the first data set go through it
ind = DstApi("LONS50")

Getting a quick overview of the data

In [None]:
# English-language summary of the table; its 'variable name' column is reused below
tabsum = ind.tablesummary(language="en")
display(tabsum)

In [None]:
# List the available values of each variable in the table
for var_name in tabsum["variable name"]:
    print(f"{var_name}:")
    display(ind.variable_levels(var_name, language="en"))


In [None]:
# _define_base_params returns a request template that selects all available data;
# it is displayed here and narrowed down by hand in the next cell.
params = ind._define_base_params(language="en")
params

In [None]:
# Requested values per variable code. ALDER1 and SEKTOR are restricted to a few
# levels, AFLOEN/LONGRP/LØNMÅL/KØN are pinned to a single level each, and
# 'Tid' uses the wildcard to fetch every period.
lons50_selection = {
    'ALDER1': ['20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-'],
    'SEKTOR': ['1018', '1016', '1046'],
    'AFLOEN': ['TIFA'],
    'LONGRP': ['LTOT'],
    'LØNMÅL': ['FORINKL'],
    'KØN': ['MOK'],
    'Tid': ['*'],
}

# Request payload in the layout shown by the base-params template
params = {
    'table': 'lons50',
    'format': 'BULK',
    'lang': 'en',
    'variables': [
        {'code': code, 'values': values}
        for code, values in lons50_selection.items()
    ],
}


In [None]:
# Fetch the selection defined in `params`, then show the frame's structure
# (.info()) and its first rows (.head()).
inc_api = ind.get_data(params=params)
inc_api.info()
inc_api.head()

In [None]:
# Order the rows by age group, then period, then sector
inc_api = inc_api.sort_values(by=['ALDER1', 'TID', 'SEKTOR'])
inc_api.head()

In [None]:
# Cast the value column to float so it can be used numerically; print the dtype as a check
inc_api = inc_api.astype({'INDHOLD': float})
print(inc_api['INDHOLD'].dtype)

In [None]:
# Cast the period column to float as well; print the dtype as a check
inc_api = inc_api.astype({'TID': float})
print(inc_api['TID'].dtype)

Second data set (LONS30)

In [None]:
# Handle for the DST table LONS30; all requests for the second data set go through it
ind1 = DstApi("LONS30")

In [None]:
# English-language summary of the second table; its 'variable name' column is reused below
tabsum1 = ind1.tablesummary(language="en")
display(tabsum1)

In [None]:
# List the available values of each variable in the second table (LONS30).
# The variable names come from `tabsum1`, so the levels must be looked up on
# `ind1` as well; querying `ind` (the LONS50 handle) would describe the wrong table.
for variable in tabsum1['variable name']:
    print(f'{variable}:')
    display(ind1.variable_levels(variable, language='en'))

In [None]:
# _define_base_params returns a request template that selects all available data;
# it is displayed here and narrowed down by hand in the next cell.
params1 = ind1._define_base_params(language="en")
params1

In [None]:
# Requested values per variable code. OMRÅDE and 'Tid' use the wildcard (all
# levels), SEKTOR is restricted to three codes, and AFLOEN/LONGRP/LØNMÅL/KØN
# are pinned to a single level each.
lons30_selection = {
    'OMRÅDE': ['*'],
    'SEKTOR': ['1018', '1016', '1046'],
    'AFLOEN': ['TIFA'],
    'LONGRP': ['LTOT'],
    'LØNMÅL': ['FORINKL'],
    'KØN': ['MOK'],
    'Tid': ['*'],
}

# Request payload in the layout shown by the base-params template
params1 = {
    'table': 'lons30',
    'format': 'BULK',
    'lang': 'en',
    'variables': [
        {'code': code, 'values': values}
        for code, values in lons30_selection.items()
    ],
}

In [None]:
# Fetch the selection defined in `params1`, then show the frame's structure
# (.info()) and its first rows (.head()).
inc1_api = ind1.get_data(params=params1)
inc1_api.info()
inc1_api.head()

In [None]:
# Order the rows by area, then period, then sector
inc1_api = inc1_api.sort_values(by=['OMRÅDE', 'TID', 'SEKTOR'])
inc1_api.head()

In [None]:
# Cast the value column to float so it can be used numerically; print the dtype as a check
inc1_api = inc1_api.astype({'INDHOLD': float})
print(inc1_api['INDHOLD'].dtype)

In [None]:
# Cast the period column to float as well; print the dtype as a check
inc1_api = inc1_api.astype({'TID': float})
print(inc1_api['TID'].dtype)

Cleaning data

In [None]:
#Dropping irrelevant variables from first data set
# These four columns were each requested with a single value, so they carry
# no information for the age-group data set; drop them.
Inc_age = inc_api.drop(columns=['LONGRP', 'LØNMÅL', 'AFLOEN', 'KØN'])
Inc_age.head(10)

In [None]:
# Dropping irrelevant variables from the second data set: these four columns
# were each requested with a single value, so they carry no information.
Inc_area = inc1_api.drop(columns=['LONGRP', 'LØNMÅL', 'AFLOEN', 'KØN'])
Inc_area.head(10)

# Merge data sets

In [None]:
# Remove rows with missing values and exact duplicate rows before merging
Inc_age_cleaned = Inc_age.dropna().drop_duplicates()
Inc_area_cleaned = Inc_area.dropna().drop_duplicates()

# Inner join on the shared keys 'TID' and 'SEKTOR': only key combinations that
# occur in both data sets are kept. `how` is spelled out so the code matches
# this description instead of relying on the default.
# Columns present in both frames get the suffixes '_age' / '_area'.
# NOTE(review): neither frame is unique on (TID, SEKTOR) - age groups vary in
# one and areas in the other - so every age row is paired with every area row
# of the same period and sector. Confirm that this cross-product is intended.
merged_df = pd.merge(
    Inc_age_cleaned,
    Inc_area_cleaned,
    how='inner',
    on=['TID', 'SEKTOR'],
    suffixes=('_age', '_area'),
)

# Check the merged data (rich notebook table instead of plain-text print)
merged_df.head()

# Analysis

In [None]:
# Latest period in the merged data; 'TID' is numeric, so convert the maximum to a plain int
latest_tid = merged_df['TID'].max()
most_recent_year = int(latest_tid)
print(f"Most recent year in the dataset: {most_recent_year}")


In [None]:
pip install seaborn


In [None]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


In [None]:
import seaborn as sns

# Set the seaborn style for better aesthetics
sns.set_style("whitegrid")

# Keep only the rows of the most recent year. The plots below read
# `merged_df_recent_year`, which no earlier cell defines, so it is built here
# to keep the notebook runnable from a fresh kernel.
merged_df_recent_year = merged_df[merged_df['TID'] == most_recent_year]


def _plot_income_boxplot(data, x, y, palette, title, xlabel):
    """Draw one income box plot with outliers shown and group means marked.

    Parameters
    ----------
    data : DataFrame containing the columns named by `x` and `y`.
    x : name of the column whose groups are laid out along the x-axis.
    y : name of the column holding the income values.
    palette : seaborn palette name used to colour the boxes.
    title : figure title.
    xlabel : label of the x-axis.
    """
    plt.figure(figsize=(18, 10))
    sns.boxplot(x=x, y=y, data=data, palette=palette, showfliers=True, showmeans=True,
                meanprops={"marker":"o", "markerfacecolor":"white", "markeredgecolor":"black", "markersize":"10"})
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Income', fontsize=14)
    plt.xticks(rotation=45, fontsize=12)  # Ensure labels are readable
    plt.tight_layout()  # Adjust layout to make room for label
    plt.show()


# For Age Groups
_plot_income_boxplot(
    merged_df_recent_year, x='ALDER1', y='INDHOLD_age', palette="Set3",
    title=f'Income Distribution by Age Group for {most_recent_year}',
    xlabel='Age Group',
)

# For Areas
_plot_income_boxplot(
    merged_df_recent_year, x='OMRÅDE', y='INDHOLD_area', palette="Set2",
    title=f'Income Distribution by Area for {most_recent_year}',
    xlabel='Area',
)


## Explore each data set

In [None]:
pip install plotly


In [None]:
pip install 'nbformat>=4.2.0'


In [None]:
import plotly.express as px


In [None]:
# Ensure 'TID' is the correct data type for plotting
merged_df['TID'] = pd.to_datetime(merged_df['TID'], format='%Y').dt.year

# Plotting
fig_age = px.line(merged_df, x='TID', y='INDHOLD_age', color='ALDER1', 
                  labels={'TID': 'Year', 'INDHOLD_age': 'Income', 'ALDER1': 'Age Group'},
                  title='Income Development Over Time by Age Group')
fig_age.update_layout(transition_duration=500)
fig_age.show()


In [None]:
# Plotting
fig_area = px.line(merged_df, x='TID', y='INDHOLD_area', color='OMRÅDE', 
                   labels={'TID': 'Year', 'INDHOLD_area': 'Income', 'OMRÅDE': 'Area'},
                   title='Income Development Over Time by Area')
fig_area.update_layout(transition_duration=500)
fig_area.show()

# Conclusion

ADD CONCISE CONCLUSION.