# GDP and Population

> **Note the following:** 
> 1. This is *not* meant to be an example of an actual **data analysis project**, just an example of how to structure such a project.
> 1. Remember the general advice on structuring and commenting your code
> 1. The `dataproject.py` file includes a function which can be used multiple times in this notebook.

Imports and set magics:

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from matplotlib_venn import venn2
import pandas_datareader.data as web
 

# If not already installed, it is nessesary to install the extensipn below. 
#%pip install eurostat
import eurostat


# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# We access the data from eurostat adn name it df
df = eurostat.get_data_df('nama_10_gdp')
display(df)


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,CLV05_MEUR,B1G,AT,,,,,,,...,251167.1,252879.7,255017.1,259996.4,266016.9,272985.2,277001.4,259083.6,269392.5,283659.7
1,A,CLV05_MEUR,B1G,BA,,,,,,,...,9118.0,9224.4,9358.7,9658.1,9978.2,10355.7,10650.6,10358.6,11080.5,11427.6
2,A,CLV05_MEUR,B1G,BE,,,,,,,...,308495.4,313684.3,320657.4,323677.5,328736.7,334655.5,342451.3,325503.1,344132.7,354684.3
3,A,CLV05_MEUR,B1G,BG,,,,,,,...,25127.0,25373.6,26129.3,26806.7,27691.2,28654.5,29721.9,28535.0,30816.2,31876.7
4,A,CLV05_MEUR,B1G,CH,,,,,,,...,381505.9,390405.6,396747.1,405030.0,410565.5,422877.6,427969.5,418277.5,435957.0,445107.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29630,A,PYP_MNAC,YA1,PT,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-27.4,156.0
29631,A,PYP_MNAC,YA1,RO,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29632,A,PYP_MNAC,YA1,SI,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29633,A,PYP_MNAC,YA1,UK,,,,,,,...,,,,,,,,,,


In [26]:
# We choose which rows that we want to see.
    # we have chosen to se the gross domestic product in Chain linked volumes (2015), million euro. 
gdp = df[df['na_item'] == 'B1GQ']
gdp = gdp[gdp['unit']=='CLV15_MEUR']



In [27]:
# We remove the columns freq, unit, na_item, and the years 1975-2011
drop_these = ['freq' , 'unit', 'na_item'] + [str(i) for i in range(1975,2012,1)] 
##print(drop_these)

gdp.drop(drop_these, axis=1, inplace=True) # axis = 1 -> columns, inplace=True -> changed, no copy made



['freq', 'unit', 'na_item', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011']


In [28]:
# We rename the coloumn geo\TIME_PERIOD
gdp.rename(columns={'geo\TIME_PERIOD': 'Country_code'}, inplace=True)

In [29]:
# We remove the values for EA, EA12, EA19, EA20, EU15, EU27_2020, and EU28
# These values are of mulitple countries added together.
remove_these = ['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']
##print(remove_these)

for i in remove_these : 
    gdp = gdp[gdp['Country_code']!= i]


['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']


In [30]:
# we are resetting the index
gdp.reset_index(inplace = True, drop = True)
gdp.head(5)

Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
1,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
2,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
4,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7


We are now adding another dataset; this dataset shows the population in the countries

In [31]:
# we name our parameters
code = 'DEMO_PJAN'
pars = eurostat.get_pars(code)

# We access the data that we need
my_filter_pars = {'startPeriod':2012,'endPeriod': 2022, 'sex': 'T', 'age':'TOTAL'}
population = eurostat.get_data_df(code, filter_pars=my_filter_pars)

population.head()


Unnamed: 0,freq,unit,age,sex,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,NR,TOTAL,T,AD,78115.0,76246.0,,,,,,76177.0,,,
1,A,NR,TOTAL,T,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,A,NR,TOTAL,T,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,A,NR,TOTAL,T,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,A,NR,TOTAL,T,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


In [32]:
# We rename the column geo\TIME_PERIOD

population.rename(columns={'geo\TIME_PERIOD': 'Country_code'}, inplace=True)

In [33]:
del_coloumns = ['freq' , 'unit', 'age', 'sex']

print( 'These are the deleted columns:' , del_coloumns)

population.drop(columns=del_coloumns, axis=1, inplace=True) 

population.head()

These are the deleted columns: ['freq', 'unit', 'age', 'sex']


Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AD,78115.0,76246.0,,,,,,76177.0,,,
1,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


We are now chaning the direction of the two datasets, making them long rather than wide. 

In [34]:
population_long = pd.wide_to_long(population , stubnames='' , i= 'Country_code', j= 'year')



gdp_long = pd.wide_to_long(gdp, stubnames= '', i= 'Country_code' , j= 'year')

print('This is the data for poppulation' , population_long.head(5))
print('This is the data for gdp',gdp_long.head(5))



This is the data for poppulation                             
Country_code year           
AD           2012    78115.0
AL           2012  2903008.0
AM           2012  3274285.0
AT           2012  8408121.0
AZ           2012  9235085.0
This is the data for gdp                            
Country_code year          
AL           2012    9768.4
AT           2012  338486.5
BA           2012   13695.8
BE           2012  400181.0
BG           2012   44117.0


In [35]:
gdp_long.describe

<bound method NDFrame.describe of                            
Country_code year          
AL           2012    9768.4
AT           2012  338486.5
BA           2012   13695.8
BE           2012  400181.0
BG           2012   44117.0
...                     ...
SI           2022   49542.2
SK           2022   90751.6
TR           2022       NaN
UK           2022       NaN
XK           2022    7381.7

[418 rows x 1 columns]>

We will now merge the two datasets, by doing an inner join; 
meaning we choose the observations (countries) which are in both datasets. 

In [36]:
inner = pd.merge(gdp_long, population_long, how = 'inner' , on = ['Country_code' , 'year'])
inner

Unnamed: 0_level_0,Unnamed: 1_level_0,_x,_y
Country_code,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,2012,9768.4,2903008.0
AT,2012,338486.5,8408121.0
BA,2012,13695.8,3839265.0
BE,2012,400181.0,11075889.0
BG,2012,44117.0,7327224.0
...,...,...,...
SI,2022,49542.2,2107180.0
SK,2022,90751.6,5434712.0
TR,2022,,
UK,2022,,


In [37]:
inner.rename(columns={'_x': 'GDP', '_y':'Population'}, inplace=True)
inner.reset_index(inplace=True)

In [38]:
# we are checking how many datapoints we have data for, 
# eg. how many non-null observations we have. 
inner.count()

Country_code    418
year            418
GDP             412
Population      398
dtype: int64

We will now remove alle the countries that have NaN's for all the values of either GDP or Population: 

In [39]:
inner.dropna(inplace=True)

We will now calculate GDP per Capita, for each country and year. We do this by dividing our GDP column with our population column, and since our GDP column is in million euros, we also multiply by a million to get our GDP/Capita in euros. 

In [40]:
inner["GDP/Cap"] = inner["GDP"]*1000000/inner["Population"]
inner["GDP/Cap"]

0       3364.923555
1      40257.091923
2       3567.297386
3      36130.824352
4       6020.970561
           ...     
410    11009.977443
411     6591.409137
412    50198.501271
413    23511.138109
414    16698.511347
Name: GDP/Cap, Length: 396, dtype: float64

We will now make an interactive figure that shows the trend in GDP per capita over the years. It should be possible to change which country the data is shown for. 

In [41]:
def plot_e(inner, Country_code): 
    I = inner['Country_code'] == Country_code
    ax=inner.loc[I,:].plot(x='year', y='GDP/Cap', style='-o', legend=False)
    ax.set_xlim(inner['year'].min(), inner['year'].max())
    ax.set_ylabel('GDP per capita in euros')
    ax.set_title(f"GDP per capita 2012-2022 for {Country_code}")
    ax.set_xticks(np.arange(inner['year'].min(), inner['year'].max()+1))

widgets.interact(plot_e, 
    inner = widgets.fixed(inner),
    Country_code = widgets.Dropdown(name='Country_code', 
                                    options=inner.Country_code.unique(), 
                                    value='DK')
); 


interactive(children=(Dropdown(description='Country_code', index=9, options=('AL', 'AT', 'BA', 'BE', 'BG', 'CH…

We will now visualize the data as an interactive scatterplot, showing GDP per capita on the x-axis and the population on the y-axis. 
It should be possible to change the year that is viewed. 

In [42]:
def plot_f(inner, year):
    I = inner['year'] == year
    ax = inner.loc[I,:].plot(x='GDP/Cap', y='Population', style='o', legend=False)
    ax.set_ylabel('Population in millions')
    ax.set_xlabel('GDP per capita in euros')
    ax.set_title(f"Scatterplot of GDP per capita and Population for {year}")
    plt.subplots_adjust(left=0.2, right=1, top=0.9, bottom=0.1)
    plt.show()
    
year_widget = widgets.Dropdown(options=inner['year'].unique(), value=2022, description='Year:')
widgets.interact(plot_f, inner=widgets.fixed(inner), year=year_widget)

interactive(children=(Dropdown(description='Year:', index=10, options=(2012, 2013, 2014, 2015, 2016, 2017, 201…

<function __main__.plot_f(inner, year)>

We will now do some standard caluculations for the data, mean value, average, etc.
In this we also change the format of the output, to make it more readable. 

In [43]:
pd.options.display.float_format = '{:.2f}'.format
inner.describe()

Unnamed: 0,year,GDP,Population,GDP/Cap
count,396.0,396.0,396.0,396.0
mean,2016.93,455078.59,16427120.83,28464.5
std,3.14,732122.54,23181705.27,22191.51
min,2012.0,3353.7,319575.0,3364.92
25%,2014.0,36851.28,2076912.75,11794.94
50%,2017.0,177109.7,6981901.5,20282.0
75%,2020.0,452793.88,11412821.5,40758.39
max,2022.0,3261011.6,83614362.0,98633.75


This will have to be done in a better way. 