# GDP and Population

> **Note the following:** 
> 1. This is *not* meant to be an example of an actual **data analysis project**, just an example of how to structure such a project.
> 1. Remember the general advice on structuring and commenting your code
> 1. The `dataproject.py` file includes a function which can be used multiple times in this notebook.

Imports and set magics:

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from matplotlib_venn import venn2
import pandas_datareader.data as web
 

# it is nessesary to install the extensipn below. 
#%pip install eurostat
import eurostat


# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# We access the data from eurostat adn name it df
df = eurostat.get_data_df('nama_10_gdp')
display(df)


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,CLV05_MEUR,B1G,AT,,,,,,,...,251167.1,252879.7,255017.1,259996.4,266016.9,272985.2,277001.4,259083.6,269392.5,283659.7
1,A,CLV05_MEUR,B1G,BA,,,,,,,...,9118.0,9224.4,9358.7,9658.1,9978.2,10355.7,10650.6,10358.6,11080.5,11427.6
2,A,CLV05_MEUR,B1G,BE,,,,,,,...,308495.4,313684.3,320657.4,323677.5,328736.7,334655.5,342451.3,325503.1,344132.7,354684.3
3,A,CLV05_MEUR,B1G,BG,,,,,,,...,25127.0,25373.6,26129.3,26806.7,27691.2,28654.5,29721.9,28535.0,30816.2,31876.7
4,A,CLV05_MEUR,B1G,CH,,,,,,,...,381505.9,390405.6,396747.1,405030.0,410565.5,422877.6,427969.5,418277.5,435957.0,445107.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29630,A,PYP_MNAC,YA1,PT,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-27.4,156.0
29631,A,PYP_MNAC,YA1,RO,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29632,A,PYP_MNAC,YA1,SI,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29633,A,PYP_MNAC,YA1,UK,,,,,,,...,,,,,,,,,,


In [25]:
# We choose which rows that we want to see.
    # we have chosen to se the gross domestic product in Chain linked volumes (2015), million euro. 
gdp = df[df['na_item'] == 'B1GQ']
gdp = gdp[gdp['unit']=='CLV15_MEUR']

gdp.head(20)


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,A,CLV15_MEUR,B1GQ,AL,,,,,,,...,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,A,CLV15_MEUR,B1GQ,AT,,,,,,,...,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,A,CLV15_MEUR,B1GQ,BA,,,,,,,...,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,A,CLV15_MEUR,B1GQ,BE,,,,,,,...,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,A,CLV15_MEUR,B1GQ,BG,,,,,,,...,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,A,CLV15_MEUR,B1GQ,CH,,,,,,340397.8,...,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,A,CLV15_MEUR,B1GQ,CY,,,,,,,...,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,A,CLV15_MEUR,B1GQ,CZ,,,,,,,...,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,A,CLV15_MEUR,B1GQ,DE,,,,,,,...,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,A,CLV15_MEUR,B1GQ,DK,129987.3,137688.5,140263.7,143386.4,148934.2,148214.9,...,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [26]:
# We remove the columns freq, unit, na_item, and the years 1975-2011
drop_these = ['freq' , 'unit', 'na_item'] + [str(i) for i in range(1975,2012,1)] 
print(drop_these)

gdp.drop(drop_these, axis=1, inplace=True) # axis = 1 -> columns, inplace=True -> changed, no copy made
gdp.head(10)



['freq', 'unit', 'na_item', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011']


Unnamed: 0,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [27]:
# We rename the coloumn geo\TIME_PERIOD
gdp.rename(columns={'geo\TIME_PERIOD': 'Country_code'}, inplace=True)

In [28]:
remove_these = ['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']
print(remove_these)

for i in remove_these : 
    gdp = gdp[gdp['Country_code']!= i]

gdp

['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']


Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [29]:
# we are resetting the index
gdp.reset_index(inplace = True, drop = True)
gdp

Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
1,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
2,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
4,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
5,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
6,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
7,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
8,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
9,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


We are now adding another dataset; this dataset shows the population in the countries

In [30]:
# we name our parameters
code = 'DEMO_PJAN'
pars = eurostat.get_pars(code)

# We access the data that we need
my_filter_pars = {'startPeriod':2012,'endPeriod': 2022, 'sex': 'T', 'age':'TOTAL'}
population = eurostat.get_data_df(code, filter_pars=my_filter_pars)

population.head()


Unnamed: 0,freq,unit,age,sex,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,NR,TOTAL,T,AD,78115.0,76246.0,,,,,,76177.0,,,
1,A,NR,TOTAL,T,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,A,NR,TOTAL,T,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,A,NR,TOTAL,T,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,A,NR,TOTAL,T,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


In [31]:
# We rename the column geo\TIME_PERIOD

population.rename(columns={'geo\TIME_PERIOD': 'Country_code'}, inplace=True)

In [32]:
del_coloumns = ['freq' , 'unit', 'age', 'sex']

print( 'These are the deleted columns:' , del_coloumns)

population.drop(columns=del_coloumns, axis=1, inplace=True) 

population.head()

These are the deleted columns: ['freq', 'unit', 'age', 'sex']


Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AD,78115.0,76246.0,,,,,,76177.0,,,
1,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


We are now chaning the direction of the two datasets, making them long rather than wide. 

In [38]:
population_long = pd.wide_to_long(population , stubnames='' , i= 'Country_code', j= 'year')



gdp_long = pd.wide_to_long(gdp, stubnames= '', i= 'Country_code' , j= 'year')

print('This is the data for poppulation' , population_long.head(10))
print('This is the data for gdp',gdp_long.head(10))



This is the data for poppulation                              
Country_code year            
AD           2012     78115.0
AL           2012   2903008.0
AM           2012   3274285.0
AT           2012   8408121.0
AZ           2012   9235085.0
BA           2012   3839265.0
BE           2012  11075889.0
BG           2012   7327224.0
BY           2012   9465150.0
CH           2012   7954662.0
This is the data for gdp                             
Country_code year           
AL           2012     9768.4
AT           2012   338486.5
BA           2012    13695.8
BE           2012   400181.0
BG           2012    44117.0
CH           2012   590695.9
CY           2012    18910.4
CZ           2012   157402.2
DE           2012  2904527.6
DK           2012   260089.6


In [44]:
gdp_long.describe

<bound method NDFrame.describe of                            
Country_code year          
AL           2012    9768.4
AT           2012  338486.5
BA           2012   13695.8
BE           2012  400181.0
BG           2012   44117.0
...                     ...
SI           2022   49542.2
SK           2022   90751.6
TR           2022       NaN
UK           2022       NaN
XK           2022    7381.7

[418 rows x 1 columns]>

We will now merge the two datasets, by doing an inner join; 
meaning we choose the observations (countries) which are in both datasets. 

In [40]:
inner = pd.merge(gdp_long, population_long, how = 'inner' , on = ['Country_code' , 'year'])
inner

Unnamed: 0_level_0,Unnamed: 1_level_0,_x,_y
Country_code,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,2012,9768.4,2903008.0
AT,2012,338486.5,8408121.0
BA,2012,13695.8,3839265.0
BE,2012,400181.0,11075889.0
BG,2012,44117.0,7327224.0
...,...,...,...
SI,2022,49542.2,2107180.0
SK,2022,90751.6,5434712.0
TR,2022,,
UK,2022,,


In [42]:
inner.rename(columns={'_x': 'GDP', '_y':'Population'}, inplace=True)
inner

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP,Population
Country_code,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,2012,9768.4,2903008.0
AT,2012,338486.5,8408121.0
BA,2012,13695.8,3839265.0
BE,2012,400181.0,11075889.0
BG,2012,44117.0,7327224.0
...,...,...,...
SI,2022,49542.2,2107180.0
SK,2022,90751.6,5434712.0
TR,2022,,
UK,2022,,


In [45]:
# we are checking how many datapoints we have data for, 
# eg. how many non-null observations we have. 
inner.count()

GDP           412
Population    398
dtype: int64

We will now remove alle the countries that have NaN's for all the values of either GDP or Populaiton: 

We will now calculate GDP per Capita, for each country and year. 

We will now make an interactive figure that shows the trend in GDP per capita over the years. It should be possible to change which country the data is shown for. 

We will now visualize the data as an interactive scatterplot, showing GDP per capita on the x-axis and the population on the y-axis. 
It should be possible to change the year that is viewed. 

We will now do some standard caluculations for the data, mean value, average, etc. 

In [48]:
inner.describe()

Unnamed: 0,GDP,Population
count,412.0,398.0
mean,437879.1,16520090.0
std,722826.1,23272090.0
min,3353.7,319575.0
25%,26778.33,2078076.0
50%,171709.3,6981902.0
75%,438009.2,11441290.0
max,3261012.0,83614360.0


This will have to be done in a better way. 