# GDP and Population

> **Note the following:** 
> 1. This is *not* meant to be an example of an actual **data analysis project**, just an example of how to structure such a project.
> 1. Remember the general advice on structuring and commenting your code
> 1. The `dataproject.py` file includes a function which can be used multiple times in this notebook.

Imports and set magics:

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from matplotlib_venn import venn2
import pandas_datareader.data as web
 

# It is necessary to install the eurostat package first (uncomment the line below).
#%pip install eurostat
import eurostat


# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Download the Eurostat table 'nama_10_gdp' as a DataFrame and name it df.
df = eurostat.get_data_df("nama_10_gdp")
display(df)


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,CLV05_MEUR,B1G,AT,,,,,,,...,251167.1,252879.7,255017.1,259996.4,266016.9,272985.2,277001.4,259083.6,269392.5,283659.7
1,A,CLV05_MEUR,B1G,BA,,,,,,,...,9118.0,9224.4,9358.7,9658.1,9978.2,10355.7,10650.6,10358.6,11080.5,11427.6
2,A,CLV05_MEUR,B1G,BE,,,,,,,...,308495.4,313684.3,320657.4,323677.5,328736.7,334655.5,342451.3,325503.1,344132.7,354684.3
3,A,CLV05_MEUR,B1G,BG,,,,,,,...,25127.0,25373.6,26129.3,26806.7,27691.2,28654.5,29721.9,28535.0,30816.2,31876.7
4,A,CLV05_MEUR,B1G,CH,,,,,,,...,381505.9,390405.6,396747.1,405030.0,410565.5,422877.6,427969.5,418277.5,435957.0,445107.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29630,A,PYP_MNAC,YA1,PT,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-27.4,156.0
29631,A,PYP_MNAC,YA1,RO,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29632,A,PYP_MNAC,YA1,SI,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29633,A,PYP_MNAC,YA1,UK,,,,,,,...,,,,,,,,,,


In [19]:
# We choose which rows we want to see: gross domestic product (na_item
# 'B1GQ') in chain linked volumes (2015), million euro (unit 'CLV15_MEUR').
# Both conditions are applied in one mask. The .copy() gives gdp its own data,
# so the in-place edits made to gdp further down do not operate on a slice of
# df (which can raise pandas' SettingWithCopyWarning).
gdp = df[(df['na_item'] == 'B1GQ') & (df['unit'] == 'CLV15_MEUR')].copy()

gdp.head(20)


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,A,CLV15_MEUR,B1GQ,AL,,,,,,,...,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,A,CLV15_MEUR,B1GQ,AT,,,,,,,...,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,A,CLV15_MEUR,B1GQ,BA,,,,,,,...,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,A,CLV15_MEUR,B1GQ,BE,,,,,,,...,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,A,CLV15_MEUR,B1GQ,BG,,,,,,,...,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,A,CLV15_MEUR,B1GQ,CH,,,,,,340397.8,...,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,A,CLV15_MEUR,B1GQ,CY,,,,,,,...,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,A,CLV15_MEUR,B1GQ,CZ,,,,,,,...,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,A,CLV15_MEUR,B1GQ,DE,,,,,,,...,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,A,CLV15_MEUR,B1GQ,DK,129987.3,137688.5,140263.7,143386.4,148934.2,148214.9,...,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [20]:
# We remove the column freq and the year columns 1975-2011.
# NOTE(review): 'unit' and 'na_item' hold a single value after the filtering
# above but are NOT dropped here - add them to the list if they should go too.
drop_these = ['freq'] + [str(year) for year in range(1975, 2012)]
print(drop_these)

# Rebind instead of using inplace=True: gdp was created by filtering df, and
# an in-place drop on such a frame can raise SettingWithCopyWarning.
gdp = gdp.drop(columns=drop_these)
gdp.head(10)



['freq', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011']


Unnamed: 0,unit,na_item,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,CLV15_MEUR,B1GQ,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,CLV15_MEUR,B1GQ,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,CLV15_MEUR,B1GQ,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,CLV15_MEUR,B1GQ,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,CLV15_MEUR,B1GQ,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,CLV15_MEUR,B1GQ,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,CLV15_MEUR,B1GQ,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,CLV15_MEUR,B1GQ,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,CLV15_MEUR,B1GQ,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,CLV15_MEUR,B1GQ,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [21]:
# We rename the column geo\TIME_PERIOD to Country_code.
# The key is a raw string because '\T' is not a valid escape sequence; a plain
# literal only works by accident and triggers an invalid-escape warning.
gdp = gdp.rename(columns={r'geo\TIME_PERIOD': 'Country_code'})

In [22]:
# Codes to exclude from the country list.
# NOTE(review): these look like euro-area / EU aggregates rather than single
# countries - confirm against the Eurostat geo code list.
remove_these = ['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']
print(remove_these)

# One vectorised membership test keeps every row whose Country_code is not in
# remove_these, instead of re-filtering the frame once per code.
gdp = gdp[~gdp['Country_code'].isin(remove_these)]

gdp

['EA', 'EA12', 'EA19', 'EA20', 'EU15', 'EU27_2020', 'EU28']


Unnamed: 0,unit,na_item,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
3948,CLV15_MEUR,B1GQ,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
3949,CLV15_MEUR,B1GQ,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
3950,CLV15_MEUR,B1GQ,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3951,CLV15_MEUR,B1GQ,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
3952,CLV15_MEUR,B1GQ,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
3953,CLV15_MEUR,B1GQ,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
3954,CLV15_MEUR,B1GQ,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
3955,CLV15_MEUR,B1GQ,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
3956,CLV15_MEUR,B1GQ,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
3957,CLV15_MEUR,B1GQ,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


In [23]:
# Replace the row labels left over from df with a fresh default index;
# drop=True discards the old labels instead of keeping them as a column.
gdp = gdp.reset_index(drop=True)
gdp

Unnamed: 0,unit,na_item,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,CLV15_MEUR,B1GQ,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,11689.0,11282.1,,
1,CLV15_MEUR,B1GQ,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,373337.1,349242.1,365156.5,383403.2
2,CLV15_MEUR,B1GQ,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,16842.2,16334.4,17541.8,18225.3
3,CLV15_MEUR,B1GQ,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,446283.8,422356.8,448263.6,462119.8
4,CLV15_MEUR,B1GQ,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,51822.6,49771.2,53571.0,55371.7
5,CLV15_MEUR,B1GQ,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,673288.2,657293.8,685042.9,699149.6
6,CLV15_MEUR,B1GQ,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,22543.4,21557.7,22988.9,24283.6
7,CLV15_MEUR,B1GQ,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,194453.0,183752.3,190280.5,194963.2
8,CLV15_MEUR,B1GQ,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,3241644.0,3121807.3,3203816.8,3261011.6
9,CLV15_MEUR,B1GQ,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,300014.6,294030.5,308307.8,320082.1


We now add a second dataset, which contains the population of each country.

In [24]:
# Eurostat code of the population dataset.
code = 'DEMO_PJAN'
# Presumably lists the parameters this dataset can be filtered on
# (TODO confirm against the eurostat package documentation).
pars = eurostat.get_pars(code)

# Download only the data we need: 2012-2022, sex 'T' and age 'TOTAL'.
my_filter_pars = {
    'startPeriod': 2012,
    'endPeriod': 2022,
    'sex': 'T',
    'age': 'TOTAL',
}
population = eurostat.get_data_df(code, filter_pars=my_filter_pars)

population.head()


Unnamed: 0,freq,unit,age,sex,geo\TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,NR,TOTAL,T,AD,78115.0,76246.0,,,,,,76177.0,,,
1,A,NR,TOTAL,T,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,A,NR,TOTAL,T,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,A,NR,TOTAL,T,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,A,NR,TOTAL,T,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


In [25]:
# We rename the column geo\TIME_PERIOD to Country_code.
# The key is a raw string because '\T' is not a valid escape sequence; a plain
# literal only works by accident and triggers an invalid-escape warning.
population = population.rename(columns={r'geo\TIME_PERIOD': 'Country_code'})

In [26]:
# Columns we do not need in the population table.
del_coloumns = ['freq', 'unit', 'age', 'sex']

print('These are the deleted columns:', del_coloumns)

# Drop them by name and keep the result under the same variable.
population = population.drop(columns=del_coloumns)

population.head()

These are the deleted columns: ['freq', 'unit', 'age', 'sex']


Unnamed: 0,Country_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AD,78115.0,76246.0,,,,,,76177.0,,,
1,AL,2903008.0,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
2,AM,3274285.0,,,3010598.0,2998577.0,2986151.0,2972732.0,2965269.0,2959694.0,2963251.0,
3,AT,8408121.0,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
4,AZ,9235085.0,9356483.0,9477119.0,9593038.0,9705643.0,9809981.0,9898085.0,9981457.0,10067108.0,10119133.0,


We will now merge the two datasets with an inner join,
meaning that we keep only the observations (countries) that appear in both datasets.

In [27]:
# Inner join on Country_code: only countries present in both tables are kept.
# The year columns exist in both frames, so pandas suffixes them with
# _x (from gdp) and _y (from population).
inner = gdp.merge(population, how='inner', on='Country_code')
inner

Unnamed: 0,unit,na_item,Country_code,2012_x,2013_x,2014_x,2015_x,2016_x,2017_x,2018_x,...,2013_y,2014_y,2015_y,2016_y,2017_y,2018_y,2019_y,2020_y,2021_y,2022_y
0,CLV15_MEUR,B1GQ,AL,9768.4,9866.2,10041.3,10264.1,10604.4,11007.6,11450.0,...,2897770.0,2892394.0,2885796.0,2875592.0,2876591.0,2870324.0,2862427.0,2845955.0,2829741.0,
1,CLV15_MEUR,B1GQ,AT,338486.5,338572.8,340811.7,344269.2,351118.3,359048.5,367756.8,...,8451860.0,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0
2,CLV15_MEUR,B1GQ,BA,13695.8,14017.6,14179.3,14791.1,15270.7,15766.1,16369.8,...,,,,,,,,,,
3,CLV15_MEUR,B1GQ,BE,400181.0,402018.8,408364.8,416701.4,421979.7,428814.0,436502.4,...,11137974.0,11180840.0,11237274.0,11311117.0,11351727.0,11398589.0,11455519.0,11522440.0,11554767.0,11617623.0
4,CLV15_MEUR,B1GQ,BG,44117.0,43869.7,44293.9,45812.3,47204.9,48508.8,49811.2,...,7284552.0,7245677.0,7202198.0,7153784.0,7101859.0,7050034.0,7000039.0,6951482.0,6916548.0,6838937.0
5,CLV15_MEUR,B1GQ,CH,590695.9,601282.1,615411.5,625532.7,638473.0,647174.1,665686.2,...,8039060.0,8139631.0,8237666.0,8327126.0,8419550.0,8484130.0,8544527.0,8606033.0,8670300.0,8738791.0
6,CLV15_MEUR,B1GQ,CY,18910.4,17664.7,17350.9,17944.2,19123.7,20220.2,21362.2,...,865878.0,858000.0,847008.0,848319.0,854802.0,864236.0,875899.0,888005.0,896007.0,904705.0
7,CLV15_MEUR,B1GQ,CZ,157402.2,157329.9,160888.9,169558.2,173860.4,182846.7,188734.6,...,10516125.0,10512419.0,10538275.0,10553843.0,10578820.0,10610055.0,10649800.0,10693939.0,10701777.0,10516707.0
8,CLV15_MEUR,B1GQ,DE,2904527.6,2917237.5,2981695.2,3026180.0,3093663.8,3176581.2,3207750.8,...,80523746.0,80767463.0,81197537.0,82175684.0,82521653.0,82792351.0,83019213.0,83166711.0,83155031.0,83237124.0
9,CLV15_MEUR,B1GQ,DK,260089.6,262517.1,266768.3,273017.6,281879.6,289833.5,295599.9,...,5602628.0,5627235.0,5659715.0,5707251.0,5748769.0,5781190.0,5806081.0,5822763.0,5840045.0,5873420.0


**Note:**
It may be better to reshape both datasets from wide to long format before combining them;
the merged table would then probably be easier to read.