# Library importation

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Code for countries of OECD

In [2]:
oecd_countries = {'AUS': 'Australia', 'AUT': 'Austria', 'BEL': 'Belgium', 'CAN': 'Canada', 'CHL': 'Chile', 'COL': 'Colombia', 'CZE': 'Czech Republic', 'DNK': 'Denmark', 'EST': 'Estonia', 'FIN': 'Finland', 'FRA': 'France', 'DEU': 'Germany', 'GRC': 'Greece', 'HUN': 'Hungary', 'ISL': 'Iceland', 'IRL': 'Ireland', 'ISR': 'Israel', 'ITA': 'Italy', 'JPN': 'Japan', 'KOR': 'Korea', 'LVA': 'Latvia', 'LTU': 'Lithuania', 'LUX': 'Luxembourg', 'MEX': 'Mexico', 'NLD': 'Netherlands', 'NZL': 'New Zealand', 'NOR': 'Norway', 'POL': 'Poland', 'PRT': 'Portugal', 'SVK': 'Slovakia', 'SVN': 'Slovenia', 'ESP': 'Spain', 'SWE': 'Sweden', 'CHE': 'Switzerland', 'TUR': 'Turkey', 'GBR': 'United Kingdom', 'USA': 'United States'}

# Data importation

## Starting with the NEET

We select the neet corresponding from 15 to 29 yo and from OECD countries only and selecting all the years

In [3]:
df = pd.read_csv('./data/education_system/NEET_men_women_altogether.csv')
df = df[(df.SUBJECT=='15_29')].drop(['INDICATOR','SUBJECT','MEASURE','FREQUENCY','Flag Codes'],axis=1)
df = df[(df.LOCATION).isin(oecd_countries.keys())]
df = df.reset_index(drop=True)

In [4]:
df.columns = ['Country','Time','NEET']

## Looking at Expenditure on LMP

### Preparing the dataframe

In [5]:
exp_lmp = pd.read_csv('./data/labour_market/public_exp_LMP.csv')
exp_lmp = exp_lmp[['LFS_COUNTRY','PROG','Time','Value']]
exp_lmp = exp_lmp[exp_lmp.PROG == 100].drop('PROG',axis=1).reset_index(drop=True)

In [6]:
exp_lmp.columns=['Country', 'Time', 'Exp_LMP']

In [7]:
type(exp_lmp['Time'])

pandas.core.series.Series

### Adding the values to the DF

In [8]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = exp_lmp[(exp_lmp.Country==country)&(exp_lmp.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.Exp_LMP.values[0])
df['Exp_LMP']= list_value
df['Exp_LMP_2']=df['Exp_LMP']**2
df['Exp_LMP_3']=df['Exp_LMP']**3

In [9]:
df

Unnamed: 0,Country,Time,NEET,Exp_LMP,Exp_LMP_2,Exp_LMP_3
0,AUS,1997,16.034595,,,
1,AUS,1998,14.880802,,,
2,AUS,1999,13.637552,,,
3,AUS,2000,13.192960,,,
4,AUS,2001,12.979000,,,
...,...,...,...,...,...,...
691,LTU,2015,13.743647,0.53,0.2809,0.148877
692,LTU,2016,11.403278,0.51,0.2601,0.132651
693,LTU,2017,11.220660,0.54,0.2916,0.157464
694,LTU,2018,10.495108,0.66,0.4356,0.287496


## Looking at STR

### Preparing the dataset

In [10]:
STR = pd.read_csv('./data/education_system/student_teacher_ratio.csv')
STR = STR[['COUNTRY','ISC11_LEVEL_CAT','Year','Value']]
STR_L1 = STR[STR.ISC11_LEVEL_CAT == 'L1'].drop('ISC11_LEVEL_CAT',axis=1).reset_index(drop=True)

In [11]:
STR_L1.columns = ['Country', 'Time', 'STR']

In [12]:
STR_L1

Unnamed: 0,Country,Time,STR
0,AUS,2013,15.615
1,AUS,2014,15.612
2,AUS,2015,15.433
3,AUS,2016,15.168
4,AUS,2017,15.124
...,...,...,...
228,SVN,2014,15.862
229,SVN,2015,15.888
230,SVN,2016,14.282
231,SVN,2017,14.450


### Adding the values to the DF

In [13]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = STR_L1[(STR_L1.Country==country)&(STR_L1.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.STR.values[0])
df['STR']= list_value
df['STR_2']=df['STR']**2
df['STR_3']=df['STR']**3

In [14]:
df=df.dropna()

In [15]:
df

Unnamed: 0,Country,Time,NEET,Exp_LMP,Exp_LMP_2,Exp_LMP_3,STR,STR_2,STR_3
16,AUS,2013,13.015899,0.87,0.7569,0.658503,15.615,243.828225,3807.377733
17,AUS,2014,12.647472,0.93,0.8649,0.804357,15.612,243.734544,3805.183701
18,AUS,2015,11.831610,0.91,0.8281,0.753571,15.433,238.177489,3675.793188
19,AUS,2016,11.352150,0.86,0.7396,0.636056,15.168,230.068224,3489.674822
20,AUS,2017,10.946128,0.85,0.7225,0.614125,15.124,228.735376,3459.393827
...,...,...,...,...,...,...,...,...,...
690,LTU,2014,14.185811,0.43,0.1849,0.079507,10.233,104.714289,1071.541319
691,LTU,2015,13.743647,0.53,0.2809,0.148877,10.256,105.185536,1078.782857
692,LTU,2016,11.403278,0.51,0.2601,0.132651,10.470,109.620900,1147.730823
693,LTU,2017,11.220660,0.54,0.2916,0.157464,10.619,112.763161,1197.432007


## Looking at economic features

### GDP and LogGDP

In [16]:
gdp_per_person = pd.read_csv('./data/economic_features/GDP_per_capita.csv', sep=';')
gdp_per_person_oecd = pd.DataFrame(columns=gdp_per_person.columns)
for code in oecd_countries.keys():
    gdp_per_person_oecd = gdp_per_person_oecd.append(gdp_per_person[gdp_per_person['Country Code']==code])
gdp_2015_oecd = gdp_per_person_oecd[['Country Code', '2015']]
gdp_2017_oecd = gdp_per_person_oecd[['Country Code', '2017']]
gdp_1960_2019_oecd_full = gdp_per_person_oecd.drop(['Country Name', 'Indicator Name', 'Indicator Code'], axis=1).reset_index(drop=True)

gdp_1960_2019_oecd = pd.DataFrame(columns=['Country', 'Time', 'GDP'])
for i in range(len(gdp_1960_2019_oecd_full)):
    country = gdp_1960_2019_oecd_full.iloc[i][0]
    for j in range(1,len(gdp_1960_2019_oecd_full.columns)):
        year = gdp_1960_2019_oecd_full.columns[j]
        gdp = gdp_1960_2019_oecd_full.iloc[i][j]
        gdp_1960_2019_oecd = gdp_1960_2019_oecd.append({'Country': country, 'Time': int(year), 'GDP': gdp}, ignore_index=True)
    gdp_1960_2019_oecd = gdp_1960_2019_oecd.dropna()

In [17]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = gdp_1960_2019_oecd[(gdp_1960_2019_oecd.Country==country)&(gdp_1960_2019_oecd.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.GDP.values[0])
df['GDP']= list_value
df['LogGDP']=np.log(df['GDP'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### CPI

In [18]:
cpi_2000_2019 = pd.read_csv('./data/economic_features/CPI_2000_2019.csv')
cpi_2000_2019 = cpi_2000_2019[['LOCATION', 'TIME', 'Value']]
cpi_2000_2019.columns = ['Country', 'Time', 'CPI']

In [19]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = cpi_2000_2019[(cpi_2000_2019.Country==country)&(cpi_2000_2019.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.CPI.values[0])
df['CPI']= list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### DEBT

In [20]:
debt_2000_2019 = pd.read_csv('./data/economic_features/DEBT_2000_2019.csv')
debt_2000_2019 = debt_2000_2019[['LOCATION', 'TIME', 'Value']]
debt_2000_2019.columns = ['Country', 'Time', 'DEBT']

In [21]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = debt_2000_2019[(debt_2000_2019.Country==country)&(debt_2000_2019.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.DEBT.values[0])
df['DEBT']= list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


## Looking at education features

### Years of schooling

In [22]:
years_schooling = pd.read_csv('./data/education_system/mean-years-of-schooling-world.csv')
years_schooling=years_schooling.drop('Entity',axis=1)
years_schooling.columns=['Country', 'Time', 'Years_schooling']

In [23]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = years_schooling[(years_schooling.Country==country)&(years_schooling.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.Years_schooling.values[0])
df['Years_schooling']= list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### Average class size

In [24]:
avg_class_size = pd.read_csv('./data/education_system/Avg_class_size_primary_education.csv')
avg_class_size = avg_class_size[(avg_class_size.ISC11_LEVEL_CAT=='L1')&(avg_class_size.REF_SECTOR=='INST_T')&(avg_class_size.INDICATOR=='PERS_AVG_CLASS')]
avg_class_size = avg_class_size[['COUNTRY','YEAR','Value']]
avg_class_size.columns = ['Country', 'Time', 'Avg_class_size']

In [25]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = avg_class_size[(avg_class_size.Country==country)&(avg_class_size.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.Avg_class_size.values[0])
df['Avg_class_size']= list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### Expenditure on education

In [26]:
spendings_educ = pd.read_csv('./data/education_system/spendings_in_education.csv')
spendings_educ = spendings_educ[(spendings_educ.ISC11=='L1')& (spendings_educ.EXPENDITURE_TYPE=='T')]
spendings_educ = spendings_educ[['COUNTRY', 'YEAR','Value']]
spendings_educ.columns = ['Country', 'Time', 'Exp_educ']

In [27]:
spendings_educ

Unnamed: 0,Country,Time,Exp_educ
98,AUS,1995,
99,AUS,2000,4549.9614
100,AUS,2005,6003.6421
101,AUS,2008,7103.9819
102,AUS,2009,8588.7402
...,...,...,...
14885,KOR,2014,
14886,KOR,2015,
14887,KOR,2016,11074.5190
14888,KOR,2017,11701.7900


In [28]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = spendings_educ[(spendings_educ.Country==country)&(spendings_educ.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.Exp_educ.values[0])
df['Exp_educ']= list_value

df['LogExp_educ']= np.log(df['Exp_educ'],where=~np.isnan(np.array(df['Exp_educ'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


## Labour indicators

### Protection of workers

In [29]:
strictness = pd.read_csv('./data/labour_market/strictness of employment protection from 1998 to 2018.csv')
strictness = strictness[(strictness.SERIES == 'EPRC_V2')]
strictness = strictness[['COUNTRY','TIME','Value']]
strictness.columns = ['Country', 'Time', 'Strictness_of_workers']

In [30]:
list_value=[]
for el in df.itertuples():
    country,time = el[1],el[2]
    tmp = strictness[(strictness.Country==country)&(strictness.Time==time)]
    if len(tmp.index)==0:
        list_value.append(None)
    else:
        list_value.append(tmp.Strictness_of_workers.values[0])
df['Strictness_of_workers']= list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### Percentage of Part time employement

In [31]:
ft_pt_employ = pd.read_csv('./data/labour_market/percentage of full time part time employment.csv')
ft_pt_employ = ft_pt_employ[(ft_pt_employ.AGE==1524)&(ft_pt_employ.EMPSTAT=='TE')&(ft_pt_employ.SEX=='MW')]
ft_pt_employ = ft_pt_employ[['COUNTRY','SERIES','Time','Value']]
ft_pt_employ.columns = ['Country','Series','Time','Value']

pt_employ = ft_pt_employ[ft_pt_employ.Series=='PT'].drop('Series',axis=1)

In [32]:
list_value = []
for el in df.itertuples():
    country, time = el[1], el[2]
    tmp = pt_employ[(pt_employ.Country == country) & (pt_employ.Time == time)]
    if len(tmp.index) == 0:
        list_value.append(None)
    else:
        list_value.append(tmp.Value.values[0])
df['PT_employ'] = list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### Average duration of unemployement

In [33]:
avg_duration_unemployment = pd.read_csv('./data/labour_market/average duration of unemployment.csv')
avg_duration_unemployment = avg_duration_unemployment[(avg_duration_unemployment.SEX=='MW')&(avg_duration_unemployment.AGE==1524)]
avg_duration_unemployment = avg_duration_unemployment[['COUNTRY','TIME','Value']]
avg_duration_unemployment.columns = ['Country', 'Time','Avg_dur_unemployment']
len(avg_duration_unemployment)

184

In [34]:
list_value = []
for el in df.itertuples():
    country, time = el[1], el[2]
    tmp = avg_duration_unemployment[(avg_duration_unemployment.Country == country) & (avg_duration_unemployment.Time == time)]
    if len(tmp.index) == 0:
        list_value.append(None)
    else:
        list_value.append(tmp.Avg_dur_unemployment.values[0])
df['Avg_dur_unemployment'] = list_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [35]:
df = sm.add_constant(df)
df

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,const,Country,Time,NEET,Exp_LMP,Exp_LMP_2,Exp_LMP_3,STR,STR_2,STR_3,...,LogGDP,CPI,DEBT,Years_schooling,Avg_class_size,Exp_educ,LogExp_educ,Strictness_of_workers,PT_employ,Avg_dur_unemployment
16,1.0,AUS,2013,13.015899,0.87,0.7569,0.658503,15.615,243.828225,3807.377733,...,11.129468,2.449889,55.70968,12.6,23.725,9241.9922,9.131513,2.011,46.205162,5.955686
17,1.0,AUS,2014,12.647472,0.93,0.8649,0.804357,15.612,243.734544,3805.183701,...,11.043094,2.487923,61.36842,12.7,23.859,9257.9980,9.133243,2.011,46.603383,6.953608
18,1.0,AUS,2015,11.831610,0.91,0.8281,0.753571,15.433,238.177489,3675.793188,...,10.946512,1.508367,64.18005,12.8,23.821,9524.7178,9.161646,2.011,47.087093,6.972733
19,1.0,AUS,2016,11.352150,0.86,0.7396,0.636056,15.168,230.068224,3489.674822,...,10.819201,1.276991,68.39156,12.9,23.669,10022.5670,9.212595,2.011,48.964695,7.214289
20,1.0,AUS,2017,10.946128,0.85,0.7225,0.614125,15.124,228.735376,3459.393827,...,10.897257,1.948647,65.60463,12.9,23.613,10238.4130,9.233902,2.011,49.731342,6.959253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,1.0,LTU,2014,14.185811,0.43,0.1849,0.079507,10.233,104.714289,1071.541319,...,9.715045,0.103758,52.58745,13.0,15.742,5288.8008,8.573347,2.697,11.871613,
691,1.0,LTU,2015,13.743647,0.53,0.2809,0.148877,10.256,105.185536,1078.782857,...,9.565090,-0.884097,53.33852,13.0,15.976,5523.3896,8.616747,2.697,8.560302,
692,1.0,LTU,2016,11.403278,0.51,0.2601,0.132651,10.470,109.620900,1147.730823,...,9.615680,0.905525,50.88945,13.0,16.247,6170.3911,8.727518,2.697,11.594822,
693,1.0,LTU,2017,11.220660,0.54,0.2916,0.157464,10.619,112.763161,1197.432007,...,9.734205,3.722889,47.00182,13.0,16.945,6339.7036,8.754587,2.697,12.785031,


In [36]:
df.to_csv('./data/panel_data/Full_DB.csv',index=False)