In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS

Pour les plages de dates disponibles :
* NEET et variables explicatives : 2013–2018
* Variables économiques : 2000–2019
* Variables d'éducation : 2013–2017

Pour le taux de NEET, j'ai retenu la tranche des 15–29 ans.

Pour les variables qui dépendent du niveau d'éducation, j'ai pris $L1$ partout.

# Import data and create panel data sets

## Import data

In [2]:
# Load the three source tables, in this order: the NEET panel, the economic
# indicators and the education indicators.
panel_data_l1, df_eco_features, df_educ_features_l1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './panel_data_l1.csv',
        './economic_features.csv',
        './educ_features_l1.csv',
    )
)

## Create panel data set for the NEET rate and the 3 explanatory variables

In [3]:
# ISO-3 code -> country name for the OECD members considered in this study.
oecd_countries = {
    'AUS': 'Australia',
    'AUT': 'Austria',
    'BEL': 'Belgium',
    'CAN': 'Canada',
    'CHL': 'Chile',
    'COL': 'Colombia',
    'CZE': 'Czech Republic',
    'DNK': 'Denmark',
    'EST': 'Estonia',
    'FIN': 'Finland',
    'FRA': 'France',
    'DEU': 'Germany',
    'GRC': 'Greece',
    'HUN': 'Hungary',
    'ISL': 'Iceland',
    'IRL': 'Ireland',
    'ISR': 'Israel',
    'ITA': 'Italy',
    'JPN': 'Japan',
    'KOR': 'Korea',
    'LVA': 'Latvia',
    'LTU': 'Lithuania',
    'LUX': 'Luxembourg',
    'MEX': 'Mexico',
    'NLD': 'Netherlands',
    'NZL': 'New Zealand',
    'NOR': 'Norway',
    'POL': 'Poland',
    'PRT': 'Portugal',
    'SVK': 'Slovakia',
    'SVN': 'Slovenia',
    'ESP': 'Spain',
    'SWE': 'Sweden',
    'CHE': 'Switzerland',
    'TUR': 'Turkey',
    'GBR': 'United Kingdom',
    'USA': 'United States',
}
# Country codes in dictionary (insertion) order.
code_countries = list(oecd_countries)
# Sample years of the NEET panel: 2013 to 2018 inclusive.
years = list(range(2013, 2019))

In [4]:
# Sorted list of the distinct country codes actually present in the NEET panel.
np.unique(panel_data_l1.Country)

array(['AUS', 'BEL', 'CAN', 'CHL', 'CZE', 'DEU', 'ESP', 'EST', 'FRA',
       'HUN', 'IRL', 'ISR', 'JPN', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD',
       'NZL', 'POL', 'PRT', 'SVK', 'SVN', 'USA'], dtype=object)

In [5]:
# Add one 0/1 indicator column per country present in the panel; the column
# for code XYZ is named 'bin_XYZ'. np.unique returns the codes sorted, which
# fixes the order in which the columns are appended.
for iso_code in np.unique(panel_data_l1.Country):
    panel_data_l1['bin_' + iso_code] = (panel_data_l1.Country == iso_code).astype(int)

In [6]:
# Integer cluster id per country, used for the clustered standard errors.
# pd.factorize numbers the countries 0, 1, 2, ... in order of first appearance,
# so all rows of a country share one id even when they are not contiguous, and
# no particular first country has to be hard-coded.
# A missing Country value would be coded -1.
groupid = pd.factorize(panel_data_l1.Country)[0]
panel_data_l1['groupid'] = groupid

In [7]:
# Inspect the panel, now including the 'bin_*' country indicators and 'groupid'.
panel_data_l1

Unnamed: 0,Country,Time,NEET,Exp_LMP,STR,Min_Wage,bin_AUS,bin_BEL,bin_CAN,bin_CHL,...,bin_LVA,bin_MEX,bin_NLD,bin_NZL,bin_POL,bin_PRT,bin_SVK,bin_SVN,bin_USA,groupid
0,AUS,2013,13.015899,0.87,15.615,23283.766881,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUS,2014,12.647472,0.93,15.612,23356.492667,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUS,2015,11.831610,0.91,15.433,23641.395398,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUS,2016,11.352150,0.86,15.168,23915.419580,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUS,2017,10.946128,0.85,15.124,24128.731046,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,USA,2014,15.047262,0.28,15.435,16285.276127,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
124,USA,2015,14.380193,0.28,15.354,16265.980260,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
125,USA,2016,14.118049,0.26,15.216,16063.328012,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
126,USA,2017,13.280724,0.24,15.182,15728.297963,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23


## Create panel data set for the economic features

In [8]:
# Inspect the economic indicators (GDP, CPI, DEBT by country and year).
df_eco_features

Unnamed: 0,Country,Time,GDP,CPI,DEBT
0,AUS,2000,21679.247842,4.457435,41.14750
1,AUS,2001,19490.861110,4.407135,40.40488
2,AUS,2002,20082.483267,2.981575,38.67284
3,AUS,2003,23447.031001,2.732596,35.66726
4,AUS,2004,30430.676437,2.343255,32.31054
...,...,...,...,...,...
652,USA,2015,56839.381774,0.118627,136.43000
653,USA,2016,57951.584082,1.261583,138.11100
654,USA,2017,60062.222313,2.130110,134.67420
655,USA,2018,62996.471285,2.442583,136.17960


## Create panel data set for the education indicators

In [9]:
# Inspect the education indicators (years of schooling, class size, expenditure).
df_educ_features_l1

Unnamed: 0,Country,Time,Years_schooling,Avg_class_size,Exp_educ
0,AUS,2013,12.6,23.725,9241.9922
1,AUS,2014,12.7,23.859,9257.9980
2,AUS,2015,12.8,23.821,9524.7178
3,AUS,2016,12.9,23.669,10022.5670
4,AUS,2017,12.9,23.613,10238.4130
...,...,...,...,...,...
128,GBR,2013,12.6,25.404,10615.3770
129,GBR,2014,12.7,25.296,11276.6310
130,GBR,2015,12.8,25.988,11715.1060
131,GBR,2016,12.9,25.937,11350.0200


## Create panel data set for the labour market indicators

# OLS on panel data

In [10]:
# Pooled-OLS inputs: NEET rate as the dependent variable and the three
# explanatory variables plus an intercept column as regressors.
Y_bols = panel_data_l1['NEET']
X_bols = sm.add_constant(panel_data_l1[['Exp_LMP', 'STR', 'Min_Wage']])

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : DataFrame-like
        Design matrix; must expose ``.columns``, ``.values`` and ``.shape``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with two columns: ``variables`` (the
        column name) and ``VIF`` (its variance inflation factor).
    """
    design = X.values
    n_columns = X.shape[1]
    return pd.DataFrame({
        "variables": X.columns,
        "VIF": [variance_inflation_factor(design, idx) for idx in range(n_columns)],
    })
# VIFs for the pooled-OLS design matrix (the intercept column is included).
calc_vif(X_bols)

Unnamed: 0,variables,VIF
0,const,26.576924
1,Exp_LMP,1.380412
2,STR,1.053941
3,Min_Wage,1.410139


## Basic OLS without state or time fixed effects

In [12]:
# Pooled OLS of the NEET rate on the explanatory variables, with default
# (non-robust) standard errors and no fixed effects.
basic_ols = sm.OLS(endog=Y_bols, exog=X_bols).fit()

In [13]:
# Text summary of the pooled OLS fit.
print(basic_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     64.30
Date:                Sun, 28 Mar 2021   Prob (F-statistic):           3.81e-25
Time:                        16:13:45   Log-Likelihood:                -292.26
No. Observations:                 128   AIC:                             592.5
Df Residuals:                     124   BIC:                             603.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.8227      1.099     12.579      0.0

## OLS with only state fixed effects

### Use n-1 binary country variables; France is the reference category

In [21]:
# Dependent variable of the country fixed-effects regression as a NumPy array.
# The matching regressor matrix X_sfe (with its constant) is built in a later cell.
Y_sfe = np.array(panel_data_l1['NEET'])

In [22]:
# List the countries present in the panel. Read them from panel_data_l1:
# X_sfe is not defined yet at this point of the notebook, and once built it is
# a plain NumPy array without a `Country` attribute.
print(np.unique(panel_data_l1.Country))

['AUS' 'BEL' 'CAN' 'CHL' 'CZE' 'DEU' 'ESP' 'EST' 'FRA' 'HUN' 'IRL' 'ISR'
 'JPN' 'LTU' 'LUX' 'LVA' 'MEX' 'NLD' 'NZL' 'POL' 'PRT' 'SVK' 'SVN' 'USA']


In [32]:
# One boolean indicator Series per country of the panel, bound to the
# lower-cased ISO code. All 24 are created here; the choice of the reference
# country is made where the design matrix is assembled.
(aus, bel, can, chl, cze, deu, esp, est, fra, hun, irl, isr,
 jpn, ltu, lux, lva, mex, nld, nzl, pol, prt, svk, svn, usa) = (
    panel_data_l1.Country == iso_code
    for iso_code in (
        'AUS', 'BEL', 'CAN', 'CHL', 'CZE', 'DEU', 'ESP', 'EST', 'FRA', 'HUN', 'IRL', 'ISR',
        'JPN', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD', 'NZL', 'POL', 'PRT', 'SVK', 'SVN', 'USA',
    )
)

In [24]:
# Inspect the panel data set (country indicators and groupid included).
panel_data_l1

Unnamed: 0,Country,Time,NEET,Exp_LMP,STR,Min_Wage,bin_AUS,bin_BEL,bin_CAN,bin_CHL,...,bin_LVA,bin_MEX,bin_NLD,bin_NZL,bin_POL,bin_PRT,bin_SVK,bin_SVN,bin_USA,groupid
0,AUS,2013,13.015899,0.87,15.615,23283.766881,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUS,2014,12.647472,0.93,15.612,23356.492667,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUS,2015,11.831610,0.91,15.433,23641.395398,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUS,2016,11.352150,0.86,15.168,23915.419580,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUS,2017,10.946128,0.85,15.124,24128.731046,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,USA,2014,15.047262,0.28,15.435,16285.276127,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
124,USA,2015,14.380193,0.28,15.354,16265.980260,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
125,USA,2016,14.118049,0.26,15.216,16063.328012,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
126,USA,2017,13.280724,0.24,15.182,15728.297963,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23


In [35]:
# Design matrix of the country fixed-effects regression: the three explanatory
# variables followed by the country indicators, then an intercept column added
# by add_constant. `fra` is deliberately left out so that France is the
# reference category.
explanatory = [panel_data_l1.Exp_LMP, panel_data_l1.STR, panel_data_l1.Min_Wage]
country_indicators = [
    aus, bel, can, chl, cze, deu, esp, est, hun, irl, isr, jpn,
    ltu, lux, lva, mex, nld, nzl, pol, prt, svk, svn, usa,
]
X_sfe = sm.add_constant(np.column_stack(explanatory + country_indicators))

In [42]:
# OLS with country dummies; standard errors are clustered by country through
# the integer ids stored in the 'groupid' column.
cluster_groups = np.array(panel_data_l1['groupid'])
sfe_ols = sm.OLS(Y_sfe, X_sfe).fit(cov_type='cluster', cov_kwds={'groups': cluster_groups})

# X_sfe is a bare array, so the coefficient labels are supplied by hand; they
# must follow the column order of X_sfe.
sfe_xnames = [
    'const', 'Exp_LMP', 'STR', 'Min_Wage',
    'aus', 'bel', 'can', 'chl', 'cze', 'deu', 'esp', 'est', 'hun', 'irl', 'isr', 'jpn',
    'ltu', 'lux', 'lva', 'mex', 'nld', 'nzl', 'pol', 'prt', 'svk', 'svn', 'usa',
]
print(sfe_ols.summary(yname='NEET rate', xname=sfe_xnames))

                            OLS Regression Results                            
Dep. Variable:              NEET rate   R-squared:                       0.957
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     7.473
Date:                Sun, 28 Mar 2021   Prob (F-statistic):            0.00115
Time:                        16:38:04   Log-Likelihood:                -151.05
No. Observations:                 128   AIC:                             356.1
Df Residuals:                     101   BIC:                             433.1
Df Model:                          26                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         21.1731      6.299      3.362      0.0



In [57]:
# Panel layout for linearmodels: keep the NEET rate and the three explanatory
# variables, index the rows by (Country, Time), then add an intercept column.
test_df = sm.add_constant(
    panel_data_l1[['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Min_Wage']]
    .set_index(['Country', 'Time'])
)

In [59]:
# Panel regression with both entity (country) and time effects; the covariance
# is clustered on both the entity and the time dimension.
panel_exog = test_df[['const', 'Exp_LMP', 'STR', 'Min_Wage']]
panel_model = PanelOLS(test_df.NEET, panel_exog, entity_effects=True, time_effects=True)
panel_ols = panel_model.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

In [60]:
# Text summary of the panel regression with entity and time effects.
print(panel_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.2751
Estimator:                   PanelOLS   R-squared (Between):              0.5838
No. Observations:                 128   R-squared (Within):               0.6048
Date:                Sun, Mar 28 2021   R-squared (Overall):              0.5592
Time:                        16:39:52   Log-likelihood                   -146.49
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      12.141
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                    F(3,96)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             9.6895
                            

## Panel OLS with economic variables

In [47]:
# Restrict the economic indicators to the sample years of the NEET panel.
in_sample_years = df_eco_features.Time.isin(years)
added_features = df_eco_features.loc[in_sample_years].reset_index(drop=True)

# Empty frame with the target column layout of the combined data set.
df_eco_neet = pd.DataFrame(
    columns=['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Min_Wage', 'GDP', 'CPI', 'DEBT']
)

In [48]:
# Attach GDP, CPI and DEBT to every NEET observation with one keyed join on
# (Country, Time) instead of appending row by row: DataFrame.append was removed
# in pandas 2.0, grew the frame quadratically and left object-dtype columns.
# - how='inner' drops NEET observations without economic data, as the loop did;
# - drop_duplicates(keep='first') uses the first economic row if a
#   (Country, Time) pair appears more than once, as the loop did;
# - the row order of panel_data_l1 is preserved and the index restarts at 0.
neet_columns = ['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Min_Wage']
eco_columns = ['Country', 'Time', 'GDP', 'CPI', 'DEBT']
df_eco_neet = panel_data_l1[neet_columns].merge(
    added_features[eco_columns].drop_duplicates(subset=['Country', 'Time'], keep='first'),
    on=['Country', 'Time'],
    how='inner',
)

In [49]:
# Index the combined data by (Country, Time), the layout used for PanelOLS below.
# NOTE: this rebinds df_eco_neet, so the cell cannot simply be re-run on its
# own result; rebuild df_eco_neet first.
df_eco_neet = df_eco_neet.set_index(['Country', 'Time'])

In [50]:
# Add the intercept column 'const' used as a regressor in the panel model.
df_eco_neet = sm.add_constant(df_eco_neet)

In [51]:
# Inspect the combined NEET + economic panel, indexed by (Country, Time).
df_eco_neet

Unnamed: 0_level_0,Unnamed: 1_level_0,const,NEET,Exp_LMP,STR,Min_Wage,GDP,CPI,DEBT
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AUS,2013,1.0,13.015899,0.87,15.615,23283.766881,68150.107041,2.449889,55.70968
AUS,2014,1.0,12.647472,0.93,15.612,23356.492667,62510.791171,2.487923,61.36842
AUS,2015,1.0,11.831610,0.91,15.433,23641.395398,56755.721712,1.508367,64.18005
AUS,2016,1.0,11.352150,0.86,15.168,23915.419580,49971.131456,1.276991,68.39156
AUS,2017,1.0,10.946128,0.85,15.124,24128.731046,54027.966818,1.948647,65.60463
...,...,...,...,...,...,...,...,...,...
USA,2014,1.0,15.047262,0.28,15.435,16285.276127,55064.744548,1.622223,135.58440
USA,2015,1.0,14.380193,0.28,15.354,16265.980260,56839.381774,0.118627,136.43000
USA,2016,1.0,14.118049,0.26,15.216,16063.328012,57951.584082,1.261583,138.11100
USA,2017,1.0,13.280724,0.24,15.182,15728.297963,60062.222313,2.130110,134.67420


In [52]:
# Same panel specification (entity and time effects, covariance clustered on
# both dimensions) with the economic variables added: every column except the
# dependent variable NEET is used as a regressor.
eco_exog = df_eco_neet.drop('NEET', axis=1)
eco_model = PanelOLS(df_eco_neet.NEET, eco_exog, entity_effects=True, time_effects=True)
panel_eco_ols = eco_model.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

In [53]:
# Text summary of the panel regression that includes the economic variables.
print(panel_eco_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.3596
Estimator:                   PanelOLS   R-squared (Between):              0.4281
No. Observations:                 122   R-squared (Within):               0.6300
Date:                Sun, Mar 28 2021   R-squared (Overall):              0.4747
Time:                        16:38:23   Log-likelihood                   -133.98
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8.2346
Entities:                          23   P-value                           0.0000
Avg Obs:                       5.3043   Distribution:                    F(6,88)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             9.1849
                            