In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS

Pour les plages de dates disponibles :
* NEET et variables explicatives : $2013-2018$
* Variables economiques : $2000-2019$
* Variables d'éducation : $2013-2017$

Pour le NEET rate, j'ai pris les $15-29$ ans.

Pour les variables qui dépendent du niveau d'éducation, j'ai pris $L1$ partout.

# Import data and create panel data sets

## Import data

In [2]:
panel_data_l1 = pd.read_csv('./data/panel_data/panel_data_l1.csv')
panel_data_l2_l3 = pd.read_csv('./data/panel_data/panel_data_l2_l3.csv')
df_eco_features = pd.read_csv('./data/panel_data/economic_features.csv')
df_educ_features_l1 = pd.read_csv('./data/panel_data/educ_features_l1.csv')
df_labour_features = pd.read_csv('./data/panel_data/labour_market.csv')

## Create panel data set for the NEET rate and the 3 explanatory variables

In [3]:
oecd_countries = {'AUS': 'Australia', 'AUT': 'Austria', 'BEL': 'Belgium', 'CAN': 'Canada', 'CHL': 'Chile', 'COL': 'Colombia', 'CZE': 'Czech Republic', 'DNK': 'Denmark', 'EST': 'Estonia', 'FIN': 'Finland', 'FRA': 'France', 'DEU': 'Germany', 'GRC': 'Greece', 'HUN': 'Hungary', 'ISL': 'Iceland', 'IRL': 'Ireland', 'ISR': 'Israel', 'ITA': 'Italy', 'JPN': 'Japan', 'KOR': 'Korea', 'LVA': 'Latvia', 'LTU': 'Lithuania', 'LUX': 'Luxembourg', 'MEX': 'Mexico', 'NLD': 'Netherlands', 'NZL': 'New Zealand', 'NOR': 'Norway', 'POL': 'Poland', 'PRT': 'Portugal', 'SVK': 'Slovakia', 'SVN': 'Slovenia', 'ESP': 'Spain', 'SWE': 'Sweden', 'CHE': 'Switzerland', 'TUR': 'Turkey', 'GBR': 'United Kingdom', 'USA': 'United States'}
code_countries = [code for code in oecd_countries.keys()]
years = [year for year in range(2013,2019)]

In [4]:
np.unique(panel_data_l1.Country)

array(['AUS', 'BEL', 'CAN', 'CHL', 'CZE', 'DEU', 'ESP', 'EST', 'FRA',
       'HUN', 'IRL', 'ISR', 'JPN', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD',
       'NZL', 'POL', 'PRT', 'SVK', 'SVN', 'USA'], dtype=object)

In [5]:
bin_df = pd.DataFrame()
# create binary variables for each oecd country
for code in np.unique(panel_data_l1.Country):
    bin_var = (panel_data_l1.Country == code).astype(int)
    bin_df['bin_'+code] = bin_var

In [6]:
panel_data_l1

Unnamed: 0,Country,Time,NEET,Exp_LMP,STR,Min_Wage
0,AUS,2013,13.015899,0.87,15.615,23283.766881
1,AUS,2014,12.647472,0.93,15.612,23356.492667
2,AUS,2015,11.831610,0.91,15.433,23641.395398
3,AUS,2016,11.352150,0.86,15.168,23915.419580
4,AUS,2017,10.946128,0.85,15.124,24128.731046
...,...,...,...,...,...,...
123,USA,2014,15.047262,0.28,15.435,16285.276127
124,USA,2015,14.380193,0.28,15.354,16265.980260
125,USA,2016,14.118049,0.26,15.216,16063.328012
126,USA,2017,13.280724,0.24,15.182,15728.297963


## Test whether there are non-linear effects for Exp_LMP

### Test for L1 level of education

In [7]:
exp_lmp = panel_data_l1.Exp_LMP
panel_data_l1['Exp_LMP_2'] = exp_lmp**2
panel_data_l1['Exp_LMP_3'] = exp_lmp**3
panel_data_l1_cubic = panel_data_l1.set_index(['Country', 'Time'])

In [8]:
panel_data_l1_cubic = sm.add_constant(panel_data_l1_cubic)

  return ptp(axis=axis, out=out, **kwargs)


In [9]:
res_exp_lmp = sm.OLS(panel_data_l1_cubic.NEET, panel_data_l1_cubic[['const', 'Exp_LMP', 'Exp_LMP_2']]).fit()
hyp_exp_lmp = '(Exp_LMP_2=0)'
print(res_exp_lmp.f_test(hyp_exp_lmp))

<F test: F=array([[60.16842744]]), p=2.653496660339645e-12, df_denom=125, df_num=1>


In [10]:
print(res_exp_lmp.summary())

                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.331
Model:                            OLS   Adj. R-squared:                  0.320
Method:                 Least Squares   F-statistic:                     30.89
Date:                Thu, 01 Apr 2021   Prob (F-statistic):           1.25e-11
Time:                        15:27:29   Log-Likelihood:                -326.61
No. Observations:                 128   AIC:                             659.2
Df Residuals:                     125   BIC:                             667.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         18.5363      0.787     23.551      0.0

### Test for L2 and L3 levels

In [11]:
panel_data_l2_l3['STR_2']=(panel_data_l2_l3.STR)**2
panel_data_l2_l3['STR_3']=(panel_data_l2_l3.STR)**3
panel_data_l2_l3['Exp_LMP_2'] = (panel_data_l2_l3.Exp_LMP)**2
panel_data_l2_l3['Exp_LMP_3'] = (panel_data_l2_l3.Exp_LMP)**3
panel_data_l2_l3_cubic = panel_data_l2_l3.set_index(['Country', 'Time'])

In [12]:
panel_data_l2_l3_cubic = sm.add_constant(panel_data_l2_l3_cubic)

  return ptp(axis=axis, out=out, **kwargs)


In [13]:
res_exp_lmp_l2_l3 = sm.OLS(panel_data_l2_l3_cubic.NEET, panel_data_l2_l3_cubic[['const', 'Exp_LMP', 'Exp_LMP_2','Exp_LMP_3']]).fit()
hyp_exp_lmp_l2_l3 = '(Exp_LMP_2=0),(Exp_LMP_3=0)'
print(res_exp_lmp_l2_l3.f_test(hyp_exp_lmp_l2_l3))
print(res_exp_lmp_l2_l3.summary())

<F test: F=array([[18.55410462]]), p=5.782850770803401e-08, df_denom=158, df_num=2>
                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.176
Method:                 Least Squares   F-statistic:                     12.45
Date:                Thu, 01 Apr 2021   Prob (F-statistic):           2.38e-07
Time:                        15:27:29   Log-Likelihood:                -447.22
No. Observations:                 162   AIC:                             902.4
Df Residuals:                     158   BIC:                             914.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

## Test for STR

### Test for L1 level of education

In [14]:
stratio = panel_data_l1.STR
panel_data_l1['STR_2'] = stratio**2
panel_data_l1['STR_3'] = stratio**3
panel_data_l1_cubic = panel_data_l1.set_index(['Country', 'Time'])

In [15]:
panel_data_l1_cubic = sm.add_constant(panel_data_l1_cubic)

  return ptp(axis=axis, out=out, **kwargs)


In [16]:
res_str = sm.OLS(panel_data_l1_cubic.NEET, panel_data_l1_cubic[['const', 'STR', 'STR_2', 'STR_3']]).fit()
hyp_str = '(STR_2=0), (STR_3=0)'
print(res_str.f_test(hyp_str))

<F test: F=array([[10.54292626]]), p=5.907318528775072e-05, df_denom=124, df_num=2>


In [17]:
print(res_str.summary())

                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.260
Model:                            OLS   Adj. R-squared:                  0.242
Method:                 Least Squares   F-statistic:                     14.51
Date:                Thu, 01 Apr 2021   Prob (F-statistic):           3.67e-08
Time:                        15:27:29   Log-Likelihood:                -333.05
No. Observations:                 128   AIC:                             674.1
Df Residuals:                     124   BIC:                             685.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -26.4619     14.478     -1.828      0.0

### Test for L2 and L3

## Panel OLS with Exp_LMP_2, STR_2 and STR_3

### With STR corresponding to L1

In [18]:
panel_non_lin = PanelOLS(panel_data_l1_cubic.NEET, panel_data_l1_cubic[['const', 'Exp_LMP', 'Exp_LMP_2', 'STR', 'STR_2', 'STR_3']]).fit(cov_type='clustered')
print(panel_non_lin)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.4378
Estimator:                   PanelOLS   R-squared (Between):              0.4378
No. Observations:                 128   R-squared (Within):               0.0923
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.4378
Time:                        15:27:29   Log-likelihood                   -315.46
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      18.997
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                   F(5,122)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             67.444
                            

#### With time fixed effects only

In [19]:
panel_non_lin_tfe = PanelOLS(panel_data_l1_cubic.NEET, panel_data_l1_cubic[['const', 'Exp_LMP', 'Exp_LMP_2', 'STR', 'STR_2', 'STR_3']],time_effects=True).fit(cov_type='clustered',cluster_time=True)
print(panel_non_lin_tfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.4299
Estimator:                   PanelOLS   R-squared (Between):              0.4330
No. Observations:                 128   R-squared (Within):               0.1215
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.4364
Time:                        15:27:29   Log-likelihood                   -310.22
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      17.642
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                   F(5,117)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):          1.601e+04
                            

#### With state and time fixed effects

In [20]:
panel_non_lin_stfe = PanelOLS(panel_data_l1_cubic.NEET, panel_data_l1_cubic[['const', 'Exp_LMP', 'Exp_LMP_2', 'STR', 'STR_2', 'STR_3']], time_effects=True,entity_effects=True).fit(cov_type='clustered', cluster_time=True,cluster_entity=True)
print(panel_non_lin_stfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.2193
Estimator:                   PanelOLS   R-squared (Between):             -0.4321
No. Observations:                 128   R-squared (Within):               0.2736
Date:                Thu, Apr 01 2021   R-squared (Overall):             -0.3509
Time:                        15:27:30   Log-likelihood                   -151.23
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      5.2810
Entities:                          24   P-value                           0.0003
Avg Obs:                       5.3333   Distribution:                    F(5,94)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             5.2974
                            

### With STR corresponding to L2_L3

In [21]:
panel_non_lin_l2_l3 = PanelOLS(panel_data_l2_l3_cubic.NEET,panel_data_l2_l3_cubic[['const', 'Exp_LMP', 'Exp_LMP_2', 'STR', 'STR_2', 'STR_3']], time_effects=True, entity_effects=True).fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
print(panel_non_lin_l2_l3)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.1515
Estimator:                   PanelOLS   R-squared (Between):             -0.4685
No. Observations:                 162   R-squared (Within):               0.2999
Date:                Thu, Apr 01 2021   R-squared (Overall):             -0.4152
Time:                        15:27:30   Log-likelihood                   -199.57
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      4.3201
Entities:                          31   P-value                           0.0012
Avg Obs:                       5.2258   Distribution:                   F(5,121)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             1.1195
                            

## Create panel data set for the economic features

In [22]:
df_eco_features

Unnamed: 0,Country,Time,GDP,CPI,DEBT
0,AUS,2000,21679.247842,4.457435,41.14750
1,AUS,2001,19490.861110,4.407135,40.40488
2,AUS,2002,20082.483267,2.981575,38.67284
3,AUS,2003,23447.031001,2.732596,35.66726
4,AUS,2004,30430.676437,2.343255,32.31054
...,...,...,...,...,...
652,USA,2015,56839.381774,0.118627,136.43000
653,USA,2016,57951.584082,1.261583,138.11100
654,USA,2017,60062.222313,2.130110,134.67420
655,USA,2018,62996.471285,2.442583,136.17960


In [23]:
gdp = df_eco_features.GDP
lgdp = np.log(gdp)

In [24]:
df_eco_features['GDP'] = lgdp
df_eco_features.columns = ['Country', 'Time', 'LogGDP', 'CPI', 'DEBT']

In [25]:
df_eco_features

Unnamed: 0,Country,Time,LogGDP,CPI,DEBT
0,AUS,2000,9.984111,4.457435,41.14750
1,AUS,2001,9.877701,4.407135,40.40488
2,AUS,2002,9.907603,2.981575,38.67284
3,AUS,2003,10.062499,2.732596,35.66726
4,AUS,2004,10.323206,2.343255,32.31054
...,...,...,...,...,...
652,USA,2015,10.947985,0.118627,136.43000
653,USA,2016,10.967363,1.261583,138.11100
654,USA,2017,11.003136,2.130110,134.67420
655,USA,2018,11.050834,2.442583,136.17960


## Create panel data set for the education indicators

In [26]:
df_educ_features_l1

Unnamed: 0,Country,Time,Years_schooling,Avg_class_size,Exp_educ
0,AUS,2013,12.6,23.725,9241.9922
1,AUS,2014,12.7,23.859,9257.9980
2,AUS,2015,12.8,23.821,9524.7178
3,AUS,2016,12.9,23.669,10022.5670
4,AUS,2017,12.9,23.613,10238.4130
...,...,...,...,...,...
128,GBR,2013,12.6,25.404,10615.3770
129,GBR,2014,12.7,25.296,11276.6310
130,GBR,2015,12.8,25.988,11715.1060
131,GBR,2016,12.9,25.937,11350.0200


## Create panel data set for the labour market indicators

In [27]:
df_labour_features

Unnamed: 0,Country,Time,protection_of_workers,short_time_workers,involuntary_pt_workers,ft_and_pt_employ,marginally_attached_workers,employees_bargain,years_schooling,avg_class_size,educ_spendings
0,AUS,2016,1.7,0.892538,27.952453,19.342875,5.644458,60.0,12.9,22.158,8795.3633
1,AUS,2014,1.7,0.532556,28.277395,18.917433,5.584873,60.1,12.7,23.785,8107.4346
2,AUT,2013,1.8,0.4797,11.341942,18.91349,3.342615,98.0,11.9,21.022,10486.676
3,AUT,2014,1.8,0.392421,10.955043,19.574962,3.614805,98.0,12.1,20.977,10661.813
4,AUT,2015,1.8,0.454537,11.85259,19.793941,3.678977,98.0,12.1,20.921,11193.469
5,EST,2015,1.934,0.526973,12.138937,6.022114,4.059337,18.6,12.7,18.148,5838.8569
6,FIN,2014,2.518,0.466532,23.873874,7.418025,5.135235,89.3,12.4,19.656,8765.7246
7,FIN,2015,2.518,0.435119,25.849081,7.945387,5.302583,89.3,12.4,19.656,9286.6465
8,FRA,2014,2.812,0.077543,38.833678,12.448917,1.401585,98.5,11.4,25.262,6860.5566
9,DEU,2013,2.332,0.101407,14.437775,20.824191,1.469701,57.6,14.0,24.285,7958.0161


## Check for multicollinearity

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

# OLS on panel data

## Panel OLS

Je pense qu'il faut supprimer la variable `Min_Wage`, c'est elle qui pose des problèmes de multicollinéarité, et en plus ça facilite notre étude comme ça on a plus que 2 variables explicatives.

### Without state or time fixed effects

In [29]:
panel_data_l1_ols = panel_data_l1.set_index(['Country', 'Time'])
panel_data_l1_ols = sm.add_constant(panel_data_l1_ols)

  return ptp(axis=axis, out=out, **kwargs)


In [30]:
basic_ols = PanelOLS(panel_data_l1_ols.NEET, panel_data_l1_ols[['const', 'Exp_LMP', 'STR']]).fit(cov_type='clustered')
print(basic_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.1582
Estimator:                   PanelOLS   R-squared (Between):              0.1462
No. Observations:                 128   R-squared (Within):               0.1655
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.1582
Time:                        15:27:30   Log-likelihood                   -341.29
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      11.744
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                   F(2,125)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             11.956
                            

### With only state fixed effects

In [31]:
panel_ols_sfe = PanelOLS(panel_data_l1_ols.NEET, panel_data_l1_ols[['const', 'Exp_LMP', 'STR']], entity_effects=True).fit(cov_type='clustered', cluster_entity=True)

In [32]:
print(panel_ols_sfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.4193
Estimator:                   PanelOLS   R-squared (Between):             -0.7526
No. Observations:                 128   R-squared (Within):               0.4193
Date:                Thu, Apr 01 2021   R-squared (Overall):             -0.6214
Time:                        15:27:30   Log-likelihood                   -185.20
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      36.830
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                   F(2,102)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             40.866
                            

### With both state and time fixed effects

In [33]:
panel_ols_stfe = PanelOLS(panel_data_l1_ols.NEET, panel_data_l1_ols[['const', 'Exp_LMP', 'STR']], entity_effects=True, time_effects=True).fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

In [34]:
print(panel_ols_stfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.1816
Estimator:                   PanelOLS   R-squared (Between):             -0.1802
No. Observations:                 128   R-squared (Within):               0.3225
Date:                Thu, Apr 01 2021   R-squared (Overall):             -0.1037
Time:                        15:27:30   Log-likelihood                   -154.25
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      10.764
Entities:                          24   P-value                           0.0001
Avg Obs:                       5.3333   Distribution:                    F(2,97)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             10.622
                            

## Panel OLS with economic features

In [36]:
added_features = df_eco_features[df_eco_features.Time.isin(years)].reset_index(drop=True)
df_eco_neet = pd.DataFrame(columns=['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Exp_LMP_2', 'STR_2', 'STR_3', 'LogGDP', 'CPI', 'DEBT'])

In [37]:
for obs in panel_data_l1.itertuples():
    country, time = obs[1], obs[2]
    new_el = added_features.loc[(added_features.Country==country)&(added_features.Time==time)]
    if len(new_el.index) > 0:
        line = {'Country':country, 'Time':time, 'NEET':obs[3], 'Exp_LMP':obs[4], 'STR':obs[5], 'Exp_LMP_2':obs[7], 'STR_2':obs[9], 'STR_3':obs[10], 'LogGDP':new_el.LogGDP.values[0], 'CPI':new_el.CPI.values[0], 'DEBT':new_el.DEBT.values[0]}
        df_eco_neet = df_eco_neet.append(line, ignore_index=True)

In [38]:
df_eco_neet_ols = df_eco_neet.set_index(['Country', 'Time'])

In [39]:
df_eco_neet_ols = sm.add_constant(df_eco_neet_ols)

In [40]:
df_eco_neet_ols

Unnamed: 0_level_0,Unnamed: 1_level_0,const,NEET,Exp_LMP,STR,Exp_LMP_2,STR_2,STR_3,LogGDP,CPI,DEBT
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AUS,2013,1.0,13.015899,0.87,15.615,0.7569,243.828225,3807.377733,11.129468,2.449889,55.70968
AUS,2014,1.0,12.647472,0.93,15.612,0.8649,243.734544,3805.183701,11.043094,2.487923,61.36842
AUS,2015,1.0,11.831610,0.91,15.433,0.8281,238.177489,3675.793188,10.946512,1.508367,64.18005
AUS,2016,1.0,11.352150,0.86,15.168,0.7396,230.068224,3489.674822,10.819201,1.276991,68.39156
AUS,2017,1.0,10.946128,0.85,15.124,0.7225,228.735376,3459.393827,10.897257,1.948647,65.60463
...,...,...,...,...,...,...,...,...,...,...,...
USA,2014,1.0,15.047262,0.28,15.435,0.0784,238.239225,3677.222438,10.916265,1.622223,135.58440
USA,2015,1.0,14.380193,0.28,15.354,0.0784,235.745316,3619.633582,10.947985,0.118627,136.43000
USA,2016,1.0,14.118049,0.26,15.216,0.0676,231.526656,3522.909598,10.967363,1.261583,138.11100
USA,2017,1.0,13.280724,0.24,15.182,0.0576,230.493124,3499.346609,11.003136,2.130110,134.67420


### Regression without time fixed effect

In [61]:
panel_eco_ols = PanelOLS(df_eco_neet_ols.NEET, df_eco_neet_ols.drop(['NEET'], axis=1)).fit(cov_type='clustered')

In [62]:
print(panel_eco_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.6077
Estimator:                   PanelOLS   R-squared (Between):              0.5914
No. Observations:                 122   R-squared (Within):               0.3497
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.6077
Time:                        15:31:08   Log-likelihood                   -280.52
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      21.877
Entities:                          23   P-value                           0.0000
Avg Obs:                       5.3043   Distribution:                   F(8,113)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             118.95
                            

### Regression with time fixed effect

In [59]:
panel_eco_ols_tfe = PanelOLS(df_eco_neet_ols.NEET, df_eco_neet_ols.drop(['NEET'], axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [60]:
print(panel_eco_ols_tfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.6022
Estimator:                   PanelOLS   R-squared (Between):              0.5915
No. Observations:                 122   R-squared (Within):               0.3706
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.6066
Time:                        15:30:29   Log-likelihood                   -275.73
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      20.439
Entities:                          23   P-value                           0.0000
Avg Obs:                       5.3043   Distribution:                   F(8,108)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):          3.097e+14
                            

## Panel OLS with education features

In [90]:
added_educ_features = df_educ_features_l1[df_educ_features_l1.Time.isin(years)].reset_index(drop=True)
df_educ_neet = pd.DataFrame(columns=['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Exp_LMP_2', 'STR_2', 'STR_3', 'Years_schooling', 'Avg_class_size', 'LogExp_educ'])

In [92]:
for obs in panel_data_l1.itertuples():
    country, time = obs[1], obs[2]
    new_el = added_educ_features.loc[(added_educ_features.Country==country)&(added_educ_features.Time==time)]
    if len(new_el.index) > 0:
        line = {'Country':country, 'Time':time, 'NEET':obs[3], 'Exp_LMP':obs[4], 'STR':obs[5], 'Exp_LMP_2':obs[7], 'STR_2':obs[9], 'STR_3':obs[10], 'Years_schooling':new_el.Years_schooling.values[0], 'Avg_class_size':new_el.Avg_class_size.values[0], 'LogExp_educ':np.log(new_el.Exp_educ.values[0])}
        df_educ_neet = df_educ_neet.append(line, ignore_index=True)

In [93]:
df_educ_neet_ols = df_educ_neet.set_index(['Country', 'Time'])

In [94]:
df_educ_neet_ols = sm.add_constant(df_educ_neet_ols)

  return ptp(axis=axis, out=out, **kwargs)


In [95]:
df_educ_neet_ols

Unnamed: 0_level_0,Unnamed: 1_level_0,const,NEET,Exp_LMP,STR,Exp_LMP_2,STR_2,STR_3,Years_schooling,Avg_class_size,LogExp_educ
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AUS,2013,1.0,13.015899,0.87,15.615,0.7569,243.828225,3807.377733,12.6,23.725,9.131513
AUS,2014,1.0,12.647472,0.93,15.612,0.8649,243.734544,3805.183701,12.7,23.859,9.133243
AUS,2015,1.0,11.831610,0.91,15.433,0.8281,238.177489,3675.793188,12.8,23.821,9.161646
AUS,2016,1.0,11.352150,0.86,15.168,0.7396,230.068224,3489.674822,12.9,23.669,9.212595
AUS,2017,1.0,10.946128,0.85,15.124,0.7225,228.735376,3459.393827,12.9,23.613,9.233902
...,...,...,...,...,...,...,...,...,...,...,...
ESP,2013,1.0,27.151373,3.53,13.756,12.4609,189.227536,2603.013985,9.5,21.606,8.849025
ESP,2014,1.0,24.314959,3.11,13.544,9.6721,183.439936,2484.510493,9.7,21.713,8.861131
ESP,2015,1.0,22.820313,2.58,13.655,6.6564,186.459025,2546.097986,9.7,21.878,8.903371
ESP,2016,1.0,21.675491,2.30,13.564,5.2900,183.982096,2495.533150,9.8,21.875,8.962530


### Regression without time fixed effect

In [96]:
panel_educ_ols = PanelOLS(df_educ_neet_ols.NEET, df_educ_neet_ols.drop(['NEET'], axis=1)).fit(cov_type='clustered')

In [97]:
print(panel_educ_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.8069
Estimator:                   PanelOLS   R-squared (Between):              0.8790
No. Observations:                  79   R-squared (Within):               0.2813
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.8069
Time:                        16:12:14   Log-likelihood                   -156.73
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      36.573
Entities:                          17   P-value                           0.0000
Avg Obs:                       4.6471   Distribution:                    F(8,70)
Min Obs:                       3.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             178.31
                            

### Regression with time fixed effect

In [98]:
panel_educ_ols_tfe = PanelOLS(df_educ_neet_ols.NEET, df_educ_neet_ols.drop(['NEET'], axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [99]:
print(panel_educ_ols_tfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.8109
Estimator:                   PanelOLS   R-squared (Between):              0.8783
No. Observations:                  79   R-squared (Within):               0.2800
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.8062
Time:                        16:14:13   Log-likelihood                   -153.32
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      35.385
Entities:                          17   P-value                           0.0000
Avg Obs:                       4.6471   Distribution:                    F(8,66)
Min Obs:                       3.0000                                           
Max Obs:                       5.0000   F-statistic (robust):         -1.772e+13
                            

## Panel OLS with economic and education features

Ici, supprimer juste `Min_Wage` c'est pas suffisant pour retirer la multicollinéarité, il faut enlever en plus `Exp_educ`. Après c'est une variable de contrôle alors je sais pas si c'est si grave que ça.

In [43]:
reduced_obs = df_eco_neet[df_eco_neet.Time.isin(years[:-1])].reset_index(drop=True)
df_eco_educ_neet = pd.DataFrame(columns=['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Exp_LMP_2', 'STR_2', 'STR_3', 'LogGDP', 'CPI', 'DEBT', 'Years_schooling', 'Avg_class_size', 'LogExp_educ'])

In [44]:
for obs in reduced_obs.itertuples():
    country, time = obs[1], obs[2]
    new_el = df_educ_features_l1.loc[(df_educ_features_l1.Country==country)&(df_educ_features_l1.Time==time)]
    if len(new_el.index) > 0:
        line = {'Country':country, 'Time':time, 'NEET':obs[3], 'Exp_LMP':obs[4], 'STR':obs[5], 'Exp_LMP_2':obs[6], 'STR_2':obs[7], 'STR_3':obs[8], 'LogGDP':obs[9], 'CPI':obs[10], 'DEBT':obs[11], 'Years_schooling':new_el.Years_schooling.values[0], 'Avg_class_size':new_el.Avg_class_size.values[0], 'LogExp_educ':np.log(new_el.Exp_educ.values[0])}
        df_eco_educ_neet = df_eco_educ_neet.append(line, ignore_index=True)

In [45]:
df_eco_educ_neet

Unnamed: 0,Country,Time,NEET,Exp_LMP,STR,Exp_LMP_2,STR_2,STR_3,LogGDP,CPI,DEBT,Years_schooling,Avg_class_size,LogExp_educ
0,AUS,2013,13.015899,0.87,15.615,0.7569,243.828225,3807.377733,11.129468,2.449889,55.70968,12.6,23.725,9.131513
1,AUS,2014,12.647472,0.93,15.612,0.8649,243.734544,3805.183701,11.043094,2.487923,61.36842,12.7,23.859,9.133243
2,AUS,2015,11.831610,0.91,15.433,0.8281,238.177489,3675.793188,10.946512,1.508367,64.18005,12.8,23.821,9.161646
3,AUS,2016,11.352150,0.86,15.168,0.7396,230.068224,3489.674822,10.819201,1.276991,68.39156,12.9,23.669,9.212595
4,AUS,2017,10.946128,0.85,15.124,0.7225,228.735376,3459.393827,10.897257,1.948647,65.60463,12.9,23.613,9.233902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,ESP,2013,27.151373,3.53,13.756,12.4609,189.227536,2603.013985,10.277102,1.408546,106.55730,9.5,21.606,8.849025
75,ESP,2014,24.314959,3.11,13.544,9.6721,183.439936,2484.510493,10.290841,-0.150870,119.47340,9.7,21.713,8.861131
76,ESP,2015,22.820313,2.58,13.655,6.6564,186.459025,2546.097986,10.155491,-0.500461,117.07530,9.7,21.878,8.903371
77,ESP,2016,21.675491,2.30,13.564,5.2900,183.982096,2495.533150,10.185102,-0.202672,117.33050,9.8,21.875,8.962530


In [46]:
df_eco_educ_neet_ols = df_eco_educ_neet.set_index(['Country', 'Time'])

In [47]:
df_eco_educ_neet_ols = sm.add_constant(df_eco_educ_neet_ols)

In [48]:
panel_eco_educ_ols_te = PanelOLS(df_eco_educ_neet_ols.NEET, df_eco_educ_neet_ols.drop(['NEET'], axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [49]:
print(panel_eco_educ_ols_te)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.8262
Estimator:                   PanelOLS   R-squared (Between):              0.8840
No. Observations:                  79   R-squared (Within):               0.2723
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.8111
Time:                        15:27:32   Log-likelihood                   -149.98
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      27.233
Entities:                          17   P-value                           0.0000
Avg Obs:                       4.6471   Distribution:                   F(11,63)
Min Obs:                       3.0000                                           
Max Obs:                       5.0000   F-statistic (robust):         -7.928e+14
                            

In [50]:
panel_eco_educ_ols = PanelOLS(df_eco_educ_neet_ols.NEET, df_eco_educ_neet_ols.drop(['NEET'], axis=1)).fit(cov_type='clustered')

In [51]:
print(panel_eco_educ_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.8170
Estimator:                   PanelOLS   R-squared (Between):              0.8865
No. Observations:                  79   R-squared (Within):               0.3151
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.8170
Time:                        15:27:32   Log-likelihood                   -154.61
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      27.199
Entities:                          17   P-value                           0.0000
Avg Obs:                       4.6471   Distribution:                   F(11,67)
Min Obs:                       3.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             162.53
                            

to print summary to latex format: print(panel_eco_educ_ols.summary.as_latex())

## Panel OLS with economic, education and labour market features

In [110]:
reduced_obs = df_eco_educ_neet[df_eco_educ_neet.Time.isin(years[:-1])].reset_index(drop=True)
df_eco_educ_lm_neet = pd.DataFrame(columns=['Country', 'Time', 'NEET', 'Exp_LMP','Exp_LMP_2', 'STR', 'STR_2','STR_3','LogGDP', 'CPI', 'DEBT', 'Years_schooling', 'Avg_class_size', 'LogExp_educ','Protection_of_Workers','PT_FT_employ'])

In [114]:
for obs in reduced_obs.itertuples():
    country, time = obs[1], obs[2]
    new_el = df_labour_features.loc[(df_labour_features.Country==country)&(df_labour_features.Time==time)]
    if len(new_el.index) > 0:
        line = {'Country':country, 'Time':time, 'NEET':obs[3], 'Exp_LMP':obs[4],'Exp_LMP_2':obs[6], 'STR':obs[5],'STR_2':obs[7],'STR_3':obs[8], 'LogGDP':obs[9], 'CPI':obs[10], 'DEBT':obs[11], 'Years_schooling':obs[12], 'Avg_class_size':obs[13], 'LogExp_educ':obs[14],'Protection_of_Workers':new_el.protection_of_workers.values[0],'PT_FT_employ':(new_el.ft_and_pt_employ.values[0])/100}
        df_eco_educ_lm_neet = df_eco_educ_lm_neet.append(line, ignore_index=True)

In [116]:
df_eco_educ_lm_neet_ols = df_eco_educ_lm_neet.set_index(['Country', 'Time'])

In [117]:
df_eco_educ_lm_neet_ols = sm.add_constant(df_eco_educ_lm_neet_ols)

  return ptp(axis=axis, out=out, **kwargs)


In [121]:
panel_eco_educ_lm_ols = PanelOLS(df_eco_educ_lm_neet_ols.NEET, df_eco_educ_lm_neet_ols.drop(['NEET'], axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [122]:
print(panel_eco_educ_lm_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.9873
Estimator:                   PanelOLS   R-squared (Between):              0.9475
No. Observations:                  23   R-squared (Within):               0.0357
Date:                Thu, Apr 01 2021   R-squared (Overall):              0.9008
Time:                        17:44:34   Log-likelihood                   -13.169
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      35.748
Entities:                          12   P-value                           0.0001
Avg Obs:                       1.9167   Distribution:                    F(13,6)
Min Obs:                       1.0000                                           
Max Obs:                       3.0000   F-statistic (robust):         -7.942e+14
                            