# Library importation

In [1]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels.panel.results import PanelModelComparison
import statsmodels.api as sm

# Data importation

In [2]:
df = pd.read_csv('./data/panel_data/Full_DB.csv')

In [3]:
df.columns

Index(['const', 'Country', 'Time', 'NEET', 'Exp_LMP', 'Exp_LMP_2', 'Exp_LMP_3',
       'STR', 'STR_2', 'STR_3', 'GDP', 'LogGDP', 'CPI', 'DEBT',
       'Years_schooling', 'Avg_class_size', 'Exp_educ', 'LogExp_educ',
       'Strictness_of_workers', 'PT_employ', 'Avg_dur_unemployment'],
      dtype='object')

# Regressions

## With explanatory variables only

### Without time and entity effects

In [4]:
df_explanatory = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3']]
# df_explanatory = df_explanatory.dropna().reset_index(drop=True)
df_explanatory = df_explanatory.set_index(['Country', 'Time'])

In [5]:
df_explanatory

Unnamed: 0_level_0,Unnamed: 1_level_0,const,NEET,Exp_LMP,Exp_LMP_2,STR,STR_2,STR_3
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AUS,2013,1.0,13.015899,0.87,0.7569,15.615,243.828225,3807.377733
AUS,2014,1.0,12.647472,0.93,0.8649,15.612,243.734544,3805.183701
AUS,2015,1.0,11.831610,0.91,0.8281,15.433,238.177489,3675.793188
AUS,2016,1.0,11.352150,0.86,0.7396,15.168,230.068224,3489.674822
AUS,2017,1.0,10.946128,0.85,0.7225,15.124,228.735376,3459.393827
...,...,...,...,...,...,...,...,...
LTU,2014,1.0,14.185811,0.43,0.1849,10.233,104.714289,1071.541319
LTU,2015,1.0,13.743647,0.53,0.2809,10.256,105.185536,1078.782857
LTU,2016,1.0,11.403278,0.51,0.2601,10.470,109.620900,1147.730823
LTU,2017,1.0,11.220660,0.54,0.2916,10.619,112.763161,1197.432007


In [6]:
panel_explanatory_OLS = PanelOLS(df_explanatory.NEET, df_explanatory.drop('NEET', axis=1)).fit(cov_type='clustered')

In [7]:
print(panel_explanatory_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.2586
Estimator:                   PanelOLS   R-squared (Between):              0.1785
No. Observations:                 167   R-squared (Within):               0.1610
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.2586
Time:                        13:02:03   Log-likelihood                   -451.96
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      11.230
Entities:                          31   P-value                           0.0000
Avg Obs:                       5.3871   Distribution:                   F(5,161)
Min Obs:                       2.0000                                           
Max Obs:                       9.0000   F-statistic (robust):             70.129
                            

### With Time effect only

In [8]:
panel_explanatory_OLS_tfe = PanelOLS(df_explanatory.NEET, df_explanatory.drop('NEET', axis=1),time_effects = True).fit(cov_type='clustered', cluster_time=True)

In [9]:
print(panel_explanatory_OLS_tfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.2470
Estimator:                   PanelOLS   R-squared (Between):              0.1845
No. Observations:                 167   R-squared (Within):               0.1643
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.2573
Time:                        13:02:03   Log-likelihood                   -447.32
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      10.039
Entities:                          31   P-value                           0.0000
Avg Obs:                       5.3871   Distribution:                   F(5,153)
Min Obs:                       2.0000                                           
Max Obs:                       9.0000   F-statistic (robust):             3778.9
                            

>We see that the influence of the time fixed effect cannot be omitted as the p-value for the F-test for Poolability is null showing that the influence of the time effect is not null.

### With Time and entity effects

In [31]:
panel_explanatory_OLS_stfe = PanelOLS(df_explanatory.NEET, df_explanatory.drop('NEET', axis=1),entity_effects = True).fit(cov_type='clustered',cluster_effect=True)

In [32]:
print(panel_explanatory_OLS_stfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.3977
Estimator:                   PanelOLS   R-squared (Between):             -0.7485
No. Observations:                 167   R-squared (Within):               0.3977
Date:                Fri, Apr 02 2021   R-squared (Overall):             -0.6738
Time:                        13:07:04   Log-likelihood                   -228.88
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      17.302
Entities:                          31   P-value                           0.0000
Avg Obs:                       5.3871   Distribution:                   F(5,131)
Min Obs:                       2.0000                                           
Max Obs:                       9.0000   F-statistic (robust):             18.757
                            

>We see here that the use of the entity fixed effect is not statistically significant:
   * First, for each coefficient the p-value is very high
   * Second, the p-value for the F-test for Poolability shows that the entity fixed effect is null

In [12]:
results = [panel_explanatory_OLS,panel_explanatory_OLS_tfe,panel_explanatory_OLS_stfe]
print(PanelModelComparison(results,precision='std_errors',stars=True))

                          Model Comparison                         
                               Model 0        Model 1       Model 2
-------------------------------------------------------------------
Dep. Variable                     NEET           NEET          NEET
Estimator                     PanelOLS       PanelOLS      PanelOLS
No. Observations                   167            167           167
Cov. Est.                    Clustered      Clustered     Clustered
R-squared                       0.2586         0.2470        0.1745
R-Squared (Within)              0.1610         0.1643        0.3029
R-Squared (Between)             0.1785         0.1845       -0.2528
R-Squared (Overall)             0.2586         0.2573       -0.2024
F-statistic                     11.230         10.039        5.1991
P-value (F-stat)                0.0000         0.0000        0.0002
const                         -21.512*     -21.830***        1.2941
                              (12.089)       (6.

## With economic features only

In [13]:
df_eco = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3','LogGDP','CPI','DEBT']]
df_eco = df_eco.dropna().reset_index(drop=True)
df_eco = df_eco.set_index(['Country', 'Time'])

In [14]:
panel_eco_OLS = PanelOLS(df_eco.NEET,df_eco.drop('NEET',axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [15]:
print(panel_eco_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.5226
Estimator:                   PanelOLS   R-squared (Between):              0.4604
No. Observations:                 161   R-squared (Within):               0.3105
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.5232
Time:                        13:02:04   Log-likelihood                   -397.06
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      19.701
Entities:                          30   P-value                           0.0000
Avg Obs:                       5.3667   Distribution:                   F(8,144)
Min Obs:                       2.0000                                           
Max Obs:                       9.0000   F-statistic (robust):          6.036e+11
                            

## With education features only

In [16]:
df_edu = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3','Years_schooling','LogExp_educ']]
df_edu = df_edu.dropna().reset_index(drop=True)
df_edu = df_edu.set_index(['Country', 'Time'])

In [17]:
panel_edu_OLS = PanelOLS(df_edu.NEET,df_edu.drop('NEET',axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [18]:
print(panel_edu_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.6561
Estimator:                   PanelOLS   R-squared (Between):              0.6457
No. Observations:                 135   R-squared (Within):               0.2756
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.6593
Time:                        13:02:04   Log-likelihood                   -310.72
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      32.707
Entities:                          29   P-value                           0.0000
Avg Obs:                       4.6552   Distribution:                   F(7,120)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):          6.185e+12
                            

## With labor features only

In [19]:
df_labor = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3','Strictness_of_workers','PT_employ']]
df_labor = df_labor.dropna().reset_index(drop=True)
df_labor = df_labor.set_index(['Country', 'Time'])

In [20]:
panel_labor_OLS = PanelOLS(df_labor.NEET,df_labor.drop('NEET',axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [21]:
print(panel_labor_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.3632
Estimator:                   PanelOLS   R-squared (Between):              0.3119
No. Observations:                 161   R-squared (Within):               0.1690
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.3757
Time:                        13:02:04   Log-likelihood                   -420.64
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      11.813
Entities:                          30   P-value                           0.0000
Avg Obs:                       5.3667   Distribution:                   F(7,145)
Min Obs:                       2.0000                                           
Max Obs:                       9.0000   F-statistic (robust):          2.214e+08
                            

## With education and economic features

In [22]:
df_eco_edu = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3','LogGDP','CPI','DEBT','Years_schooling','LogExp_educ']]
df_eco_edu = df_eco_edu.dropna().reset_index(drop=True)
df_eco_edu = df_eco_edu.set_index(['Country', 'Time'])

In [23]:
panel_eco_edu_OLS = PanelOLS(df_eco_edu.NEET,df_eco_edu.drop('NEET',axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [24]:
print(panel_eco_edu_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.7232
Estimator:                   PanelOLS   R-squared (Between):              0.7352
No. Observations:                 130   R-squared (Within):               0.2670
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.7240
Time:                        13:02:04   Log-likelihood                   -287.21
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      29.256
Entities:                          28   P-value                           0.0000
Avg Obs:                       4.6429   Distribution:                  F(10,112)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):          -8.25e+13
                            

## With labor, economic and education features

In [25]:
df_all = df[['const','Country','Time','NEET','Exp_LMP','Exp_LMP_2','STR','STR_2','STR_3','LogGDP','CPI','DEBT','Years_schooling','LogExp_educ','Strictness_of_workers','PT_employ']]
df_all = df_all.dropna().reset_index(drop=True)
df_all = df_all.set_index(['Country', 'Time'])

In [26]:
panel_all_OLS = PanelOLS(df_all.NEET,df_all.drop('NEET',axis=1), time_effects=True).fit(cov_type='clustered', cluster_time=True)

In [27]:
print(panel_all_OLS)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.7386
Estimator:                   PanelOLS   R-squared (Between):              0.7547
No. Observations:                 125   R-squared (Within):               0.1901
Date:                Fri, Apr 02 2021   R-squared (Overall):              0.7412
Time:                        13:02:05   Log-likelihood                   -274.99
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      24.726
Entities:                          27   P-value                           0.0000
Avg Obs:                       4.6296   Distribution:                  F(12,105)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):          8.146e+14
                            

## Recapitulative Table

In [28]:
full_res = [panel_eco_OLS,panel_edu_OLS,panel_eco_edu_OLS,panel_labor_OLS,panel_all_OLS]
print(PanelModelComparison(full_res,precision='std_errors',stars=True))

                                          Model Comparison                                          
                                 Model 0        Model 1        Model 2        Model 3        Model 4
----------------------------------------------------------------------------------------------------
Dep. Variable                       NEET           NEET           NEET           NEET           NEET
Estimator                       PanelOLS       PanelOLS       PanelOLS       PanelOLS       PanelOLS
No. Observations                     161            135            130            161            125
Cov. Est.                      Clustered      Clustered      Clustered      Clustered      Clustered
R-squared                         0.5226         0.6561         0.7232         0.3632         0.7386
R-Squared (Within)                0.3105         0.2756         0.2670         0.1690         0.1901
R-Squared (Between)               0.4604         0.6457         0.7352         0.3119      