In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS

Les années communes à tous les jeux de données sont 2013–2018.

Pour le taux de NEET, j'ai retenu la tranche d'âge des 15–29 ans. Pour le ratio élèves/enseignant (STR), j'ai retenu le niveau `L1` (CITE 2011 niveau 1, enseignement primaire).

# Import data and create panel data sets

## Import data

In [2]:
# Load the five raw CSV files from the working directory. The target names
# and the paths below are matched by position.
neet_rate, exp_lmp, student_teacher_ratio, min_wage, df_eco_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './neet_rate_1997_2018.csv',
        './exp_LMP_2004_2018.csv',
        './str_2013_2018.csv',
        './min_wage_2001_2019.csv',
        './economic_features.csv',
    )
)

## Create panel data set for the NEET rate and the 3 explanatory variables

In [3]:
# NEET table: keep only the '15_29' rows; the Subject column is then constant,
# so it is dropped and the row index is renumbered from 0.
neet_rate = (
    neet_rate.loc[neet_rate['Subject'] == '15_29']
    .drop(columns=['Subject'])
    .reset_index(drop=True)
)
# Student-teacher ratio table: keep only the rows whose level category is 'L1'.
student_teacher_ratio = student_teacher_ratio.loc[
    student_teacher_ratio['ISC11_LEVEL_CAT'] == 'L1'
]

In [4]:
# Three-letter country code -> country name, for the countries considered in
# the panel. Insertion order is the iteration order used when the panel is built.
oecd_countries = {
    'AUS': 'Australia',
    'AUT': 'Austria',
    'BEL': 'Belgium',
    'CAN': 'Canada',
    'CHL': 'Chile',
    'COL': 'Colombia',
    'CZE': 'Czech Republic',
    'DNK': 'Denmark',
    'EST': 'Estonia',
    'FIN': 'Finland',
    'FRA': 'France',
    'DEU': 'Germany',
    'GRC': 'Greece',
    'HUN': 'Hungary',
    'ISL': 'Iceland',
    'IRL': 'Ireland',
    'ISR': 'Israel',
    'ITA': 'Italy',
    'JPN': 'Japan',
    'KOR': 'Korea',
    'LVA': 'Latvia',
    'LTU': 'Lithuania',
    'LUX': 'Luxembourg',
    'MEX': 'Mexico',
    'NLD': 'Netherlands',
    'NZL': 'New Zealand',
    'NOR': 'Norway',
    'POL': 'Poland',
    'PRT': 'Portugal',
    'SVK': 'Slovakia',
    'SVN': 'Slovenia',
    'ESP': 'Spain',
    'SWE': 'Sweden',
    'CHE': 'Switzerland',
    'TUR': 'Turkey',
    'GBR': 'United Kingdom',
    'USA': 'United States',
}
# The codes alone, in the same order as the mapping.
code_countries = list(oecd_countries)

In [5]:
# Build the country/year panel: one row per (country, year) for which all four
# indicators (NEET, Exp_LMP, STR, Min_Wage) have a non-missing observation.
#
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
panel_columns = ['Country', 'Time', 'NEET', 'Exp_LMP', 'STR', 'Min_Wage']
years = list(range(2013, 2019))  # 2013..2018 inclusive

# Value column name -> source table holding that column.
indicator_sources = {
    'NEET': neet_rate,
    'Exp_LMP': exp_lmp,
    'STR': student_teacher_ratio,
    'Min_Wage': min_wage,
}

records = []
for country in oecd_countries.keys():
    # Restrict every source table to this country and the study years once,
    # instead of re-filtering the full tables for each year.
    country_slices = {
        column: source[(source.Country == country) & (source.Time.isin(years))]
        for column, source in indicator_sources.items()
    }
    for year in years:
        observed = {
            column: country_slice.loc[country_slice.Time == year, column].values
            for column, country_slice in country_slices.items()
        }
        # A (country, year) pair is kept only when every indicator has at least
        # one matching row and its first value is not missing. If several rows
        # match, the first one is used.
        is_complete = all(
            values.size > 0 and pd.notna(values[0]) for values in observed.values()
        )
        if is_complete:
            record = {'Country': country, 'Time': year}
            record.update({column: values[0] for column, values in observed.items()})
            records.append(record)

# `columns=` fixes the column order and keeps the expected columns even when no
# (country, year) pair qualifies.
full_df = pd.DataFrame(records, columns=panel_columns)

In [6]:
# Sorted, de-duplicated country codes that made it into the panel.
np.unique(full_df['Country'])

array(['AUS', 'BEL', 'CAN', 'CHL', 'CZE', 'DEU', 'ESP', 'EST', 'FRA',
       'HUN', 'IRL', 'ISR', 'JPN', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD',
       'NZL', 'POL', 'PRT', 'SVK', 'SVN', 'USA'], dtype=object)

In [7]:
# Add one 0/1 indicator column per country present in the panel, named
# 'bin_<code>' (1 on that country's rows, 0 elsewhere).
for country_code in np.unique(full_df['Country']):
    full_df['bin_' + country_code] = full_df['Country'].eq(country_code).astype(int)

In [8]:
# Integer cluster id per country, used for the clustered standard errors.
#
# pd.factorize numbers the distinct countries 0, 1, 2, ... in order of first
# appearance. Unlike a "did the country change since the previous row?" counter,
# this does not depend on which country comes first, and a country whose rows
# are not contiguous still gets a single id.
groupid = pd.factorize(full_df['Country'])[0]
full_df['groupid'] = groupid

In [9]:
# Move Country and Time out of the columns into a two-level row index
# (Country first, then Time), then display the resulting panel.
full_df = full_df.set_index(keys=['Country', 'Time'])
full_df

Unnamed: 0_level_0,Unnamed: 1_level_0,NEET,Exp_LMP,STR,Min_Wage,bin_AUS,bin_BEL,bin_CAN,bin_CHL,bin_CZE,bin_DEU,...,bin_LVA,bin_MEX,bin_NLD,bin_NZL,bin_POL,bin_PRT,bin_SVK,bin_SVN,bin_USA,groupid
Country,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AUS,2013,13.015899,0.87,15.615,23283.766881,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AUS,2014,12.647472,0.93,15.612,23356.492667,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AUS,2015,11.831610,0.91,15.433,23641.395398,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AUS,2016,11.352150,0.86,15.168,23915.419580,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AUS,2017,10.946128,0.85,15.124,24128.731046,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USA,2014,15.047262,0.28,15.435,16285.276127,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
USA,2015,14.380193,0.28,15.354,16265.980260,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
USA,2016,14.118049,0.26,15.216,16063.328012,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23
USA,2017,13.280724,0.24,15.182,15728.297963,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,23


## Create panel data set for the economic features

In [20]:
# Index the economic-features table by (Country, Time), in that order.
df_eco_features = df_eco_features.set_index(keys=['Country', 'Time'])

## Create panel data set for the education indicators

## Create panel data set for the labour market indicators

# OLS on panel data

In [10]:
# Dependent variable and regressors for the pooled OLS; add_constant adds the
# intercept column 'const' to the three explanatory variables.
Y_bols = full_df['NEET']
X_bols = sm.add_constant(full_df[['Exp_LMP', 'STR', 'Min_Wage']])

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix; each column is scored in turn with statsmodels'
        ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with two columns: ``"variables"`` (the
        column name) and ``"VIF"`` (its variance inflation factor).
    """
    design_matrix = X.values
    vif_per_column = [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({"variables": X.columns, "VIF": vif_per_column})


# X_bols already contains the 'const' column, so the table also has a row for it.
calc_vif(X_bols)

Unnamed: 0,variables,VIF
0,const,26.576924
1,Exp_LMP,1.380412
2,STR,1.053941
3,Min_Wage,1.410139


## Basic OLS without state or time fixed effects

In [12]:
# Pooled OLS of NEET on the constant and the three explanatory variables,
# with the default (non-robust) covariance.
basic_ols = sm.OLS(endog=Y_bols, exog=X_bols).fit()

In [13]:
# Print the plain-text regression summary of the pooled OLS fit.
print(basic_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     64.30
Date:                Sun, 28 Mar 2021   Prob (F-statistic):           3.81e-25
Time:                        09:33:29   Log-Likelihood:                -292.26
No. Observations:                 128   AIC:                             592.5
Df Residuals:                     124   BIC:                             603.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.8227      1.099     12.579      0.0

## OLS with only state fixed effects

### Use $n-1$ country dummy variables, with France as the reference category

In [14]:
# Regressors for the country-fixed-effects model: everything in full_df except
# the dependent variable (NEET), Min_Wage, STR, the France dummy (bin_FRA, the
# omitted reference category) and the cluster id. add_constant then adds the
# intercept column.
Y_sfe = full_df['NEET']
X_sfe = sm.add_constant(
    full_df.drop(columns=['NEET', 'Min_Wage', 'STR', 'bin_FRA', 'groupid'])
)

In [15]:
# Fit the dummy-variable regression with standard errors clustered on the
# per-country group id, then print the plain-text summary.
sfe_ols = sm.OLS(endog=Y_sfe, exog=X_sfe).fit(
    cov_type='cluster',
    cov_kwds={'groups': full_df['groupid']},
)
print(sfe_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                   NEET   R-squared:                       0.925
Model:                            OLS   Adj. R-squared:                  0.907
Method:                 Least Squares   F-statistic:                     32.62
Date:                Sun, 28 Mar 2021   Prob (F-statistic):           8.14e-06
Time:                        09:33:29   Log-Likelihood:                -186.69
No. Observations:                 128   AIC:                             423.4
Df Residuals:                     103   BIC:                             494.7
Df Model:                          24                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.1360      1.884      1.134      0.2



In [16]:
# Dependent variable plus the three explanatory variables, with an intercept
# column 'const' added by add_constant.
test_df = sm.add_constant(full_df[['NEET', 'Exp_LMP', 'STR', 'Min_Wage']])

In [17]:
# PanelOLS on the (Country, Time)-indexed frame. No entity or time effects are
# requested here, and the covariance estimator is the unadjusted one.
panel_ols = PanelOLS(
    dependent=test_df['NEET'],
    exog=test_df[['const', 'Exp_LMP', 'STR', 'Min_Wage']],
).fit(cov_type='unadjusted')

In [18]:
# Print the text form of the fitted PanelOLS results.
print(panel_ols)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   NEET   R-squared:                        0.6087
Estimator:                   PanelOLS   R-squared (Between):              0.6326
No. Observations:                 128   R-squared (Within):               0.5537
Date:                Sun, Mar 28 2021   R-squared (Overall):              0.6087
Time:                        09:33:29   Log-likelihood                   -292.26
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      64.297
Entities:                          24   P-value                           0.0000
Avg Obs:                       5.3333   Distribution:                   F(3,124)
Min Obs:                       2.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             64.297
                            