In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import statsmodels.api as sm
import sklearn.preprocessing as sk
import seaborn as sns
from statsmodels.stats import diagnostic as dng
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
sns.set()

# création du modèle de régression linéaire

## import des variables

### variables de contrôle liées au marché du travail

In [2]:
employment_rate = pd.read_csv('./data/labour_market/EMP_RATE_by_birth_place_and_EDUC.csv')
min_wage = pd.read_csv('./data/labour_market/real minimum wage from 2001 to 2019.csv')
avg_duration_unemployment = pd.read_csv('./data/labour_market/average duration of unemployment.csv')
strictness_emply_protec = pd.read_csv('./data/labour_market/strictness of employment protection from 1998 to 2018.csv')
short_time_workers = pd.read_csv('./data/labour_market/percentage of short time workers from 2000.csv')
involuntary_pt_workers = pd.read_csv('./data/labour_market/percentage of involuntary part time workers from 2000.csv')
well_matched_skills_jobs = pd.read_csv('./data/labour_market/percentage of well matched skills for jobs.csv', dtype='unicode')
ft_and_pt_employ = pd.read_csv('./data/labour_market/percentage of full time part time employment.csv')
marginally_attached_workers = pd.read_csv('./data/labour_market/percentage of marginally attached workers from 2000.csv')
public_exp_LMP = pd.read_csv('./data/labour_market/public_exp_LMP.csv')
employees_bargain = pd.read_csv('./data/labour_market/percentage of employees that can bargain from 1998.csv')

### variables de contrôle liées à l'état du système éducatif

In [3]:
stratio = pd.read_csv('./data/education_system/student_teacher_ratio.csv')
educ_spendings = pd.read_csv('./data/education_system/spendings_in_education.csv')
vet_share = pd.read_csv('./data/education_system/share_of_VETs_from_2013_2018.csv')
neet = pd.read_csv("./neet_rate_1997_2018.csv")
years_school = pd.read_csv("./data/education_system/mean-years-of-schooling-world.csv")
level_from_parents = pd.read_csv("./data/education_system/education_level_depending_on_those_of_the_parents_2012_2015.csv")
avg_class_size = pd.read_csv("./data/education_system/Avg_class_size_primary_education.csv")

### variables de contrôles économiques

In [4]:
economic_features = pd.read_csv('./economic_features.csv').dropna(subset = ['GDP', 'CPI', 'DEBT'], inplace = False)

## création de la dataframe contenant les pays ayant toutes les variables disponibles sur une année pour chaque année

In [5]:
oecd_countries = {'AUS': 'Australia', 'AUT': 'Austria', 'BEL': 'Belgium', 'CAN': 'Canada', 'CHL': 'Chile', 'COL': 'Colombia', 'CZE': 'Czech Republic', 'DNK': 'Denmark', 'EST': 'Estonia', 'FIN': 'Finland', 'FRA': 'France', 'DEU': 'Germany', 'GRC': 'Greece', 'HUN': 'Hungary', 'ISL': 'Iceland', 'IRL': 'Ireland', 'ISR': 'Israel', 'ITA': 'Italy', 'JPN': 'Japan', 'KOR': 'Korea', 'LVA': 'Latvia', 'LTU': 'Lithuania', 'LUX': 'Luxembourg', 'MEX': 'Mexico', 'NLD': 'Netherlands', 'NZL': 'New Zealand', 'NOR': 'Norway', 'POL': 'Poland', 'PRT': 'Portugal', 'SVK': 'Slovakia', 'SVN': 'Slovenia', 'ESP': 'Spain', 'SWE': 'Sweden', 'CHE': 'Switzerland', 'TUR': 'Turkey', 'GBR': 'United Kingdom', 'USA': 'United States'}
neet_countries = neet["Country"].unique()
neet_countries_name = []
for country in neet_countries:
    neet_countries_name.append(oecd_countries[country])

In [6]:
min_wage.dropna(axis = 0, subset = ['Value'], inplace = True) ##causal variable 
employment_rate.dropna(subset = ['Value'], inplace = True) ## seulement pour 2015 ...
avg_duration_unemployment.dropna(subset = ['Value'], inplace = True)
strictness_emply_protec.dropna(subset = ['Value'], inplace = True)
short_time_workers.dropna(subset = ['Value'], inplace = True)
involuntary_pt_workers.dropna(subset = ['Value'], inplace = True)
well_matched_skills_jobs.dropna(subset = ['Value'], inplace = True)
ft_and_pt_employ.dropna(subset = ['Value'], inplace = True)
marginally_attached_workers.dropna(subset = ['Value'], inplace = True)
public_exp_LMP.dropna(subset = ['Value'], inplace = True) ##causal variable 
employees_bargain.dropna(subset = ['Value'], inplace = True)
stratio.dropna(subset = ['Value'], inplace = True) ##causal variable 
educ_spendings.dropna(subset = ['Value'], inplace = True)
vet_share.dropna(subset = ['Value'], inplace = True)
years_school.dropna(subset = ['Average Total Years of Schooling for Adult Population (Lee-Lee (2016), Barro-Lee (2018) and UNDP (2018))'], inplace = True)
level_from_parents.dropna(subset = ['Value'], inplace = True) ##seulement 2012 et 2015 ...
avg_class_size.dropna(subset = ['Value'], inplace = True)


In [7]:
well_matched_skills_jobs = well_matched_skills_jobs.astype({"Year" : int})
educ_spendings = educ_spendings[educ_spendings.Year != "Latest year"]
educ_spendings = educ_spendings.astype({"Year" : int})

In [8]:
avg_duration_unemployment = avg_duration_unemployment[(avg_duration_unemployment.Time >= 2013) & (avg_duration_unemployment.Time <= 2018)]
strictness_emply_protec = strictness_emply_protec[(strictness_emply_protec.Time >= 2013) & (strictness_emply_protec.Time <= 2018)]
short_time_workers = short_time_workers[(short_time_workers.Time >= 2013) & (short_time_workers.Time <= 2018)]
involuntary_pt_workers = involuntary_pt_workers[(involuntary_pt_workers.Time >= 2013) & (involuntary_pt_workers.Time <= 2018)]
well_matched_skills_jobs = well_matched_skills_jobs[(well_matched_skills_jobs.Year >= 2013) & (well_matched_skills_jobs.Year <= 2018)] #do after
ft_and_pt_employ = ft_and_pt_employ[(ft_and_pt_employ.Time >= 2013) & (ft_and_pt_employ.Time <= 2018)]
marginally_attached_workers = marginally_attached_workers[(marginally_attached_workers.Time >= 2013) & (marginally_attached_workers.Time <= 2018)]
employees_bargain = employees_bargain[(employees_bargain.Year >= 2013) & (employees_bargain.Year <= 2018)]
educ_spendings = educ_spendings[(educ_spendings.Year >= 2013) & (educ_spendings.Year <= 2018)]
vet_share = vet_share[(vet_share.Year >= 2013) & (vet_share.Year <= 2018)]
years_school = years_school[(years_school.Year >= 2013) & (years_school.Year <= 2018)]
avg_class_size = avg_class_size[(avg_class_size.Year >= 2013) & (avg_class_size.Year <= 2018)]
economic_features = economic_features[(economic_features.Time >= 2013) & (economic_features.Time < 2019)]

In [9]:
avg_duration_unemployment = avg_duration_unemployment[(avg_duration_unemployment.Age == "Total") & (avg_duration_unemployment.Sex == "All persons")]
avg_duration_unemployment = avg_duration_unemployment[['COUNTRY', 'Time', 'Value']]
strictness_emply_protec = strictness_emply_protec[strictness_emply_protec.Series == "Version 4 (2013-2019)"][['COUNTRY', 'Time', 'Value']]
short_time_workers.rename(columns = {"Employment status" : "status"}, inplace = True)
short_time_workers = short_time_workers[(short_time_workers.Age == "Total") & (short_time_workers.status == "Total employment") & (short_time_workers.Sex == "All persons")][['COUNTRY', 'Time', 'Value']]
involuntary_pt_workers.rename(columns = {"Employment status" : "status"}, inplace = True)
involuntary_pt_workers = involuntary_pt_workers[(involuntary_pt_workers.Age == "Total") & (involuntary_pt_workers.status == "Total employment") & (involuntary_pt_workers.Sex == "All persons") & (involuntary_pt_workers.SERIES == "SHINV_PT")][['COUNTRY', 'Time', 'Value']]
ft_and_pt_employ = ft_and_pt_employ[(ft_and_pt_employ.Sex == 'All persons') & (ft_and_pt_employ.Age == '25 to 54') & (ft_and_pt_employ.Series == 'Part-time employment') & (ft_and_pt_employ.EMPSTAT == "TE")][['COUNTRY', 'Time', 'Value']]
marginally_attached_workers = marginally_attached_workers[(marginally_attached_workers.Sex == 'All persons') & (marginally_attached_workers.Age == 'Total')][['COUNTRY', 'Time', 'Value']]
employees_bargain = employees_bargain[['COU', 'Year', 'Value']]
vet_share = vet_share[(vet_share.ISC11_LEVEL_CAT == 'L4_C5_SW')][['COUNTRY', 'Year', 'Value']]
years_school = years_school[['Entity', 'Year', 'Average Total Years of Schooling for Adult Population (Lee-Lee (2016), Barro-Lee (2018) and UNDP (2018))']]
avg_class_size = avg_class_size[(avg_class_size.ISC11_LEVEL_CAT == "L2") & (avg_class_size.REF_SECTOR == "INST_T") & (avg_class_size.INDICATOR == "PERS_AVG_CLASS")][['COUNTRY', 'Year', 'Value']]
educ_spendings = educ_spendings[(educ_spendings.ISC11 == "L1") & (educ_spendings.REF_SECTOR == "S13") & (educ_spendings.COUNTERPART_SECTOR == "INST_T")][['COUNTRY', 'Year', 'Value']]

In [10]:
employees_bargain.rename(columns = {"Year" : "Time"}, inplace = True)
vet_share.rename(columns = {"Year" : "Time"}, inplace = True)
years_school.rename(columns = {"Year" : "Time"}, inplace = True)
avg_class_size.rename(columns = {"Year" : "Time"}, inplace = True)
educ_spendings.rename(columns = {"Year" : "Time"}, inplace = True)
years_school.rename(columns = {"Entity" : "Country"}, inplace = True)
employees_bargain.rename(columns = {"COU" : "Country"}, inplace = True)
avg_duration_unemployment.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
strictness_emply_protec.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
short_time_workers.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
involuntary_pt_workers.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
ft_and_pt_employ.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
marginally_attached_workers.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
vet_share.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
avg_class_size.rename(columns = {"COUNTRY" : "Country"}, inplace = True)
educ_spendings.rename(columns = {"COUNTRY" : "Country"}, inplace = True)

In [11]:
avg_duration_unemployment.rename(columns = {'Value' : 'avg_unemploy'}, inplace = True)
strictness_emply_protec.rename(columns = {'Value' : 'protection of workers'}, inplace = True)
short_time_workers.rename(columns = {'Value' : 'short_time_workers'}, inplace = True)
involuntary_pt_workers.rename(columns = {'Value' : 'involuntary_pt_workers'}, inplace = True)
ft_and_pt_employ.rename(columns = {'Value' : 'ft_and_pt_employ'}, inplace = True)
marginally_attached_workers.rename(columns = {'Value' : 'marginally_attached_workers'}, inplace = True)
employees_bargain.rename(columns = {'Value' : 'employees_bargain'}, inplace = True)
vet_share.rename(columns = {'Value' : 'vet_share'}, inplace = True)
years_school.rename(columns = {'Average Total Years of Schooling for Adult Population (Lee-Lee (2016), Barro-Lee (2018) and UNDP (2018))' : 'years_schooling'}, inplace = True)
avg_class_size.rename(columns = {'Value' : 'avg_class_size'}, inplace = True)
educ_spendings.rename(columns = {'Value' : 'educ_spendings'}, inplace = True)

In [12]:
list(set(strictness_emply_protec['Country'].unique()) & set(short_time_workers['Country'].unique()) & set(involuntary_pt_workers['Country'].unique()) & set(ft_and_pt_employ['Country'].unique()) & set(marginally_attached_workers['Country'].unique()) & set(employees_bargain['Country'].unique()) & set(avg_class_size['Country'].unique()) & set(educ_spendings['Country'].unique()))

['PRT',
 'FRA',
 'AUT',
 'FIN',
 'CZE',
 'SVN',
 'GBR',
 'DNK',
 'HUN',
 'JPN',
 'EST',
 'ITA',
 'LVA',
 'ESP',
 'AUS',
 'SVK',
 'LTU',
 'DEU',
 'GRC',
 'TUR',
 'SWE',
 'POL']

In [13]:
l = []
for country in neet_countries:
    for year in {2013, 2014, 2015, 2016, 2017, 2018}:
        dict = {}
        avg_unemploy = False
        protection_of_workers = False
        st_workers = False
        in_pt_workers = False
        ft_and_pt_employment = False
        marginally_att_workers = False
        emplo_bargain = False
        vetshare = False
        years_schooling = False
        avgclass_size = False
        educspendings = False
        if country in avg_duration_unemployment['Country'].unique():
            if year in avg_duration_unemployment[avg_duration_unemployment.Country == country]['Time'].unique():
                avg_unemploy = avg_duration_unemployment[(avg_duration_unemployment.Country == country) & (avg_duration_unemployment.Time == year)]['avg_unemploy'].values[0]
        if country in strictness_emply_protec['Country'].unique():
            if year in strictness_emply_protec[strictness_emply_protec.Country == country]['Time'].unique():
                protection_of_workers = strictness_emply_protec[(strictness_emply_protec.Country == country) & (strictness_emply_protec.Time == year)]['protection of workers'].values[0]
        if country in short_time_workers['Country'].unique():
            if year in short_time_workers[short_time_workers.Country == country]['Time'].unique():
                st_workers = short_time_workers[(short_time_workers.Country == country) & (short_time_workers.Time == year)]['short_time_workers'].values[0]
        if country in involuntary_pt_workers['Country'].unique():
            if year in involuntary_pt_workers[involuntary_pt_workers.Country == country]['Time'].unique():
                in_pt_workers = involuntary_pt_workers[(involuntary_pt_workers.Country == country) & (involuntary_pt_workers.Time == year)]['involuntary_pt_workers'].values[0]
        if country in ft_and_pt_employ['Country'].unique():
            if year in ft_and_pt_employ[ft_and_pt_employ.Country == country]['Time'].unique():
                ft_and_pt_employment = ft_and_pt_employ[(ft_and_pt_employ.Country == country) & (ft_and_pt_employ.Time == year)]['ft_and_pt_employ'].values[0]
        if country in marginally_attached_workers['Country'].unique():
            if year in marginally_attached_workers[marginally_attached_workers.Country == country]['Time'].unique():
                marginally_att_workers = marginally_attached_workers[(marginally_attached_workers.Country == country) & (marginally_attached_workers.Time == year)]['marginally_attached_workers'].values[0]
        if country in employees_bargain['Country'].unique():
            if year in employees_bargain[employees_bargain.Country == country]['Time'].unique():
                emplo_bargain = employees_bargain[(employees_bargain.Country == country) & (employees_bargain.Time == year)]['employees_bargain'].values[0]
        if country in vet_share['Country'].unique():
            if year in vet_share[vet_share.Country == country]['Time'].unique():
                vetshare = vet_share[(vet_share.Country == country) & (vet_share.Time == year)]['vet_share'].values[0]
        if oecd_countries[country] in years_school['Country'].unique():
            if year in years_school[years_school.Country == oecd_countries[country]]['Time'].unique():
                years_schooling = years_school[(years_school.Country == oecd_countries[country]) & (years_school.Time == year)]['years_schooling'].values[0]
        if country in avg_class_size['Country'].unique():
            if year in avg_class_size[avg_class_size.Country == country]['Time'].unique():
                avgclass_size = avg_class_size[(avg_class_size.Country == country) & (avg_class_size.Time == year)]['avg_class_size'].values[0]
        if country in educ_spendings['Country'].unique():
            if year in educ_spendings[educ_spendings.Country == country]['Time'].unique():
                educspendings = educ_spendings[(educ_spendings.Country == country) & (educ_spendings.Time == year)]['educ_spendings'].values[0]
                
        if (protection_of_workers != False) and (st_workers != False) and (in_pt_workers != False) and (ft_and_pt_employment != False) and (marginally_att_workers != False) and (emplo_bargain != False) and (years_schooling != False) and (avgclass_size != False) and (educspendings != False):
            dict = {"Country" : country, "Time" : year, 'protection_of_workers' : protection_of_workers, 'short_time_workers' : st_workers, 'involuntary_pt_workers' : in_pt_workers, 'ft_and_pt_employ' : ft_and_pt_employment, 'marginally_attached_workers' : marginally_att_workers, 'employees_bargain' : emplo_bargain, 'years_schooling' : years_schooling, 'avg_class_size' : avgclass_size, 'educ_spendings' : educspendings}
            #print(dict)
            l.append(dict)
global_data = pd.DataFrame(l)

In [14]:
global_data

Unnamed: 0,Country,Time,protection_of_workers,short_time_workers,involuntary_pt_workers,ft_and_pt_employ,marginally_attached_workers,employees_bargain,years_schooling,avg_class_size,educ_spendings
0,AUS,2016,1.7,0.892538,27.952453,19.342875,5.644458,60.0,12.9,22.158,8795.3633
1,AUS,2014,1.7,0.532556,28.277395,18.917433,5.584873,60.1,12.7,23.785,8107.4346
2,AUT,2013,1.8,0.4797,11.341942,18.91349,3.342615,98.0,11.9,21.022,10486.676
3,AUT,2014,1.8,0.392421,10.955043,19.574962,3.614805,98.0,12.1,20.977,10661.813
4,AUT,2015,1.8,0.454537,11.85259,19.793941,3.678977,98.0,12.1,20.921,11193.469
5,EST,2015,1.934,0.526973,12.138937,6.022114,4.059337,18.6,12.7,18.148,5838.8569
6,FIN,2014,2.518,0.466532,23.873874,7.418025,5.135235,89.3,12.4,19.656,8765.7246
7,FIN,2015,2.518,0.435119,25.849081,7.945387,5.302583,89.3,12.4,19.656,9286.6465
8,FRA,2014,2.812,0.077543,38.833678,12.448917,1.401585,98.5,11.4,25.262,6860.5566
9,DEU,2013,2.332,0.101407,14.437775,20.824191,1.469701,57.6,14.0,24.285,7958.0161


In [15]:
global_data.to_csv('./labour_market.csv', index=False)