<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVISProject-CAValli_Team/blob/main/Preprocessing_story2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and connecting to Google Drive

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the input files read below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Reading and preprocessing data

In [2]:
# List every Excel workbook in the story-2 folder, in alphabetical order, and show the result.
all_files = sorted(glob.glob("/content/drive/MyDrive/DV_project/story2/*.xlsx"))
print(all_files)

['/content/drive/MyDrive/DV_project/story2/early_leavers_by_sex_labour.xlsx', '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_absolute.xlsx', '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_percentage.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk_by_age.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk_by_education.xlsx']


## Dictionaries: country --> abbreviation (and vice versa)

We create 2 dictionaries: one linking each country to its short abbreviation (*country_abbreviations*) and one for the reverse lookup (*country_abbreviations_reversed*).

In [6]:
# Country name (as spelled in the Excel sheets) -> short code used in the exported CSV files.
# The codes are three-letter country codes (they look like ISO 3166-1 alpha-3), plus 'EU'
# for the European Union aggregate.
country_abbreviations = {
    "European Union - 27 countries (from 2020)": "EU",
    "Belgium": "BEL",
    "Bulgaria": "BGR",
    "Czechia": "CZE",
    "Denmark": "DNK",
    "Germany": "DEU",
    "Estonia": "EST",
    "Ireland": "IRL",
    "Greece": "GRC",
    "Spain": "ESP",
    "France": "FRA",
    "Croatia": "HRV",
    "Italy": "ITA",
    "Cyprus": "CYP",
    "Latvia": "LVA",
    "Lithuania": "LTU",
    "Luxembourg": "LUX",
    "Hungary": "HUN",
    "Malta": "MLT",
    "Netherlands": "NLD",
    "Austria": "AUT",
    "Poland": "POL",
    "Portugal": "PRT",
    "Romania": "ROU",
    "Slovenia": "SVN",
    "Slovakia": "SVK",
    "Finland": "FIN",
    "Sweden": "SWE",
    "Iceland": "ISL",
    "Norway": "NOR",
    "Switzerland": "CHE",
    "Montenegro": "MNE",
    "North Macedonia": "MKD",
    "Serbia": "SRB",
    "Türkiye": "TUR",
}

# Reverse lookup: short code -> country name (same insertion order as above).
country_abbreviations_reversed = dict(zip(country_abbreviations.values(), country_abbreviations.keys()))

## Education level: absolute value

We import the 36 sheets of the *pop_edu_sex_age_absolute* Excel file (skipping the rows and columns we do not need and suppressing some harmless warnings) into a dictionary of datasets named *education_abs_sheets*.

In [None]:
years = range(2009, 2023)
sheet_names = [f'Sheet {i}' for i in range(1, 37)]
file_path = '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_absolute.xlsx'
# Row positions dropped while parsing each sheet; row 10 is kept and supplies the header (TIME, 2009, ...).
rows_to_skip = list(range(0, 10)) + [11, 12, 13, 44] + list(range(49, 58))
# Only the TIME column (renamed to Country later) and one column per year are loaded.
selected_columns = ['TIME'] + [str(year) for year in years]

# Just to suppress the harmless openpyxl warnings we were getting
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category = UserWarning, module = "openpyxl")
    # Passing the list of sheet names makes read_excel return {sheet name: DataFrame}
    # from a single call, so the workbook is opened once instead of once per sheet.
    # Dictionary with datasets: 'education_abs_sheet1', ..., 'education_abs_sheet36'
    education_abs_sheets = {
        f'education_abs_sheet{i}': sheet
        for i, sheet in enumerate(
            pd.read_excel(file_path, sheet_name = sheet_names, skiprows = rows_to_skip, na_values = ':', usecols = selected_columns).values(),
            start = 1)
    }

We add a column *Abbreviation* to the 36 datasets.

In [None]:
# For every absolute-value sheet: rename TIME to Country, then derive the
# Abbreviation column by looking each country name up in country_abbreviations.
for sheet_number in range(1, 37):
    sheet = education_abs_sheets[f'education_abs_sheet{sheet_number}']
    sheet.rename(columns = {'TIME': 'Country'}, inplace = True)
    sheet['Abbreviation'] = sheet['Country'].map(country_abbreviations)

We sum the data for people aged 15-24 and the data for people aged 25-29, so that the figures are consistent with what we used in story 1 (people aged between 15 and 29).

We create 3 different datasets, one for each educational level (low, medium and high).

In [None]:
# Pick the two sheets that get summed for each education level (total population).
# NOTE(review): presumably the "1" sheets are the 15-24 age band and the "2" sheets
# the 25-29 band, as described in the text above - confirm against the workbook.
low1, low2 = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (2, 8))

medium1, medium2 = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (3, 9))

high1, high2 = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (6, 12))

In [None]:
# low = low1 + low2, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed and a legitimately large total is never
# blanked out. The inputs are indexed on copies, so low1 and low2 are left untouched
# and the cell can be re-run safely.
low = (
    low1.set_index(['Country', 'Abbreviation'])
    .add(low2.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# medium = medium1 + medium2, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed and a legitimately large total is never
# blanked out. The inputs are indexed on copies, so medium1 and medium2 are left
# untouched and the cell can be re-run safely.
medium = (
    medium1.set_index(['Country', 'Abbreviation'])
    .add(medium2.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# high = high1 + high2, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed and a legitimately large total is never
# blanked out. The inputs are indexed on copies, so high1 and high2 are left untouched
# and the cell can be re-run safely.
high = (
    high1.set_index(['Country', 'Abbreviation'])
    .add(high2.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

### Stacked barcharts

We create one dataset for each year to use for the stacked barchart (percentage).

In [None]:
def create_stacked_dataset(year):
    """Build the per-country low/medium/high table for a single year.

    Reads the notebook-level frames `low`, `medium` and `high`.

    Parameters
    ----------
    year : str
        Name of the year column to extract from each of the three frames.

    Returns
    -------
    pandas.DataFrame
        Columns Country, Abbreviation, low, medium, high. Countries with a
        missing value in any of the three level columns are dropped.
    """
    keys = ['Country', 'Abbreviation']

    low_year = low[keys + [year]].rename(columns = {year: 'low'})
    medium_year = medium[keys + [year]].rename(columns = {year: 'medium'})
    high_year = high[keys + [year]].rename(columns = {year: 'high'})

    # Inner joins on the country keys put the three levels side by side.
    stacked = low_year.merge(medium_year, on = keys).merge(high_year, on = keys)

    return stacked.dropna(subset = ['low', 'medium', 'high'])

# For every year, export the same low/medium/high table in three orderings:
#   stacked{year}_alphabetic.csv - sorted by country name
#   stacked{year}_low.csv        - sorted by the absolute 'low' value, descending
#   stacked{year}.csv            - sorted by the share of 'low' in the total, descending
# The table is built once per year and reused for all three exports.
for year in range(2009, 2023):
    stacked_year = create_stacked_dataset(str(year))

    stacked_year.sort_values(by = 'Country').to_csv(f"stacked{year}_alphabetic.csv", index = False, na_rep = 'nan')

    stacked_year.sort_values(by = 'low', ascending = False).to_csv(f"stacked{year}_low.csv", index = False, na_rep = 'nan')

    # low_percentage is only a sort key: it is computed on the fly and dropped before export.
    (
        stacked_year
        .assign(low_percentage = lambda frame: (frame['low'] / frame[['low', 'medium', 'high']].sum(axis = 1)) * 100)
        .sort_values(by = 'low_percentage', ascending = False)
        .drop(columns = 'low_percentage')
        .to_csv(f"stacked{year}.csv", index = False, na_rep = 'nan')
    )

### Donut charts

We do the same for males.

In [None]:
# Pick the two sheets that get summed for each education level (males).
# NOTE(review): presumably the "1" sheets are the 15-24 age band and the "2" sheets
# the 25-29 band - confirm against the workbook.
low1_M, low2_M = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (14, 20))

medium1_M, medium2_M = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (15, 21))

high1_M, high2_M = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (18, 24))

In [None]:
# low_M = low1_M + low2_M, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# low1_M and low2_M are left untouched and the cell can be re-run safely.
low_M = (
    low1_M.set_index(['Country', 'Abbreviation'])
    .add(low2_M.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# medium_M = medium1_M + medium2_M, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# medium1_M and medium2_M are left untouched and the cell can be re-run safely.
medium_M = (
    medium1_M.set_index(['Country', 'Abbreviation'])
    .add(medium2_M.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# high_M = high1_M + high2_M, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# high1_M and high2_M are left untouched and the cell can be re-run safely.
high_M = (
    high1_M.set_index(['Country', 'Abbreviation'])
    .add(high2_M.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

We do the same for females.

In [None]:
# Pick the two sheets that get summed for each education level (females).
# NOTE(review): presumably the "1" sheets are the 15-24 age band and the "2" sheets
# the 25-29 band - confirm against the workbook.
low1_F, low2_F = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (26, 32))

medium1_F, medium2_F = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (27, 33))

high1_F, high2_F = (education_abs_sheets[f'education_abs_sheet{n}'] for n in (30, 36))

In [None]:
# low_F = low1_F + low2_F, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# low1_F and low2_F are left untouched and the cell can be re-run safely.
low_F = (
    low1_F.set_index(['Country', 'Abbreviation'])
    .add(low2_F.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# medium_F = medium1_F + medium2_F, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# medium1_F and medium2_F are left untouched and the cell can be re-run safely.
medium_F = (
    medium1_F.set_index(['Country', 'Abbreviation'])
    .add(medium2_F.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

In [None]:
# high_F = high1_F + high2_F, matched on (Country, Abbreviation) and rounded to one decimal.
# A missing value in either input leaves the total missing (NaN propagates through the
# addition), so no sentinel value is needed. The inputs are indexed on copies, so
# high1_F and high2_F are left untouched and the cell can be re-run safely.
high_F = (
    high1_F.set_index(['Country', 'Abbreviation'])
    .add(high2_F.set_index(['Country', 'Abbreviation']))
    .round(1)
    .reset_index()
)

We extract info on differences between M and F.

In [None]:
levels = ['low', 'medium', 'high']
genders = ['M', 'F']

# Per education level: M + F, M - F and (M - F) / (M + F) * 100, keyed
# '<level>_sum', '<level>_difference' and '<level>_percentage' respectively.
sum_datasets = {}
difference_datasets = {}
percentage_datasets = {}

# Explicit (male, female) pairing per level; avoids resolving variable names through locals().
frames_by_level = {
    'low': (low_M, low_F),
    'medium': (medium_M, medium_F),
    'high': (high_M, high_F),
}

# Each level is processed exactly once (the result does not depend on gender).
for level in levels:
    male_df, female_df = frames_by_level[level]

    # The first two columns are Country and Abbreviation; the yearly values start at position 2.
    male_values = male_df.iloc[:, 2:]
    female_values = female_df.iloc[:, 2:]

    sum_df = male_df.copy()
    sum_df.iloc[:, 2:] = male_values + female_values
    sum_datasets[f"{level}_sum"] = sum_df

    difference_df = male_df.copy()
    difference_df.iloc[:, 2:] = male_values - female_values
    difference_datasets[f"{level}_difference"] = difference_df

    # Positive percentage: more males than females; negative: more females than males.
    percentage_df = male_df.copy()
    percentage_df.iloc[:, 2:] = (difference_df.iloc[:, 2:] / sum_df.iloc[:, 2:]) * 100
    percentage_datasets[f"{level}_percentage"] = percentage_df

For each year (from 2009 to 2022) we retrieve the top 3 countries with the highest percentage difference between M and F (for low, medium and high education level), ranking by the ABSOLUTE VALUE of this percentage (i.e. ignoring its sign).

In [None]:
def ordinal(n):
    """Return `n` followed by its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th'.

    Parameters
    ----------
    n : int
        Number to format.

    Returns
    -------
    str
        The number with 'st', 'nd', 'rd' or 'th' appended.
    """
    # 11, 12 and 13 take 'th' in every hundred (11th, 112th, 1013th), so test the
    # last two digits rather than the whole number.
    if 11 <= n % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f"{n}{suffix}"

def convert_to_numeric(df, column):
    """Convert one column of `df` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN. The frame passed in is
    modified and also returned, so calls can be chained or reassigned.
    """
    numeric_values = pd.to_numeric(df[column], errors = 'coerce')
    df[column] = numeric_values
    return df

# Absolute M and F values per education level; used to fill the donut CSV rows.
level_datasets = {
    'low': {'M': low_M, 'F': low_F},
    'medium': {'M': medium_M, 'F': medium_F},
    'high': {'M': high_M, 'F': high_F}
}

# Reshape each percentage table to long format (one row per country and year) once,
# outside the year loop, since the reshaped table does not depend on the year.
# Abbreviation is kept as an identifier column so it is not melted as if it were a year.
melted_by_level = {}
for level in levels:
    melted_df = pd.melt(percentage_datasets[f"{level}_percentage"], id_vars = ['Country', 'Abbreviation'], var_name = 'Year', value_name = f"{level}_difference")

    # Convert 'Year' and difference columns to numeric
    melted_df = convert_to_numeric(melted_df, 'Year')
    melted_by_level[level] = convert_to_numeric(melted_df, f"{level}_difference")

for year in range(2009, 2023):
    print(f"\nIn the year {year}:")
    donut_data = []
    for level in levels:
        melted_df = melted_by_level[level]
        male_df = level_datasets[level]['M']
        female_df = level_datasets[level]['F']

        # Get the top 3 countries with the highest difference in absolute percentage
        top_countries = melted_df[melted_df['Year'] == year].sort_values(by = f"{level}_difference", key = lambda x: x.abs(), ascending = False).head(3)[['Country', f"{level}_difference"]]

        for i, (country, percentage) in enumerate(top_countries.itertuples(index = False, name = None), start = 1):
            print(f"The {ordinal(i)} country with the highest difference in percentage between M and F for {level} education is {country} with a percentage of {percentage:.2f}%")
            # Look up the absolute male/female values of this country for the current year.
            valueM = male_df.loc[male_df['Country'] == country, str(year)].values[0]
            valueF = female_df.loc[female_df['Country'] == country, str(year)].values[0]
            donut_data.append({'name': country, 'abbreviation': country_abbreviations.get(country),'level': level, 'rank': i, 'valueM': valueM, 'valueF': valueF})

    # One CSV per year, with up to 3 rows for each education level.
    donut_df = pd.DataFrame(donut_data)
    donut_df.to_csv(f"donut{year}.csv", index = False)


In the year 2009:
The 1st country with the highest difference in percentage between M and F for low education is Spain with a percentage of 14.90%
The 2nd country with the highest difference in percentage between M and F for low education is Greece with a percentage of 12.12%
The 3rd country with the highest difference in percentage between M and F for low education is Malta with a percentage of 11.71%
The 1st country with the highest difference in percentage between M and F for medium education is Bulgaria with a percentage of 12.53%
The 2nd country with the highest difference in percentage between M and F for medium education is Türkiye with a percentage of 11.79%
The 3rd country with the highest difference in percentage between M and F for medium education is North Macedonia with a percentage of 11.33%
The 1st country with the highest difference in percentage between M and F for high education is Croatia with a percentage of -33.95%
The 2nd country with the highest difference in pe

For each year (from 2009 to 2022) we retrieve the top 3 countries with the highest percentage difference between M and F (for low, medium and high education level), ranking by the SIGNED value of this percentage (i.e. NOT taking the absolute value).

In [None]:
# Reshape each percentage table to long format (one row per country and year) once,
# outside the year loop, since the reshaped table does not depend on the year.
# Abbreviation is kept as an identifier column so it is not melted as if it were a year.
melted_by_level = {}
for level in levels:
    melted_df = pd.melt(percentage_datasets[f"{level}_percentage"], id_vars = ['Country', 'Abbreviation'], var_name = 'Year', value_name = f"{level}_difference")

    # Convert 'Year' and difference columns to numeric
    melted_df = convert_to_numeric(melted_df, 'Year')
    melted_by_level[level] = convert_to_numeric(melted_df, f"{level}_difference")

for year in range(2009, 2023):
    print(f"\nIn the year {year}:")

    for level in levels:
        melted_df = melted_by_level[level]

        # Get the top 3 countries with the highest (signed) difference in percentage
        top_countries = melted_df[melted_df['Year'] == year].nlargest(3, f"{level}_difference")[['Country', f"{level}_difference"]]

        # Print information for each country
        for i, (country, percentage) in enumerate(top_countries.itertuples(index = False, name = None), start = 1):
            print(f"The {ordinal(i)} country with the highest difference in percentage between M and F for {level} education is {country} with a percentage of {percentage:.2f}%")


In the year 2009:
The 1st country with the highest difference in percentage between M and F for low education is Spain with a percentage of 14.90%
The 2nd country with the highest difference in percentage between M and F for low education is Greece with a percentage of 12.12%
The 3rd country with the highest difference in percentage between M and F for low education is Malta with a percentage of 11.71%
The 1st country with the highest difference in percentage between M and F for medium education is Bulgaria with a percentage of 12.53%
The 2nd country with the highest difference in percentage between M and F for medium education is Türkiye with a percentage of 11.79%
The 3rd country with the highest difference in percentage between M and F for medium education is North Macedonia with a percentage of 11.33%
The 1st country with the highest difference in percentage between M and F for high education is Switzerland with a percentage of 2.22%
The 2nd country with the highest difference in 

## Education level: percentage --> we are NOT using this dataset for now!

We import the 18 sheets of the *pop_edu_sex_age_percentage* Excel file (skipping the rows and columns we do not need and suppressing some harmless warnings) into a dictionary of datasets named *education_per_sheets*.

In [None]:
years = range(2009, 2023)
sheet_names = [f'Sheet {i}' for i in range(1, 19)]
file_path = '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_percentage.xlsx'
# Row positions dropped while parsing each sheet; row 10 is kept and supplies the header (TIME, 2009, ...).
rows_to_skip = list(range(0, 10)) + [11, 12, 13, 44] + list(range(49, 56))
# Only the TIME column (renamed to Country later) and one column per year are loaded.
selected_columns = ['TIME'] + [str(year) for year in years]

# Just to suppress the harmless openpyxl warnings we were getting
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category = UserWarning, module = "openpyxl")
    # Passing the list of sheet names makes read_excel return {sheet name: DataFrame}
    # from a single call, so the workbook is opened once instead of once per sheet.
    # Dictionary with datasets: 'education_per_sheet1', ..., 'education_per_sheet18'
    education_per_sheets = {
        f'education_per_sheet{i}': sheet
        for i, sheet in enumerate(
            pd.read_excel(file_path, sheet_name = sheet_names, skiprows = rows_to_skip, na_values = ':', usecols = selected_columns).values(),
            start = 1)
    }

We add a column *Abbreviation* to the 18 datasets.

In [None]:
# For every percentage sheet: rename TIME to Country, then derive the
# Abbreviation column by looking each country name up in country_abbreviations.
for sheet_number in range(1, 19):
    sheet = education_per_sheets[f'education_per_sheet{sheet_number}']
    sheet.rename(columns = {'TIME': 'Country'}, inplace = True)
    sheet['Abbreviation'] = sheet['Country'].map(country_abbreviations)

## Enrolment by age --> from 15 to 29, in years 2013 to 2021

In [175]:
# Raw enrolment-by-age table, loaded from the CSV stored in the story-2 Drive folder.
enrolment = pd.read_csv(
    filepath_or_buffer = "/content/drive/MyDrive/DV_project/story2/enrolment_by_age.csv",
)

In [176]:
# Collect and print the distinct values of every column of the raw enrolment table.
distinct_values_dict = {}
for col in enrolment.columns:
    values = enrolment[col].unique()
    distinct_values_dict[col] = values
    print(f"\nDistinct values for column '{col}': {values}")


Distinct values for column 'COUNTRY': ['AUT' 'BEL' 'CZE' 'DNK' 'FIN' 'FRA' 'DEU' 'GRC' 'HUN' 'ISL' 'IRL' 'ITA'
 'LUX' 'NLD' 'NOR' 'POL' 'PRT' 'SVK' 'ESP' 'SWE' 'CHE' 'TUR' 'EST' 'LVA'
 'LTU' 'SVN' 'BGR' 'HRV' 'ROU']

Distinct values for column 'Country': ['Austria' 'Belgium' 'Czechia' 'Denmark' 'Finland' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Ireland' 'Italy' 'Luxembourg' 'Netherlands'
 'Norway' 'Poland' 'Portugal' 'Slovak Republic' 'Spain' 'Sweden'
 'Switzerland' 'Türkiye' 'Estonia' 'Latvia' 'Lithuania' 'Slovenia'
 'Bulgaria' 'Croatia' 'Romania']

Distinct values for column 'SEX': ['F' 'M' '_T']

Distinct values for column 'Gender': ['Female' 'Male' 'Total']

Distinct values for column 'AGE': ['Y15' 'Y16' 'Y17' 'Y18' 'Y19' 'Y20' 'Y21' 'Y22' 'Y23' 'Y24' 'Y25' 'Y26'
 'Y27' 'Y28' 'Y29' '_T']

Distinct values for column 'Age': ['15 years' '16 years' '17 years' '18 years' '19 years' '20 years'
 '21 years' '22 years' '23 years' '24 years' '25 years' '26 years'
 '27 years' '28 ye

In [177]:
# Keep only the rows that:
#   - are not flagged 'm' or 'o' in 'Flag Codes',
#   - have a non-missing, non-zero 'Value',
#   - refer to the '_T' (total) category for both INTENSITY and SEX.
filtered_enrolment = enrolment.loc[
    ~enrolment['Flag Codes'].isin(['m', 'o'])
    & enrolment['Value'].notna()
    & enrolment['Value'].ne(0)
    & enrolment['INTENSITY'].eq('_T')
    & enrolment['SEX'].eq('_T')
]

In [178]:
# Final enrolment table, built as a new frame (filtered_enrolment is not modified):
#   1. drop the columns that are no longer needed after filtering,
#   2. expose the COUNTRY code column under the name 'Abbreviation',
#   3. round 'Value' to whole numbers stored as int,
#   4. overwrite 'Country' with the name our own dictionary gives for each code,
#      so the spelling matches the other datasets.
enrolment_final = (
    filtered_enrolment
    .drop(columns = ['SEX', 'Gender', 'AGE', 'INTENSITY', 'Intensity', 'YEAR', 'Flag Codes', 'Flags'])
    .rename(columns = {'COUNTRY': 'Abbreviation'})
    .assign(
        Value = lambda frame: frame['Value'].round(0).astype(int),
        Country = lambda frame: frame['Abbreviation'].map(country_abbreviations_reversed),
    )
)

In [179]:
# Collect and print the distinct values of every column of the cleaned enrolment table.
filtered_distinct_values_dict = {}
for col in enrolment_final.columns:
    values = enrolment_final[col].unique()
    filtered_distinct_values_dict[col] = values
    print(f"\nDistinct values for column '{col}': {values}")


Distinct values for column 'Abbreviation': ['AUT' 'BEL' 'CZE' 'DNK' 'FIN' 'FRA' 'DEU' 'GRC' 'HUN' 'ISL' 'IRL' 'ITA'
 'LUX' 'NLD' 'NOR' 'POL' 'PRT' 'SVK' 'ESP' 'SWE' 'CHE' 'TUR' 'EST' 'LVA'
 'LTU' 'SVN' 'BGR' 'HRV' 'ROU']

Distinct values for column 'Country': ['Austria' 'Belgium' 'Czechia' 'Denmark' 'Finland' 'France' 'Germany'
 'Greece' 'Hungary' 'Iceland' 'Ireland' 'Italy' 'Luxembourg' 'Netherlands'
 'Norway' 'Poland' 'Portugal' 'Slovakia' 'Spain' 'Sweden' 'Switzerland'
 'Türkiye' 'Estonia' 'Latvia' 'Lithuania' 'Slovenia' 'Bulgaria' 'Croatia'
 'Romania']

Distinct values for column 'Age': ['15 years' '16 years' '17 years' '18 years' '19 years' '20 years'
 '21 years' '22 years' '23 years' '24 years' '25 years' '26 years'
 '27 years' '28 years' '29 years' 'Total']

Distinct values for column 'EDUCATION_LEV': ['ISCED11_1' 'ISCED11_2' 'ISCED11_24' 'ISCED11_3' 'ISCED11_34'
 'ISCED11_35' 'ISCED11_5' 'ISCED11_55' 'ISCED11_7' 'ISCED11_5T8'
 'ISCED11_T' 'ISCED11_25' 'ISCED11_4' 'ISCED11_44' 

In [180]:
# Map each education-level code to its label. The (code, label) pairs are taken
# row by row, so a code is always paired with a label that appears next to it in
# the data, even if two codes share a label or the codes and labels first appear
# in different orders.
education_level_dict = dict(
    enrolment_final[['EDUCATION_LEV', 'Education level']]
    .drop_duplicates()
    .itertuples(index = False, name = None)
)
# Same mapping, ordered by code, displayed as the cell output.
sorted_education_level_dict = dict(sorted(education_level_dict.items()))
sorted_education_level_dict

{'ISCED11_0': 'Early childhood education',
 'ISCED11_01': 'Early childhood educational development',
 'ISCED11_02': 'Pre-primary education',
 'ISCED11_1': 'Primary education',
 'ISCED11_2': 'Lower secondary education',
 'ISCED11_24': 'Lower secondary general education',
 'ISCED11_25': 'Lower secondary vocational education',
 'ISCED11_3': 'Upper secondary education',
 'ISCED11_34': 'Upper secondary general education',
 'ISCED11_35': 'Upper secondary vocational education',
 'ISCED11_4': 'Post-secondary non-tertiary education',
 'ISCED11_44': 'Post-secondary non-tertiary general education',
 'ISCED11_45': 'Post-secondary non-tertiary vocational education',
 'ISCED11_5': 'Short-cycle tertiary education',
 'ISCED11_54': 'Short-cycle tertiary general education',
 'ISCED11_55': 'Short-cycle tertiary vocational education',
 'ISCED11_5T8': 'Tertiary education',
 'ISCED11_6': 'Bachelor’s or equivalent level',
 'ISCED11_7': 'Master’s or equivalent level',
 'ISCED11_8': 'Doctoral or equivalent lev

In [181]:
years = range(2013, 2022)
# For each year two entries are stored:
#   'enrolment{year}_unfiltered' - every row of that year,
#   'enrolment{year}'            - the same rows without the 'Total' education level and 'Total' age.
enrolment_dict = {}

for year in years:
    current_year_df = enrolment_final.loc[enrolment_final['Year'].eq(year)].copy()
    enrolment_dict[f'enrolment{year}_unfiltered'] = current_year_df

    is_detail_row = current_year_df['Education level'].ne('Total') & current_year_df['Age'].ne('Total')
    current_year_df = current_year_df[is_detail_row]
    enrolment_dict[f'enrolment{year}'] = current_year_df

In [182]:
years = range(2013, 2022)
# enrolment_country[country] holds, for each year:
#   'enrolment{year}_unfiltered' - every row of that country and year,
#   'enrolment{year}'            - the same rows without the 'Total' education level and 'Total' age.
enrolment_country = {}

for country in enrolment_final['Country'].unique():
    per_year = {}

    for year in years:
        in_scope = enrolment_final['Country'].eq(country) & enrolment_final['Year'].eq(year)
        current_data = enrolment_final.loc[in_scope].copy()
        per_year[f'enrolment{year}_unfiltered'] = current_data.copy()

        is_detail_row = current_data['Education level'].ne('Total') & current_data['Age'].ne('Total')
        current_data = current_data[is_detail_row]
        per_year[f'enrolment{year}'] = current_data.copy()

    enrolment_country[country] = per_year

### Streamgraphs

In [172]:
# Years covered by enrolment_country; set here so the cell does not depend on
# whichever cell last assigned `years`.
years = range(2013, 2022)

# streamgraph_dataframes[country]['enrolment{year}'] -> pivoted table for that country and year.
# Keying by country as well as year keeps every country's tables (a year-only key would
# make each country overwrite the previous one).
streamgraph_dataframes = {}

for country in enrolment_final['Country'].unique():
    streamgraph_dataframes[country] = {}

    for year in years:
        df = enrolment_country[country][f'enrolment{year}']

        # One row per age, one column per education-level code; missing combinations become 0.
        # pivot_table aggregates with its default function (mean) if a combination occurs more than once.
        pivoted_df = df.pivot_table(index = 'Age', columns = 'EDUCATION_LEV', values = 'Value', fill_value = 0)
        pivoted_df = pivoted_df.reset_index() # make 'Age' a regular column again
        streamgraph_dataframes[country][f'enrolment{year}'] = pivoted_df

        export_filename = f'streamgraph_{country_abbreviations.get(country)}_{year}.csv'
        pivoted_df.to_csv(export_filename, index = False)

## Early leavers: 18 to 24 and percentage value

We import the 15 sheets of the *early_leavers_by_sex_labour* Excel file (skipping the rows and columns we do not need and suppressing some harmless warnings) into a dictionary of datasets named *early_leavers_sheets*.

In [7]:
years = range(2009, 2023)
sheet_names = [f'Sheet {i}' for i in range(1, 16)]
file_path = '/content/drive/MyDrive/DV_project/story2/early_leavers_by_sex_labour.xlsx'
# Row positions dropped while parsing each sheet; row 10 is kept and supplies the header (TIME, 2009, ...).
rows_to_skip = list(range(0, 10)) + [11, 13, 44] + list(range(49, 58))
# Only the TIME column (renamed to Country later) and one column per year are loaded.
selected_columns = ['TIME'] + [str(year) for year in years]

# Just to suppress the harmless openpyxl warnings we were getting
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category = UserWarning, module = "openpyxl")
    # Passing the list of sheet names makes read_excel return {sheet name: DataFrame}
    # from a single call, so the workbook is opened once instead of once per sheet.
    # Dictionary with datasets: 'early_leavers_sheet1', ..., 'early_leavers_sheet15'
    early_leavers_sheets = {
        f'early_leavers_sheet{i}': sheet
        for i, sheet in enumerate(
            pd.read_excel(file_path, sheet_name = sheet_names, skiprows = rows_to_skip, na_values = ':', usecols = selected_columns).values(),
            start = 1)
    }

We add a column *Abbreviation* to the 15 datasets.

In [8]:
# For every early-leavers sheet: rename TIME to Country, then derive the
# Abbreviation column by looking each country name up in country_abbreviations.
for sheet_number in range(1, 16):
    sheet = early_leavers_sheets[f'early_leavers_sheet{sheet_number}']
    sheet.rename(columns = {'TIME': 'Country'}, inplace = True)
    sheet['Abbreviation'] = sheet['Country'].map(country_abbreviations)

### Lollipop charts

In [47]:
# One table per year, taken from sheet 1: Country, that year's value and Abbreviation,
# without the countries that have no value for the year, sorted from highest to lowest.
early_leavers_datasets = {}

for year in years:
    year_column = str(year)
    early_leavers_datasets[f'early_leavers_{year}'] = (
        early_leavers_sheets['early_leavers_sheet1'][['Country', year_column, 'Abbreviation']]
        .dropna(subset = [year_column])
        .sort_values(by = [year_column], ascending = [False])
    )

In [48]:
# Column names written to the CSV files, in place of Country / <year> / Abbreviation.
header = ['name', 'abundance', 'abbreviation']

for year in years:
    ranked = early_leavers_datasets[f'early_leavers_{year}']

    # Plain ranking: highest value first.
    ranked.to_csv(f'lollipop{year}.csv', index = False, header = header)

    # Same data with the EU row forced to the top; the remaining rows are ordered by
    # value (descending) and then by country name.
    eu_first = (
        ranked
        .assign(sorting_key = ranked['Abbreviation'] == 'EU')
        .sort_values(by = ['sorting_key', str(year), 'Country'], ascending = [False, False, True])
        .drop(columns = 'sorting_key')
        .reset_index(drop = True)
    )
    eu_first.to_csv(f'lollipop{year}_EU.csv', index = False, header = header)

### Pyramid/Waffle charts --> EU, ITA, TUR

In [44]:
# Yearly columns averaged for the recap, selected by name so the result does not
# depend on where the columns happen to sit in each sheet.
pyramid_year_columns = [str(year) for year in range(2009, 2023)]

def _mean_over_years(sheet_number, label):
    """Add column `label` (mean of the yearly columns, rounded to 1 decimal) to the given
    early-leavers sheet and return its Country / Abbreviation / `label` columns."""
    sheet = early_leavers_sheets[f'early_leavers_sheet{sheet_number}']
    sheet[label] = round(sheet[pyramid_year_columns].mean(axis = 1), 1)
    return sheet[['Country', 'Abbreviation', label]]

total_df = _mean_over_years(1, 'Total')

employed_df = _mean_over_years(2, 'Employed')

unemployed_df = _mean_over_years(3, 'Unemployed')

want_to_work_df = _mean_over_years(4, 'Want to work')

do_not_want_to_work_df = _mean_over_years(5, 'Do not want to work')

In [45]:
# Join the five per-country averages side by side; the inner joins keep only the
# countries that are present in every table.
pyramid_df = pd.merge(
    pd.merge(
        pd.merge(
            pd.merge(total_df, employed_df, how = 'inner', on = ['Country', 'Abbreviation']),
            unemployed_df, how = 'inner', on = ['Country', 'Abbreviation'],
        ),
        want_to_work_df, how = 'inner', on = ['Country', 'Abbreviation'],
    ),
    do_not_want_to_work_df, how = 'inner', on = ['Country', 'Abbreviation'],
)

In [46]:
# Recap limited to the three entities shown in the pyramid/waffle charts.
pyramid_recap_df = pyramid_df.loc[pyramid_df['Abbreviation'].isin(['ITA', 'TUR', 'EU'])]
pyramid_recap_df

Unnamed: 0,Country,Abbreviation,Total,Employed,Unemployed,Want to work,Do not want to work
0,European Union - 27 countries (from 2020),EU,11.3,4.9,6.4,4.3,2.2
12,Italy,ITA,15.2,5.5,9.6,7.3,2.4
34,Türkiye,TUR,36.2,16.0,20.1,5.2,14.9


# Download all CSV files

We download all the CSV files we created.

In [173]:
# Bundle every CSV found directly under /content into one zip archive and
# hand it to the browser for download.
directory_path = '/content'
file_type = ".csv"
files_to_download = glob.glob(directory_path + "/*" + file_type)
zip_filename = "all_streamgraphs.zip"
with zipfile.ZipFile(zip_filename, mode = "w") as archive:
    for csv_path in files_to_download:
        # Store each file under its bare name, without the directory part.
        archive.write(csv_path, arcname = os.path.split(csv_path)[1])
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We remove all the CSV files we downloaded.

In [174]:
# Delete the CSV files that were just packed into the archive.
for csv_path in files_to_download:
    os.remove(csv_path)