<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVISProject-CAValli_Team/blob/main/Preprocessing_story2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and connecting to Google Drive

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Reading the data

In [6]:
# Collect every Excel workbook of story 2, in alphabetical order, and list them.
all_files = sorted(glob.glob("/content/drive/MyDrive/DV_project/story2/*.xlsx"))
print(all_files)

['/content/drive/MyDrive/DV_project/story2/early_leavers_by_sex_labour.xlsx', '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_absolute.xlsx', '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_percentage.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk_by_age.xlsx', '/content/drive/MyDrive/DV_project/story2/poverty_exclusion_risk_by_education.xlsx']


## Education level: absolute value

We import the 36 sheets of the *pop_edu_sex_age_absolute* Excel file into a dictionary of datasets named *education_abs_sheets*, skipping the rows and columns we do not need and suppressing the harmless warnings raised while reading.

In [7]:
import warnings

# Workbook location and the names of its worksheets: 'Sheet 1', ..., 'Sheet 36'
file_path = '/content/drive/MyDrive/DV_project/story2/pop_edu_sex_age_absolute.xlsx'
sheet_names = ['Sheet ' + str(number) for number in range(1, 37)]

# 0-based rows dropped while reading: rows 0-9, 11-13, 44 and 49-55
# (row 10 is kept -- presumably the header row holding 'TIME' and the years; confirm against the workbook)
rows_to_skip = [*range(10), 11, 12, 13, 44, *range(49, 56)]

# Only the 'TIME' column and the yearly columns '2009', ..., '2022' are loaded
selected_columns = ['TIME', *map(str, range(2009, 2023))]

# openpyxl emits UserWarnings while parsing this workbook; they are silenced only inside this block
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category = UserWarning, module = "openpyxl")
    # One dataset per worksheet, keyed 'education_abs_sheet1', ..., 'education_abs_sheet36'; ':' is read as NaN
    education_abs_sheets = {
        f'education_abs_sheet{sheet_number}': pd.read_excel(
            file_path,
            sheet_name = worksheet,
            skiprows = rows_to_skip,
            na_values = ':',
            usecols = selected_columns,
        )
        for sheet_number, worksheet in enumerate(sheet_names, start = 1)
    }

We create two dictionaries: one linking each country to its three-letter abbreviation (*country_abbreviations*) and one for the reverse lookup (*country_abbreviations_reversed*).

In [None]:
# Country name -> three-letter code, listed in the same order as before.
country_abbreviations = {
    'Belgium': 'BEL', 'Bulgaria': 'BGR', 'Czechia': 'CZE', 'Denmark': 'DNK',
    'Germany': 'DEU', 'Estonia': 'EST', 'Ireland': 'IRL', 'Greece': 'GRC',
    'Spain': 'ESP', 'France': 'FRA', 'Croatia': 'HRV', 'Italy': 'ITA',
    'Cyprus': 'CYP', 'Latvia': 'LVA', 'Lithuania': 'LTU', 'Luxembourg': 'LUX',
    'Hungary': 'HUN', 'Malta': 'MLT', 'Netherlands': 'NLD', 'Austria': 'AUT',
    'Poland': 'POL', 'Portugal': 'PRT', 'Romania': 'ROU', 'Slovenia': 'SVN',
    'Slovakia': 'SVK', 'Finland': 'FIN', 'Sweden': 'SWE',
    'Iceland': 'ISL', 'Norway': 'NOR', 'Switzerland': 'CHE',
    'Montenegro': 'MNE', 'North Macedonia': 'MKD', 'Serbia': 'SRB', 'Türkiye': 'TUR',
}

# Reverse lookup: three-letter code -> country name.
country_abbreviations_reversed = {
    abbreviation: country
    for country, abbreviation in country_abbreviations.items()
}

We rename the *TIME* column to *Country* and add an *Abbreviation* column to each dataset.

In [None]:
# Rename 'TIME' to 'Country' and add the three-letter 'Abbreviation' column on
# every sheet held in `education_abs_sheets`.
#
# The previous version of this cell looped over `neet_sheets` (12 sheets), a
# dictionary that is not defined anywhere in this notebook, so it raised a
# NameError on a fresh kernel. Iterating over the dictionary itself also avoids
# hard-coding the number of sheets.
#
# Country names that are not keys of `country_abbreviations` get NaN in 'Abbreviation'.
# New frames are built (no `inplace`), so re-running the cell gives the same result.
education_abs_sheets = {
    sheet_key: (
        sheet
        .rename(columns = {'TIME': 'Country'})
        .assign(Abbreviation = lambda frame: frame['Country'].map(country_abbreviations))
    )
    for sheet_key, sheet in education_abs_sheets.items()
}

## Education level: percentage

## Enrolment by age

In [3]:
# Load the enrolment-by-age table from the story 2 folder on Drive.
enrolment_csv_path = "/content/drive/MyDrive/DV_project/story2/enrolment_by_age.csv"
enrolment = pd.read_csv(enrolment_csv_path)

In [4]:
# Display the enrolment table; the rendered output is truncated to the first and last rows.
enrolment

Unnamed: 0,COUNTRY,Country,SEX,Gender,AGE,Age,INTENSITY,Intensity,EDUCATION_LEV,Education level,YEAR,Year,Value,Flag Codes,Flags
0,AUT,Austria,F,Female,Y15,15 years,PT,Part-time,ISCED11_0,Early childhood education,2013,2013,,m,Missing value; data cannot exist
1,AUT,Austria,F,Female,Y15,15 years,PT,Part-time,ISCED11_0,Early childhood education,2014,2014,,m,Missing value; data cannot exist
2,AUT,Austria,F,Female,Y15,15 years,PT,Part-time,ISCED11_0,Early childhood education,2015,2015,,m,Missing value; data cannot exist
3,AUT,Austria,F,Female,Y15,15 years,PT,Part-time,ISCED11_0,Early childhood education,2016,2016,,m,Missing value; data cannot exist
4,AUT,Austria,F,Female,Y15,15 years,PT,Part-time,ISCED11_0,Early childhood education,2017,2017,,m,Missing value; data cannot exist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524155,BGR,Bulgaria,M,Male,_T,Total,PT,Part-time,ISCED11_T,Total,2017,2017,55553.0,,
524156,BGR,Bulgaria,M,Male,_T,Total,PT,Part-time,ISCED11_T,Total,2018,2018,57230.0,,
524157,BGR,Bulgaria,M,Male,_T,Total,PT,Part-time,ISCED11_T,Total,2019,2019,48242.0,,
524158,BGR,Bulgaria,M,Male,_T,Total,PT,Part-time,ISCED11_T,Total,2020,2020,42941.0,,


# Download all CSV files

We download all the CSV files we created.

In [None]:
# Bundle every CSV file located directly in /content into a single zip archive,
# then trigger its download through `files.download`.
directory_path = '/content'
file_type = ".csv"
zip_filename = "datasets.zip"
files_to_download = glob.glob(os.path.join(directory_path, "*" + file_type))

with zipfile.ZipFile(zip_filename, mode = "w") as archive:
    for csv_path in files_to_download:
        # Store each file under its bare name so the archive contains no folder structure
        archive.write(csv_path, arcname = os.path.basename(csv_path))

files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>