<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVISProject-CAValli_Team/blob/main/Preprocessing_story3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and connecting to Google Drive

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Reading and preprocessing data

In [26]:
all_files = glob.glob("/content/drive/MyDrive/DV_project/story3/*")
all_files.sort()
print(all_files)

['/content/drive/MyDrive/DV_project/story3/NEET_italy.csv', '/content/drive/MyDrive/DV_project/story3/NEET_italy_new.csv', '/content/drive/MyDrive/DV_project/story3/italy.json']


## NEET in Italy

We read the two CSV datasets (from 2009 to 2017, and from 2018 to 2023, to then merge together).

In [75]:
NEET_italy = pd.read_csv("/content/drive/MyDrive/DV_project/story3/NEET_italy.csv", low_memory = False)
NEET_italy_new = pd.read_csv("/content/drive/MyDrive/DV_project/story3/NEET_italy_new.csv", low_memory = False)

We observe the column values of both datasets.

In [76]:
distinct_values_dict = {col: NEET_italy[col].unique() for col in NEET_italy.columns}
for col, values in distinct_values_dict.items():
    print(f"\nDistinct values for column '{col}': {values}")


Distinct values for column 'ITTER107': ['IT' 'ITC1' 'ITC2' 'ITC3' 'ITC4' 'ITDA' 'ITD1' 'ITD2' 'ITD3' 'ITD4'
 'ITD5' 'ITE1' 'ITE2' 'ITE3' 'ITE4' 'ITF1' 'ITF2' 'ITF3' 'ITF4' 'ITF5'
 'ITF6' 'ITG1' 'ITG2']

Distinct values for column 'Territorio': ['Italia' 'Piemonte' "Valle d'Aosta / Vallée d'Aoste" 'Liguria'
 'Lombardia' 'Trentino Alto Adige / Südtirol'
 'Provincia Autonoma Bolzano / Bozen' 'Provincia Autonoma Trento' 'Veneto'
 'Friuli-Venezia Giulia' 'Emilia-Romagna' 'Toscana' 'Umbria' 'Marche'
 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata' 'Calabria'
 'Sicilia' 'Sardegna']

Distinct values for column 'TIPO_DATO_FOL': ['NEET' 'NEET_I']

Distinct values for column 'Tipo dato': ['giovani Neet di 15-34 anni (non occupati e non in istruzione)'
 'incidenza dei giovani Neet di 15-34 anni (non occupati e non in istruzione)']

Distinct values for column 'SEXISTAT1': [1 2 9]

Distinct values for column 'Sesso': ['maschi' 'femmine' 'totale']

Distinct values for column 'ETA1': ['Y1

In [77]:
distinct_values_dict = {col: NEET_italy_new[col].unique() for col in NEET_italy_new.columns}
for col, values in distinct_values_dict.items():
    print(f"\nDistinct values for column '{col}': {values}")


Distinct values for column 'ITTER107': ['IT' 'ITC1' 'ITC2' 'ITC3' 'ITC4' 'ITDA' 'ITD1' 'ITD2' 'ITD3' 'ITD4'
 'ITD5' 'ITE1' 'ITE2' 'ITE3' 'ITE4' 'ITF1' 'ITF2' 'ITF3' 'ITF4' 'ITF5'
 'ITF6' 'ITG1' 'ITG2']

Distinct values for column 'Territorio': ['Italia' 'Piemonte' "Valle d'Aosta / Vallée d'Aoste" 'Liguria'
 'Lombardia' 'Trentino Alto Adige / Südtirol'
 'Provincia Autonoma Bolzano / Bozen' 'Provincia Autonoma Trento' 'Veneto'
 'Friuli-Venezia Giulia' 'Emilia-Romagna' 'Toscana' 'Umbria' 'Marche'
 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata' 'Calabria'
 'Sicilia' 'Sardegna']

Distinct values for column 'TIPO_DATO_FOL': ['NEET' 'NEET_I']

Distinct values for column 'Tipo dato': ['giovani Neet di 15-34 anni (non occupati e non in istruzione)'
 'incidenza dei giovani Neet di 15-34 anni (non occupati e non in istruzione)']

Distinct values for column 'ETA1': ['Y15-29']

Distinct values for column 'Classe di età': ['15-29 anni']

Distinct values for column 'TIME': ['2018' '2018

We discard uninteresting information (like the gender).

In [78]:
years = {str(year) for year in range(2009, 2018)}
filtered_NEET = NEET_italy[( NEET_italy['Sesso'] == 'totale') \
                           & (NEET_italy['Condizione professionale europea'] == 'totale') \
                           & (~NEET_italy['Territorio'].isin(['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento']) ) \
                           & (NEET_italy['TIME'].isin([str(year) for year in range(2009, 2018)]) )
                           ]

In [79]:
years_new = {str(year) for year in range(2018, 2024)}
filtered_NEET_new = NEET_italy_new[(~NEET_italy_new['Territorio'].isin(['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento']) ) \
                                   & (NEET_italy_new['TIME'].isin([str(year) for year in range(2018, 2023)]) )
                                   ]

We drop useless columns.

In [80]:
NEET_final = filtered_NEET.copy()
NEET_final.drop(['ITTER107', 'Tipo dato', 'SEXISTAT1', 'Sesso', 'ETA1', 'Classe di età', 'CONDIZIONE_PROF_EU', 'Condizione professionale europea', 'Seleziona periodo'], axis = 1, inplace = True)

NEET_final_new = filtered_NEET_new.copy()
NEET_final_new.drop(['ITTER107', 'Tipo dato', 'ETA1', 'Classe di età', 'Seleziona periodo'], axis = 1, inplace = True)

We subdivide for absolute value and for percentage for both datasets.

In [81]:
NEET_absolute = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET']
NEET_abs = NEET_absolute.copy()
NEET_abs.drop('TIPO_DATO_FOL', axis = 1, inplace = True)
NEET_percentage = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET_I']
NEET_per = NEET_percentage.copy()
NEET_per.drop('TIPO_DATO_FOL', axis = 1, inplace = True)

NEET_absolute_new = NEET_final_new[NEET_final_new['TIPO_DATO_FOL'] == 'NEET']
NEET_abs_new = NEET_absolute_new.copy()
NEET_abs_new.drop('TIPO_DATO_FOL', axis = 1, inplace = True)
NEET_percentage_new = NEET_final_new[NEET_final_new['TIPO_DATO_FOL'] == 'NEET_I']
NEET_per_new = NEET_percentage_new.copy()
NEET_per_new.drop('TIPO_DATO_FOL', axis = 1, inplace = True)

We create a dictionary of datasets, one for each year (with all regions, the absolute value, the absolute value in thousands persons and the percentage).

In [82]:
neet_dict = {}

for year in years:
    neet_temp = NEET_abs[NEET_abs['TIME'] == year].copy()
    neet_per_temp = NEET_per[NEET_per['TIME'] == year].copy()
    merged_df = pd.merge(neet_temp, neet_per_temp[['Territorio', 'Value']], on = ['Territorio'], how = 'left')

    merged_df[f'{year}'] = (merged_df['Value_x'] * 1000).round(0).astype(int)
    merged_df[f'{year}_K'] = merged_df['Value_x'].round(1)
    merged_df[f'{year}_perc'] = merged_df['Value_y'].round(1)
    merged_df.drop(['TIME', 'Value_x', 'Value_y'], axis = 1, inplace = True)

    neet_dict[f'NEET_{year}'] = merged_df

for year in years_new:
    neet_temp_new = NEET_abs_new[NEET_abs_new['TIME'] == year].copy()
    neet_per_temp_new = NEET_per_new[NEET_per_new['TIME'] == year].copy()
    merged_df_new = pd.merge(neet_temp_new, neet_per_temp_new[['Territorio', 'Value']], on = ['Territorio'], how = 'left')

    merged_df_new[f'{year}'] = (merged_df_new['Value_x'] * 1000).round(0).astype(int)
    merged_df_new[f'{year}_K'] = merged_df_new['Value_x'].round(1)
    merged_df_new[f'{year}_perc'] = merged_df_new['Value_y'].round(1)
    merged_df_new.drop(['TIME', 'Value_x', 'Value_y'], axis = 1, inplace = True)

    neet_dict[f'NEET_{year}'] = merged_df_new

In [83]:
neet_dict

{'NEET_2016':                         Territorio     2016  2016_K  2016_perc
 0                           Italia  2214129  2214.1       24.3
 1                         Piemonte   118489   118.5       20.0
 2   Valle d'Aosta / Vallée d'Aoste     3096     3.1       17.7
 3                          Liguria    34859    34.9       17.6
 4                        Lombardia   239413   239.4       16.9
 5   Trentino Alto Adige / Südtirol    21461    21.5       12.6
 6                           Veneto   109680   109.7       15.6
 7            Friuli-Venezia Giulia    28006    28.0       17.8
 8                   Emilia-Romagna    92047    92.0       15.7
 9                          Toscana    89087    89.1       18.0
 10                          Umbria    21814    21.8       17.7
 11                          Marche    41803    41.8       19.2
 12                           Lazio   192724   192.7       22.5
 13                         Abruzzo    48925    48.9       24.7
 14                        

In [70]:
from functools import reduce
years_reduce = {str(year) for year in range(2009, 2023)}
dataframes = [neet_dict[f'NEET_{year}'] for year in years_reduce]
neet_dict['NEET'] = reduce(lambda left, right: pd.merge(left, right, on = 'Territorio', how = 'inner'), dataframes)

In [71]:
region_dict = {
    "Valle d'Aosta / Vallée d'Aoste": "Valle d'Aosta/Vallée d'Aoste",
    "Trentino Alto Adige / Südtirol": "Trentino-Alto Adige/Südtirol"
}
neet_dict['NEET']['Territorio'] = neet_dict['NEET']['Territorio'].replace(region_dict)
neet_dict['NEET'].rename(columns = {'Territorio': 'Region'}, inplace = True)

In [72]:
neet_dict['NEET']

Unnamed: 0,Region,2016,2016_K,2016_perc,2011,2011_K,2011_perc,2014,2014_K,2014_perc,...,2021_perc,2022,2022_K,2022_perc,2009,2009_K,2009_perc,2015,2015_K,2015_perc
0,Italia,2214129,2214.1,24.3,2097124,2097.1,22.5,2413297,2413.3,26.2,...,23.1,1669553,1669.6,19.0,1923599,1923.6,20.5,2349101,2349.1,25.7
1,Piemonte,118489,118.5,20.0,97816,97.8,16.3,126683,126.7,21.3,...,19.2,90615,90.6,15.4,96410,96.4,15.9,118744,118.7,20.0
2,Valle d'Aosta/Vallée d'Aoste,3096,3.1,17.7,2666,2.7,15.2,3328,3.3,19.1,...,18.0,2710,2.7,15.3,2582,2.6,14.6,3417,3.4,19.5
3,Liguria,34859,34.9,17.6,30291,30.3,15.4,43089,43.1,21.6,...,19.6,29432,29.4,14.8,27844,27.8,14.4,40881,40.9,20.5
4,Lombardia,239413,239.4,16.9,209315,209.3,15.1,255516,255.5,18.2,...,18.4,198915,198.9,13.6,196688,196.7,14.2,261916,261.9,18.6
5,Trentino-Alto Adige/Südtirol,21461,21.5,12.6,18220,18.2,11.0,23963,24.0,14.3,...,15.4,18230,18.2,10.5,16256,16.3,9.9,21980,22.0,13.0
6,Veneto,109680,109.7,15.6,107839,107.8,15.2,118255,118.3,16.8,...,13.9,93972,94.0,13.1,89178,89.2,12.4,118990,119.0,17.0
7,Friuli-Venezia Giulia,28006,28.0,17.8,25781,25.8,16.1,29119,29.1,18.3,...,16.2,22102,22.1,13.5,21626,21.6,13.3,29234,29.2,18.5
8,Emilia-Romagna,92047,92.0,15.7,86770,86.8,15.0,120263,120.3,20.6,...,15.1,76041,76.0,12.2,71335,71.3,12.4,111417,111.4,19.1
9,Toscana,89087,89.1,18.0,81184,81.2,16.1,100603,100.6,20.1,...,17.9,69555,69.6,13.8,64019,64.0,12.7,92333,92.3,18.6


In [37]:
neet_dict['NEET'].to_csv('dumbbell.csv', index = False)

# Download all CSV files

We download all the CSV files we created.

In [None]:
directory_path = '/content'
file_type = ".csv"
files_to_download = glob.glob(f"{directory_path}/*{file_type}")
zip_filename = "grouped_barcharts.zip"
with zipfile.ZipFile(zip_filename, "w") as zipf:
    for file in files_to_download:
        zipf.write(file, arcname = os.path.basename(file))
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We remove all the CSV files we downloaded.

In [None]:
for file in files_to_download:
    os.remove(file)

# Choropleth Map: JSON file preparation

We load a JSON file containing a map of Italy.

In [None]:
json_path = "/content/drive/MyDrive/DV_project/story3/italy.json"
with open(json_path, 'r') as file:
    json_file = json.load(file)

We add to the JSON file information about the NEET abundance.

In [None]:
for idx in range(len(json_file['objects']['regions'].get('geometries'))):

  properties = json_file['objects']['regions'].get('geometries')[idx]['properties']
  region = properties['reg_name']

  row = neet_dict['NEET'][neet_dict['NEET']['Region'] == region].values[0]

  properties['abundance'] = list(row)[1::3]
  properties['abundance1000'] = list(row)[2::3]
  properties['percentage'] = list(row)[3::3]

We export the modified JSON file: we have still to remove by hand all info on Israel for a better visualisation of the map.

In [None]:
# We then need to remove (by hand) 'Israel' from the JSON file for a better vis of the map
out_path = "/content/choropleth_italy.json"
with open(out_path, 'w') as file:
    json.dump(json_file, file)