<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVISProject-CAValli_Team/blob/main/Preprocessing_story3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and connecting to Google Drive

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset read below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Reading and preprocessing data

In [2]:
# List every file in the story-3 data folder, in alphabetical order, and show the result.
all_files = sorted(glob.glob("/content/drive/MyDrive/DV_project/story3/*"))
print(all_files)

['/content/drive/MyDrive/DV_project/story3/LEAVERS_italy_2020.csv', '/content/drive/MyDrive/DV_project/story3/NEET_italy.csv', '/content/drive/MyDrive/DV_project/story3/NEET_italy_2020.csv', '/content/drive/MyDrive/DV_project/story3/NEET_italy_new.csv', '/content/drive/MyDrive/DV_project/story3/POVERTY_italy_2020.csv', '/content/drive/MyDrive/DV_project/story3/italy.json']


## NEET in Italy

We read the two CSV datasets (one covering 2009 to 2017, the other 2018 to 2023), which we will later merge together.

In [3]:
# Load the two NEET datasets (older series first, newer series second) with identical options.
NEET_italy, NEET_italy_new = (
    pd.read_csv(csv_path, low_memory = False)
    for csv_path in (
        "/content/drive/MyDrive/DV_project/story3/NEET_italy.csv",
        "/content/drive/MyDrive/DV_project/story3/NEET_italy_new.csv",
    )
)

We observe the column values of both datasets.

In [4]:
# Collect the distinct values of every column of the first NEET dataset, keyed by column name.
distinct_values_dict = {}
for column_name in NEET_italy.columns:
    distinct_values_dict[column_name] = NEET_italy[column_name].unique()

# Print one line per column so the available categories can be inspected.
for column_name, unique_values in distinct_values_dict.items():
    print(f"\nDistinct values for column '{column_name}': {unique_values}")


Distinct values for column 'ITTER107': ['IT' 'ITC1' 'ITC2' 'ITC3' 'ITC4' 'ITDA' 'ITD1' 'ITD2' 'ITD3' 'ITD4'
 'ITD5' 'ITE1' 'ITE2' 'ITE3' 'ITE4' 'ITF1' 'ITF2' 'ITF3' 'ITF4' 'ITF5'
 'ITF6' 'ITG1' 'ITG2']

Distinct values for column 'Territorio': ['Italia' 'Piemonte' "Valle d'Aosta / Vallée d'Aoste" 'Liguria'
 'Lombardia' 'Trentino Alto Adige / Südtirol'
 'Provincia Autonoma Bolzano / Bozen' 'Provincia Autonoma Trento' 'Veneto'
 'Friuli-Venezia Giulia' 'Emilia-Romagna' 'Toscana' 'Umbria' 'Marche'
 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata' 'Calabria'
 'Sicilia' 'Sardegna']

Distinct values for column 'TIPO_DATO_FOL': ['NEET' 'NEET_I']

Distinct values for column 'Tipo dato': ['giovani Neet di 15-34 anni (non occupati e non in istruzione)'
 'incidenza dei giovani Neet di 15-34 anni (non occupati e non in istruzione)']

Distinct values for column 'SEXISTAT1': [1 2 9]

Distinct values for column 'Sesso': ['maschi' 'femmine' 'totale']

Distinct values for column 'ETA1': ['Y1

In [5]:
# Collect the distinct values of every column of the second NEET dataset, keyed by column name.
distinct_values_dict = {}
for column_name in NEET_italy_new.columns:
    distinct_values_dict[column_name] = NEET_italy_new[column_name].unique()

# Print one line per column so the available categories can be inspected.
for column_name, unique_values in distinct_values_dict.items():
    print(f"\nDistinct values for column '{column_name}': {unique_values}")


Distinct values for column 'ITTER107': ['IT' 'ITC1' 'ITC2' 'ITC3' 'ITC4' 'ITDA' 'ITD1' 'ITD2' 'ITD3' 'ITD4'
 'ITD5' 'ITE1' 'ITE2' 'ITE3' 'ITE4' 'ITF1' 'ITF2' 'ITF3' 'ITF4' 'ITF5'
 'ITF6' 'ITG1' 'ITG2']

Distinct values for column 'Territorio': ['Italia' 'Piemonte' "Valle d'Aosta / Vallée d'Aoste" 'Liguria'
 'Lombardia' 'Trentino Alto Adige / Südtirol'
 'Provincia Autonoma Bolzano / Bozen' 'Provincia Autonoma Trento' 'Veneto'
 'Friuli-Venezia Giulia' 'Emilia-Romagna' 'Toscana' 'Umbria' 'Marche'
 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata' 'Calabria'
 'Sicilia' 'Sardegna']

Distinct values for column 'TIPO_DATO_FOL': ['NEET' 'NEET_I']

Distinct values for column 'Tipo dato': ['giovani Neet di 15-34 anni (non occupati e non in istruzione)'
 'incidenza dei giovani Neet di 15-34 anni (non occupati e non in istruzione)']

Distinct values for column 'ETA1': ['Y15-29']

Distinct values for column 'Classe di età': ['15-29 anni']

Distinct values for column 'TIME': ['2018' '2018

We discard the information we do not need (such as the breakdown by gender).

In [6]:
# Yearly TIME labels (as strings) kept from the first dataset: 2009 through 2017.
years = {str(year) for year in range(2009, 2018)}

# Row filters: gender total, employment-status total, not one of the two
# autonomous provinces, and a TIME value that is one of the yearly labels above.
is_total_sex = NEET_italy['Sesso'] == 'totale'
is_total_status = NEET_italy['Condizione professionale europea'] == 'totale'
is_autonomous_province = NEET_italy['Territorio'].isin(
    ['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento']
)
is_selected_year = NEET_italy['TIME'].isin(years)

filtered_NEET = NEET_italy[
    is_total_sex & is_total_status & ~is_autonomous_province & is_selected_year
]

In [7]:
# Yearly TIME labels (as strings) kept from the second dataset: 2018 through 2022.
years_new = {str(year) for year in range(2018, 2023)}

# Row filters: not one of the two autonomous provinces, and a TIME value
# that is one of the yearly labels above.
is_autonomous_province_new = NEET_italy_new['Territorio'].isin(
    ['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento']
)
is_selected_year_new = NEET_italy_new['TIME'].isin(years_new)

filtered_NEET_new = NEET_italy_new[~is_autonomous_province_new & is_selected_year_new]

We drop useless columns.

In [8]:
# Remove the columns that are not needed downstream; `drop` returns a new frame,
# so the filtered frames themselves are left untouched.
NEET_final = filtered_NEET.drop(
    columns = [
        'ITTER107', 'Tipo dato', 'SEXISTAT1', 'Sesso', 'ETA1', 'Classe di età',
        'CONDIZIONE_PROF_EU', 'Condizione professionale europea', 'Seleziona periodo',
    ]
)

NEET_final_new = filtered_NEET_new.drop(
    columns = ['ITTER107', 'Tipo dato', 'ETA1', 'Classe di età', 'Seleziona periodo']
)

We split both datasets into absolute values and percentages.

In [9]:
# First dataset: rows tagged 'NEET' hold the absolute figures, rows tagged 'NEET_I' the incidence.
# Once split, the tag column carries no information and is removed from the working copies.
NEET_absolute = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET']
NEET_abs = NEET_absolute.drop(columns = 'TIPO_DATO_FOL')
NEET_percentage = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET_I']
NEET_per = NEET_percentage.drop(columns = 'TIPO_DATO_FOL')

# Second dataset: same split.
NEET_absolute_new = NEET_final_new[NEET_final_new['TIPO_DATO_FOL'] == 'NEET']
NEET_abs_new = NEET_absolute_new.drop(columns = 'TIPO_DATO_FOL')
NEET_percentage_new = NEET_final_new[NEET_final_new['TIPO_DATO_FOL'] == 'NEET_I']
NEET_per_new = NEET_percentage_new.drop(columns = 'TIPO_DATO_FOL')

We create a dictionary of datasets, one for each year (with all regions, the absolute value, the absolute value in thousands of persons and the percentage).

In [10]:
def _yearly_neet_frame(abs_df, per_df, year):
    """Build the per-region NEET table for a single year.

    Parameters
    ----------
    abs_df : pandas.DataFrame
        Absolute figures; must contain at least 'Territorio', 'TIME' and 'Value'.
        Any other column is carried through unchanged.
    per_df : pandas.DataFrame
        Incidence figures; must contain at least 'Territorio', 'TIME' and 'Value'.
    year : str
        TIME label to select (compared with ``==`` against the 'TIME' column).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``abs_df`` for that year, with 'TIME' and the raw value
        columns replaced by three columns named after the year:
        ``'<year>'`` (Value * 1000, rounded and cast to int),
        ``'<year>_K'`` (Value rounded to one decimal) and
        ``'<year>_perc'`` (incidence rounded to one decimal).
    """
    abs_year = abs_df[abs_df['TIME'] == year]
    per_year = per_df[per_df['TIME'] == year]

    # Left join keeps every region of the absolute table; the two 'Value' columns
    # come out of the merge as 'Value_x' (absolute) and 'Value_y' (incidence).
    merged = pd.merge(abs_year, per_year[['Territorio', 'Value']], on = ['Territorio'], how = 'left')

    merged[f'{year}'] = (merged['Value_x'] * 1000).round(0).astype(int)
    merged[f'{year}_K'] = merged['Value_x'].round(1)
    merged[f'{year}_perc'] = merged['Value_y'].round(1)
    return merged.drop(columns = ['TIME', 'Value_x', 'Value_y'])


# One table per year, keyed 'NEET_<year>'. The years are visited in sorted order so the
# dictionary is filled deterministically (the year containers are sets).
neet_dict = {}

for year in sorted(years):
    neet_dict[f'NEET_{year}'] = _yearly_neet_frame(NEET_abs, NEET_per, year)

for year in sorted(years_new):
    neet_dict[f'NEET_{year}'] = _yearly_neet_frame(NEET_abs_new, NEET_per_new, year)

In [11]:
from functools import reduce

# Every yearly label from 2009 through 2022, i.e. the years of both source datasets.
years_reduce = {str(year) for year in range(2009, 2023)}

# Visit the years in sorted order: iterating the set directly yields an order that can
# change from one interpreter run to the next, which would make the column order of the
# merged frame vary between runs.
dataframes = [neet_dict[f'NEET_{year}'] for year in sorted(years_reduce)]

# Chain inner joins on the region name so the result has one row per region
# present in every yearly table, and three columns per year.
neet_dict['NEET'] = reduce(lambda left, right: pd.merge(left, right, on = 'Territorio', how = 'inner'), dataframes)

In [12]:
# Replacement spellings for two region names (slash without surrounding spaces, hyphenated Trentino).
# NOTE(review): presumably these match the `reg_name` values of the map JSON used later - confirm.
region_dict = {
    "Valle d'Aosta / Vallée d'Aoste": "Valle d'Aosta/Vallée d'Aoste",
    "Trentino Alto Adige / Südtirol": "Trentino-Alto Adige/Südtirol"
}

# Apply the replacements, then expose the region column under the name 'Region'.
neet_dict['NEET'] = (
    neet_dict['NEET']
    .assign(Territorio = lambda frame: frame['Territorio'].replace(region_dict))
    .rename(columns = {'Territorio': 'Region'})
)

In [13]:
# Put 'Region' first and every other column after it in lexicographic order, so each
# year's columns sit together as '<year>', '<year>_K', '<year>_perc'.
reordered_columns = [name for name in neet_dict['NEET'].columns if name != 'Region']
reordered_columns.sort()
neet_dict['NEET'] = neet_dict['NEET'][['Region', *reordered_columns]]

In [14]:
# Write the wide NEET table to 'dumbbell.csv' in the current working directory, without the index column.
# NOTE(review): the download cell further down collects '/content/*.csv', so this assumes the
# working directory is /content - confirm.
neet_dict['NEET'].to_csv('dumbbell.csv', index = False)

## NEET vs POVERTY vs LEAVERS

### NEET (2020)

In [31]:
# Load the NEET dataset used for the 2020 comparison.
NEET_2020 = pd.read_csv("/content/drive/MyDrive/DV_project/story3/NEET_italy_2020.csv", low_memory = False)

We discard the information we do not need, drop the unused columns, and split the data into absolute values and percentages.

In [32]:
# Keep only the rows whose TIME label is exactly '2020'.
NEET_2020_filtered = NEET_2020[NEET_2020['TIME'] == '2020']

# Remove the columns that are not needed downstream (`drop` returns a new frame).
NEET_2020_final = NEET_2020_filtered.drop(
    columns = ['ITTER107', 'Tipo dato', 'ETA1', 'Classe di età', 'Seleziona periodo', 'Flag Codes', 'Flags']
)

# Rows tagged 'NEET' hold the absolute figures, rows tagged 'NEET_I' the incidence;
# the tag column is removed from the working copies once the split is done.
NEET_2020_absolute = NEET_2020_final[NEET_2020_final['TIPO_DATO_FOL'] == 'NEET']
NEET_2020_abs = NEET_2020_absolute.drop(columns = 'TIPO_DATO_FOL')
NEET_2020_percentage = NEET_2020_final[NEET_2020_final['TIPO_DATO_FOL'] == 'NEET_I']
NEET_2020_per = NEET_2020_percentage.drop(columns = 'TIPO_DATO_FOL')

We create one dataframe with info on absolute value, absolute value in thousands and percentage.

In [33]:
# Join absolute figures with the incidence per region, derive the three output columns,
# normalise the two region spellings via `region_dict`, and drop the raw inputs.
# After the merge, 'Value_x' is the absolute figure and 'Value_y' the incidence.
NEET_2020_merged = (
    pd.merge(NEET_2020_abs, NEET_2020_per[['Territorio', 'Value']], on = ['Territorio'], how = 'left')
    .assign(
        abs = lambda frame: (frame['Value_x'] * 1000).round(0).astype(int),
        absK = lambda frame: frame['Value_x'].round(1),
        neet = lambda frame: frame['Value_y'].round(1),
        Territorio = lambda frame: frame['Territorio'].replace(region_dict),
    )
    .drop(columns = ['TIME', 'Value_x', 'Value_y'])
)

### POVERTY (2020)

In [18]:
# Load the 2020 poverty dataset.
POVERTY_2020 = pd.read_csv("/content/drive/MyDrive/DV_project/story3/POVERTY_italy_2020.csv", low_memory = False)

In [20]:
# Keep only the columns needed downstream by removing the frequency, indicator and time columns.
POVERTY_2020 = POVERTY_2020.drop(columns = ["Frequenza", "Indicatore", "Tempo"])

In [None]:
# Replacement spellings for two region names; the entries are identical to `region_dict`.
# NOTE(review): this dictionary is not applied in any of the cells visible here, and the
# Valle d'Aosta label shown in the POVERTY/LEAVERS tables carries extra quote characters
# that the first key would not match - confirm before relying on it.
region_dict_new = {
    "Valle d'Aosta / Vallée d'Aoste": "Valle d'Aosta/Vallée d'Aoste",
    "Trentino Alto Adige / Südtirol": "Trentino-Alto Adige/Südtirol"
}

In [34]:
# Display the poverty table after the column drop.
POVERTY_2020

Unnamed: 0,Territorio,Osservazione
0,Italia,13.5
1,Piemonte,8.9
2,"'Valle d""'Aosta / Vallée d""'Aoste'",6.9
3,Liguria,10.5
4,Lombardia,9.3
5,Trentino Alto Adige / Südtirol,5.6
6,Veneto,8.2
7,Friuli-Venezia Giulia,8.7
8,Emilia-Romagna,7.9
9,Toscana,8.5


### LEAVERS (2020)

In [28]:
# Load the 2020 leavers dataset.
LEAVERS_2020 = pd.read_csv("/content/drive/MyDrive/DV_project/story3/LEAVERS_italy_2020.csv", low_memory = False)

In [29]:
# Keep only the columns needed downstream by removing the frequency, indicator, gender, citizenship and time columns.
LEAVERS_2020 = LEAVERS_2020.drop(columns = ["Frequenza", "Indicatore", "Sesso", "Cittadinanza", "Tempo"])

In [30]:
# Display the leavers table after the column drop.
LEAVERS_2020

Unnamed: 0,Territorio,Osservazione
0,Italia,13.1
1,Piemonte,12.0
2,"'Valle d""'Aosta / Vallée d""'Aoste'",11.6
3,Liguria,10.7
4,Lombardia,11.9
5,Trentino Alto Adige / Südtirol,11.1
6,Veneto,10.5
7,Friuli-Venezia Giulia,8.5
8,Emilia-Romagna,9.3
9,Toscana,11.7


### Bubble chart

In [None]:
# Expose the region column under the name 'Region', then display the table.
NEET_2020_merged = NEET_2020_merged.rename(columns = {'Territorio': 'Region'})
NEET_2020_merged

# Download all CSV files

We download all the CSV files we created.

In [None]:
# Bundle every CSV found directly under /content into a single zip archive
# and hand that archive to the browser for download.
directory_path = '/content'
file_type = ".csv"
zip_filename = "grouped_barcharts.zip"
files_to_download = glob.glob(f"{directory_path}/*{file_type}")

with zipfile.ZipFile(zip_filename, "w") as archive:
    for csv_path in files_to_download:
        # Store each file under its bare name so the archive contains no folder structure.
        archive.write(csv_path, arcname = os.path.basename(csv_path))

files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We remove all the CSV files we downloaded.

In [None]:
# Delete every CSV that was collected for the download above.
for csv_path in files_to_download:
    os.remove(csv_path)

# Choropleth Map: JSON file preparation

We load a JSON file containing a map of Italy.

In [153]:
# Location of the map file on the mounted Drive.
json_path = "/content/drive/MyDrive/DV_project/story3/italy.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default encoding:
# the region names matched against this file later include accented characters.
with open(json_path, 'r', encoding = 'utf-8') as file:
    json_file = json.load(file)

We add to the JSON file information about the NEET abundance.

In [154]:
# Attach the NEET series to every region geometry of the map.
neet_table = neet_dict['NEET']

for geometry in json_file['objects']['regions'].get('geometries'):
    properties = geometry['properties']
    region = properties['reg_name']

    # First (expected: only) table row whose 'Region' equals this geometry's name.
    row = neet_table[neet_table['Region'] == region].values[0]
    row_values = list(row)

    # Position 0 is 'Region'; after it the columns repeat in groups of three per year
    # ('<year>', '<year>_K', '<year>_perc'), so a stride of 3 extracts one series each.
    properties['abundance'] = row_values[1::3]
    properties['abundance1000'] = row_values[2::3]
    properties['percentage'] = row_values[3::3]

We export the modified JSON file: we still have to remove by hand all info on Israel for a better visualisation of the map.

In [155]:
# Write the enriched map to /content. The 'Israel' entry still has to be removed
# from the exported file by hand for a better visualisation of the map.
out_path = "/content/choropleth_italy.json"
with open(out_path, 'w') as out_handle:
    json.dump(json_file, out_handle)