<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVISProject-CAValli_Team/blob/main/Preprocessing_story3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and connecting to Google Drive

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Reading and preprocessing data

In [33]:
all_files = glob.glob("/content/drive/MyDrive/DV_project/story3/*")
all_files.sort()
print(all_files)

['/content/drive/MyDrive/DV_project/story3/NEET_italy.csv', '/content/drive/MyDrive/DV_project/story3/italy.json']


## Dictionaries: country --> abbreviation (and viceversa)

## NEET in Italy

We read the CSV dataset.

In [107]:
NEET_italy = pd.read_csv("/content/drive/MyDrive/DV_project/story3/NEET_italy.csv", low_memory = False)

We observe the column values.

In [108]:
distinct_values_dict = {col: NEET_italy[col].unique() for col in NEET_italy.columns}
for col, values in distinct_values_dict.items():
    print(f"\nDistinct values for column '{col}': {values}")


Distinct values for column 'ITTER107': ['IT' 'ITC1' 'ITC2' 'ITC3' 'ITC4' 'ITDA' 'ITD1' 'ITD2' 'ITD3' 'ITD4'
 'ITD5' 'ITE1' 'ITE2' 'ITE3' 'ITE4' 'ITF1' 'ITF2' 'ITF3' 'ITF4' 'ITF5'
 'ITF6' 'ITG1' 'ITG2']

Distinct values for column 'Territorio': ['Italia' 'Piemonte' "Valle d'Aosta / Vallée d'Aoste" 'Liguria'
 'Lombardia' 'Trentino Alto Adige / Südtirol'
 'Provincia Autonoma Bolzano / Bozen' 'Provincia Autonoma Trento' 'Veneto'
 'Friuli-Venezia Giulia' 'Emilia-Romagna' 'Toscana' 'Umbria' 'Marche'
 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata' 'Calabria'
 'Sicilia' 'Sardegna']

Distinct values for column 'TIPO_DATO_FOL': ['NEET' 'NEET_I']

Distinct values for column 'Tipo dato': ['giovani Neet di 15-34 anni (non occupati e non in istruzione)'
 'incidenza dei giovani Neet di 15-34 anni (non occupati e non in istruzione)']

Distinct values for column 'SEXISTAT1': [1 2 9]

Distinct values for column 'Sesso': ['maschi' 'femmine' 'totale']

Distinct values for column 'ETA1': ['Y1

We discard uninteresting information (like the gender, because we are going to consider just the total value).

In [141]:
filtered_NEET = NEET_italy[( NEET_italy['Sesso'] == 'totale') \
                           & (NEET_italy['Condizione professionale europea'] == 'totale') \
                           & (~NEET_italy['Territorio'].isin(['Provincia Autonoma Bolzano / Bozen', 'Provincia Autonoma Trento']) ) \
                           & (NEET_italy['TIME'].isin([str(year) for year in range(2009, 2021)]) )
                           ]

We drop useless columns.

In [142]:
NEET_final = filtered_NEET.copy()
NEET_final.drop(['ITTER107', 'Tipo dato', 'SEXISTAT1', 'Sesso', 'ETA1', 'Classe di età', 'CONDIZIONE_PROF_EU', 'Condizione professionale europea', 'Seleziona periodo'], axis = 1, inplace = True)

In [144]:
NEET_absolute = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET']
NEET_abs = NEET_absolute.copy()
NEET_abs.drop('TIPO_DATO_FOL', axis = 1, inplace = True)

NEET_percentage = NEET_final[NEET_final['TIPO_DATO_FOL'] == 'NEET_I']
NEET_per = NEET_percentage.copy()
NEET_per.drop('TIPO_DATO_FOL', axis = 1, inplace = True)

In [145]:
years = NEET_final['TIME'].unique()
neet_dict = {}
for year in years:
    neet_temp = NEET_abs[NEET_abs['TIME'] == year].copy()
    neet_per_temp = NEET_per[NEET_per['TIME'] == year].copy()
    merged_df = pd.merge(neet_temp, neet_per_temp[['Territorio', 'Value']], on = ['Territorio'], how = 'left')

    merged_df[f'{year}'] = (merged_df['Value_x'] * 1000).round(0).astype(int)
    merged_df[f'{year}_K'] = merged_df['Value_x'].round(1)
    merged_df[f'{year}_perc'] = merged_df['Value_y'].round(1)
    merged_df.drop(['TIME', 'Value_x', 'Value_y'], axis = 1, inplace = True)

    neet_dict[f'NEET_{year}'] = merged_df

In [159]:
from functools import reduce
dataframes = [neet_dict[f'NEET_{year}'] for year in years]
neet_dict['NEET'] = reduce(lambda left, right: pd.merge(left, right, on = 'Territorio', how = 'inner'), dataframes)

In [164]:
region_dict = {
    "Valle d'Aosta / Vallée d'Aoste": "Valle d'Aosta/Vallée d'Aoste",
    "Trentino Alto Adige / Südtirol": "Trentino-Alto Adige/Südtirol"
}
neet_dict['NEET']['Territorio'] = neet_dict['NEET']['Territorio'].replace(region_dict)
neet_dict['NEET'].rename(columns = {'Territorio': 'Region'}, inplace = True)

In [165]:
neet_dict['NEET'].to_csv('dumbbell.csv', index = False)

# Download all CSV files

We download all the CSV files we created.

In [None]:
directory_path = '/content'
file_type = ".csv"
files_to_download = glob.glob(f"{directory_path}/*{file_type}")
zip_filename = "grouped_barcharts.zip"
with zipfile.ZipFile(zip_filename, "w") as zipf:
    for file in files_to_download:
        zipf.write(file, arcname = os.path.basename(file))
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

We remove all the CSV files we downloaded.

In [None]:
for file in files_to_download:
    os.remove(file)

# Choropleth Map: JSON file preparation

We load a JSON file containing a map of Italy.

In [148]:
json_path = "/content/drive/MyDrive/DV_project/story3/italy.json"
with open(json_path, 'r') as file:
    json_file = json.load(file)

In [154]:
json_file['objects']['regions'].get('geometries')[0]['properties']

{'reg_name': 'Piemonte', 'reg_istat_code_num': 1, 'reg_istat_code': '01'}

We add to the JSON file information about the NEET abundance.

In [156]:
for idx in range(len(json_file['objects']['regions'].get('geometries'))):
  print(json_file['objects']['regions'].get('geometries')[idx]['properties']['reg_name'])

Piemonte
Valle d'Aosta/Vallée d'Aoste
Lombardia
Trentino-Alto Adige/Südtirol
Veneto
Friuli-Venezia Giulia
Liguria
Emilia-Romagna
Toscana
Umbria
Marche
Lazio
Abruzzo
Molise
Campania
Puglia
Basilicata
Calabria
Sicilia
Sardegna


In [None]:

Trentino Alto Adige / Südtirol

In [157]:
neet_dict['NEET']['Territorio']

0                             Italia
1                           Piemonte
2     Valle d'Aosta / Vallée d'Aoste
3                            Liguria
4                          Lombardia
5     Trentino Alto Adige / Südtirol
6                             Veneto
7              Friuli-Venezia Giulia
8                     Emilia-Romagna
9                            Toscana
10                            Umbria
11                            Marche
12                             Lazio
13                           Abruzzo
14                            Molise
15                          Campania
16                            Puglia
17                        Basilicata
18                          Calabria
19                           Sicilia
20                          Sardegna
Name: Territorio, dtype: object

In [None]:
for idx in range(len(json_file['objects']['regions'].get('geometries'))):

  properties = json_file['objects']['regions'].get('geometries')[idx]['properties']
  name = properties['reg_name']
  abbreviation = state_abbreviations[name]

  if country_abbreviations_reversed.get(abbreviation) is not None:
    name = country_abbreviations_reversed.get(abbreviation)
  del properties['NAME']

  try:
    row1000 = neet_sheets['neet_sheet3'][neet_sheets['neet_sheet3']['Country'] == name].values[0]
    row = neet_sheets['neet_sheet3'][neet_sheets['neet_sheet3']['Country'] == name].values[0] * 1000
    abundance1000 = list(row1000)[1:15]
    abundance = list(row)[1:15]
    properties['name'] = name
    properties['abbreviation'] = abbreviation
    properties['abundance'] = abundance
    properties['abundance1000'] = abundance1000

  except:
    abundance = list(np.zeros(14))
    properties['name'] = name
    properties['abbreviation'] = abbreviation
    properties['abundance'] = abundance
    properties['abundance1000'] = abundance

We export the modified JSON file: we have still to remove by hand all info on Israel for a better visualisation of the map.

In [None]:
# We then need to remove (by hand) 'Israel' from the JSON file for a better vis of the map
out_path = "/content/choropleth.json"
with open(out_path, 'w') as file:
    json.dump(json_file, file)