In [10]:
import pandas as pd
import numpy as np
from translate import translator
import time
import random
from babel import Locale
import json
import operator
from prompter import yesno

In [11]:
# Fetch the WordNet corpus through NLTK and import its reader.
# NOTE(review): `wordnet` is not used by any cell visible in this part of the
# notebook — presumably it is needed further down; confirm before removing.
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /home/sara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Folder prefix prepended to every input file name read below.
path = "Data/"
# Resident foreigners by citizenship (relative to `path`).
path_foreigners = "Resident foreigners on 1st January - Citizenship/resident_foreigners.csv"
# Resident population — presumably the series after 2011 (name suffix `a2011`); TODO confirm.
path_resident_a2011 = "Resident population  on 1st January/resident.csv"
# Estimated resident population for the years 2001-2011 (per the folder name).
path_resident_b2011 = "Estimated resident population within the borders of the time - Years 2001-2011/resident_before2011.csv"

In [17]:
# Load the tab-separated foreigners table, remove duplicated rows and the two
# columns that are not needed, then give the remaining columns short names.
resident_foreigners = (
    pd.read_table(path + path_foreigners, sep="\t")
    .drop_duplicates()
    .drop(["Tipo di indicatore demografico", "Flags"], axis=1)
)
resident_foreigners.columns = ["Territorio", "Cittadinanza", "Periodo", "Sesso", "Numero"]
# renumber the rows 0..n-1 after the de-duplication
resident_foreigners = resident_foreigners.reset_index(drop=True)
resident_foreigners.head()

Unnamed: 0,Territorio,Cittadinanza,Periodo,Sesso,Numero
0,Italia,Andorra,2003,maschi,5
1,Italia,Andorra,2004,maschi,2
2,Italia,Andorra,2005,maschi,1
3,Italia,Andorra,2006,maschi,1
4,Italia,Andorra,2007,maschi,2


In [21]:
# The age breakdown is discarded (only the "totale" age rows are kept) because
# the foreigners table carries no age information to compare against.
resident_after = (
    pd.read_table(path + path_resident_a2011)
    .drop_duplicates()
    .loc[lambda frame: frame["Età"] == "totale"]
    .drop(["Tipo di indicatore demografico", "Stato civile", "Età", "Flags"], axis=1)
)
resident_after.columns = ["Territorio", "Periodo", "Sesso", "Numero"]
resident_after.head()

Unnamed: 0,Territorio,Periodo,Sesso,Numero
2484,Italia,2012,maschi,28726599
2485,Italia,2013,maschi,28889597
2486,Italia,2014,maschi,29484564
2487,Italia,2015,maschi,29501590
2488,Italia,2016,maschi,29456321


In [22]:
# The age breakdown is discarded (only the "totale" age rows are kept) because
# the foreigners table carries no age information to compare against.
resident_before = (
    pd.read_table(path + path_resident_b2011)
    .drop_duplicates()
    .loc[lambda frame: frame["Età"] == "totale"]
    .drop(["Tipo dato", "Cittadinanza", "Età", "Flags"], axis=1)
)
# put the columns in the same order as the other resident table, then rename them
ordered_columns = ['Territorio', 'Seleziona periodo', 'Sesso', '0']
resident_before = resident_before[ordered_columns]
resident_before.columns = ["Territorio", "Periodo", "Sesso", "Numero"]
resident_before.head()

Unnamed: 0,Territorio,Periodo,Sesso,Numero
4563,Italia,2003,maschi,27658649
4564,Italia,2004,maschi,27845601
4565,Italia,2005,maschi,28044210
4566,Italia,2006,maschi,28138577
4567,Italia,2007,maschi,28212234


In [23]:
# Stack the two resident tables (rows of `resident_before` first, then
# `resident_after`) into a single frame.
# `pd.concat` is used instead of `DataFrame.append`, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. `ignore_index=True` already renumbers
# the rows 0..n-1, so no separate index reassignment is needed.
resident = pd.concat([resident_before, resident_after], ignore_index=True)
resident.head()

Unnamed: 0,Territorio,Periodo,Sesso,Numero
0,Italia,2003,maschi,27658649
1,Italia,2004,maschi,27845601
2,Italia,2005,maschi,28044210
3,Italia,2006,maschi,28138577
4,Italia,2007,maschi,28212234


In [24]:
# Spot check: show the rows for Lombardia in 2017
is_lombardia = resident["Territorio"] == "Lombardia"
is_2017 = resident["Periodo"] == 2017
resident[is_lombardia & is_2017]

Unnamed: 0,Territorio,Periodo,Sesso,Numero
3992,Lombardia,2017,maschi,4894363
3998,Lombardia,2017,femmine,5124803
4004,Lombardia,2017,totale,10019166


In [25]:
# Lookup table: one row per province with the region it belongs to (tab-separated file)
provincia_regione = pd.read_csv(path + "regioni.csv", sep="\t")
provincia_regione.head()

Unnamed: 0,Provincia,Regione
0,Torino,Piemonte
1,Vercelli,Piemonte
2,Novara,Piemonte
3,Cuneo,Piemonte
4,Asti,Piemonte


In [26]:
# Lookup table: one row per region with the zone it belongs to (tab-separated file)
regione_zona = pd.read_csv(path + "territori.csv", sep="\t")
regione_zona.head()

Unnamed: 0,Regione,Zona
0,Abruzzo,Sud
1,Basilicata,Sud
2,Calabria,Sud
3,Campania,Sud
4,Emilia-Romagna,Nord-est


Check that the aggregations are consistent, that is:
- the sum over the provinces (province) corresponds to the aggregated value at the region (regione) level;
- the sum over the regions corresponds to the aggregated value at the zone (nord, sud, etc.) level;
- for each tuple (Territorio, Cittadinanza, Anno), maschi + femmine = totale.

Check both the resident and the resident_foreigners

In [27]:
def _distinct(values):
    """Return the distinct entries of *values* as a list (in arbitrary order)."""
    return list(set(values))


# distinct regions and zones from the lookup table,
# distinct territories / years / citizenships from the foreigners table
regioni = _distinct(regione_zona["Regione"])
zone = _distinct(regione_zona["Zona"])
territori = _distinct(resident_foreigners["Territorio"])
years = _distinct(resident_foreigners["Periodo"])
origine = _distinct(resident_foreigners["Cittadinanza"])

### Resident Foreigners Check

In [28]:
# Region-level consistency check on `resident_foreigners`: within every
# (Regione, Cittadinanza, Periodo, Sesso) group, the province rows must add up
# to the value reported for the region itself.
temp = resident_foreigners.copy()

# keep only the province rows and the region rows
territori_1 = list(set(list(provincia_regione["Provincia"]) + list(provincia_regione["Regione"])))
temp = temp[temp["Territorio"].isin(territori_1)]
# do not consider the Valle d'Aosta / Vallée d'Aoste data since the aggregate
# and the singular data coincide; `.copy()` gives an independent frame so the
# assignments below never write into a slice of another frame
temp = temp[temp["Territorio"] != "Valle d'Aosta / Vallée d'Aoste"].copy()

regioni = list(set(provincia_regione["Regione"]))
province = list(set(provincia_regione["Provincia"]))

# flip the sign of the region-level (aggregate) rows, so that a consistent
# group sums to zero; `.loc` replaces the chained assignment
# `temp["Numero"][mask] = ...`, which may silently write to a temporary copy
is_region = temp["Territorio"].isin(regioni)
temp.loc[is_region, "Numero"] = -temp.loc[is_region, "Numero"]

# add the Regione field, so we can aggregate on it: provinces are mapped to
# their region (first match in the lookup table), every other row keeps its
# own Territorio value
province_to_region = provincia_regione.drop_duplicates("Provincia").set_index("Provincia")["Regione"]
is_province = temp["Territorio"].isin(province)
temp["Regione"] = temp["Territorio"].map(province_to_region).where(is_province, temp["Territorio"])

# sum all over the same (Regione, Cittadinanza, Periodo, Sesso)
temp["totale"] = temp.groupby(["Regione", "Cittadinanza", "Periodo", "Sesso"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Region-Aggregation works on 'resident_foreigners'!")
else:
    print("Region-Aggregation FAILS on 'resident_foreigners': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

Region-Aggregation works on 'resident_foreigners'!


In [29]:
# Zone-level consistency check on `resident_foreigners`: within every
# (Zona, Cittadinanza, Periodo, Sesso) group, the region rows must add up to
# the value reported for the zone itself.
temp = resident_foreigners.copy()

# keep only the region rows and the zone rows; `.copy()` gives an independent
# frame so the assignments below never write into a slice of another frame
territori_1 = list(set(list(regione_zona["Regione"]) + list(regione_zona["Zona"])))
temp = temp[temp["Territorio"].isin(territori_1)].copy()

# kept as in the original cell so the notebook namespace is unchanged
regioni = list(set(provincia_regione["Regione"]))
zone = list(set(regione_zona["Zona"]))

# flip the sign of the zone-level (aggregate) rows, so that a consistent group
# sums to zero; `.loc` replaces the chained assignment
# `temp["Numero"][mask] = ...`, which may silently write to a temporary copy
is_zone = temp["Territorio"].isin(zone)
temp.loc[is_zone, "Numero"] = -temp.loc[is_zone, "Numero"]

# add the Zona field, so we can aggregate on it: regions are mapped to their
# zone (first match in the lookup table), every other row keeps its own
# Territorio value. Membership is tested against `regione_zona` — the same
# table the zone is looked up in — so a region missing from
# `provincia_regione` can no longer be left unmapped.
region_to_zone = regione_zona.drop_duplicates("Regione").set_index("Regione")["Zona"]
is_region = temp["Territorio"].isin(region_to_zone.index)
temp["Zona"] = temp["Territorio"].map(region_to_zone).where(is_region, temp["Territorio"])

# sum all over the same (Zona, Cittadinanza, Periodo, Sesso)
temp["totale"] = temp.groupby(["Zona", "Cittadinanza", "Periodo", "Sesso"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Zone-Aggregation works on 'resident_foreigners'!")
else:
    print("Zone-Aggregation FAILS on 'resident_foreigners': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

Zone-Aggregation works on 'resident_foreigners'!


In [30]:
# Gender consistency check on `resident_foreigners`: within every
# (Territorio, Cittadinanza, Periodo) group, the non-"totale" rows must add up
# to the "totale" row.
temp = resident_foreigners.copy()

# flip the sign of the "totale" rows, so that a consistent group sums to zero;
# `.loc` replaces the chained assignment `temp["Numero"][labels] = ...`, which
# raised the SettingWithCopyWarning and may silently write to a temporary copy
is_total = temp["Sesso"] == "totale"
temp.loc[is_total, "Numero"] = -temp.loc[is_total, "Numero"]

# sum all over the same (Territorio, Cittadinanza, Periodo)
temp["totale"] = temp.groupby(["Territorio", "Cittadinanza", "Periodo"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Aggregation maschio + femmina works on 'resident_foreigners'!")
else:
    print("Aggregation maschio + femmina FAILS on 'resident_foreigners': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Aggregation maschio + femmina works on 'resident_foreigners'!


### Resident Check

In [31]:
# Region-level consistency check on `resident`: within every
# (Regione, Periodo, Sesso) group, the province rows must add up to the value
# reported for the region itself.
temp = resident.copy()

# keep only the province rows and the region rows
territori_1 = list(set(list(provincia_regione["Provincia"]) + list(provincia_regione["Regione"])))
temp = temp[temp["Territorio"].isin(territori_1)]
# do not consider the Valle d'Aosta / Vallée d'Aoste data since the aggregate
# and the singular data coincide; `.copy()` gives an independent frame so the
# assignments below never write into a slice of another frame
temp = temp[temp["Territorio"] != "Valle d'Aosta / Vallée d'Aoste"].copy()

regioni = list(set(provincia_regione["Regione"]))
province = list(set(provincia_regione["Provincia"]))

# flip the sign of the region-level (aggregate) rows, so that a consistent
# group sums to zero; `.loc` replaces the chained assignment
# `temp["Numero"][mask] = ...`, which may silently write to a temporary copy
is_region = temp["Territorio"].isin(regioni)
temp.loc[is_region, "Numero"] = -temp.loc[is_region, "Numero"]

# add the Regione field, so we can aggregate on it: provinces are mapped to
# their region (first match in the lookup table), every other row keeps its
# own Territorio value
province_to_region = provincia_regione.drop_duplicates("Provincia").set_index("Provincia")["Regione"]
is_province = temp["Territorio"].isin(province)
temp["Regione"] = temp["Territorio"].map(province_to_region).where(is_province, temp["Territorio"])

# sum all over the same (Regione, Periodo, Sesso)
temp["totale"] = temp.groupby(["Regione", "Periodo", "Sesso"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Region-Aggregation works on 'resident'!")
else:
    print("Region-Aggregation FAILS on 'resident': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

Region-Aggregation works on 'resident'!


In [32]:
# Zone-level consistency check on `resident`: within every
# (Zona, Periodo, Sesso) group, the region rows must add up to the value
# reported for the zone itself.
temp = resident.copy()

# keep only the region and the zone data; `.copy()` gives an independent frame
# so the assignments below never write into a slice of another frame
territori_1 = list(set(list(regione_zona["Regione"]) + list(regione_zona["Zona"])))
temp = temp[temp["Territorio"].isin(territori_1)].copy()

# kept as in the original cell so the notebook namespace is unchanged
regioni = list(set(provincia_regione["Regione"]))
zone = list(set(regione_zona["Zona"]))

# flip the sign of the zone-level (aggregate) rows, so that a consistent group
# sums to zero; `.loc` replaces the chained assignment
# `temp["Numero"][mask] = ...`, which may silently write to a temporary copy
is_zone = temp["Territorio"].isin(zone)
temp.loc[is_zone, "Numero"] = -temp.loc[is_zone, "Numero"]

# add the Zona field, so we can aggregate on it: regions are mapped to their
# zone (first match in the lookup table), every other row keeps its own
# Territorio value. Membership is tested against `regione_zona` — the same
# table the zone is looked up in — so a region missing from
# `provincia_regione` can no longer be left unmapped.
region_to_zone = regione_zona.drop_duplicates("Regione").set_index("Regione")["Zona"]
is_region = temp["Territorio"].isin(region_to_zone.index)
temp["Zona"] = temp["Territorio"].map(region_to_zone).where(is_region, temp["Territorio"])

# sum all over the same (Zona, Periodo, Sesso)
temp["totale"] = temp.groupby(["Zona", "Periodo", "Sesso"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Zone-Aggregation works on 'resident'!")
else:
    print("Zone-Aggregation FAILS on 'resident': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

Zone-Aggregation works on 'resident'!


In [33]:
# Gender consistency check on `resident`: within every (Territorio, Periodo)
# group, the non-"totale" rows must add up to the "totale" row.
temp = resident.copy()

# flip the sign of the "totale" rows, so that a consistent group sums to zero;
# `.loc` replaces the chained assignment `temp["Numero"][labels] = ...`, which
# raised the SettingWithCopyWarning and may silently write to a temporary copy
is_total = temp["Sesso"] == "totale"
temp.loc[is_total, "Numero"] = -temp.loc[is_total, "Numero"]

# sum all over the same (Territorio, Periodo)
temp["totale"] = temp.groupby(["Territorio", "Periodo"])["Numero"].transform("sum")

# the sum should always be 0; report explicitly when it is not
mismatches = int(temp["totale"].ne(0).sum())
if len(temp) > 0 and mismatches == 0:
    print("Aggregation maschio + femmina works on 'resident'!")
else:
    print("Aggregation maschio + femmina FAILS on 'resident': %d of %d rows are in a group with a non-zero sum."
          % (mismatches, len(temp)))

Aggregation maschio + femmina works on 'resident'!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


The aggregation checks are verified. Thus, it is possible to drop the redundant (aggregate) rows to get the normal form.

### .. to Normal Form

In [34]:
# Normal form of the foreigners table: drop every aggregate row (region, zone
# and national territories, the "Mondo" citizenship, the "totale" gender) so
# that only the finest-grained rows remain.
resident_foreigners_norm = resident_foreigners.copy()

# territories to drop: all regions, all zones and "Italia".
# Valle d'Aosta / Vallée d'Aoste is the only province of the region with the
# same name, so that label is kept. Filtering it out of the list (instead of
# `list.remove`) handles repeated entries and does not raise when it is absent.
aosta = "Valle d'Aosta / Vallée d'Aoste"
aggregate_territories = list(regione_zona["Regione"].values) + list(regione_zona["Zona"].values) + ["Italia"]
territori_1 = [name for name in aggregate_territories if name != aosta]

keep_row = (
    ~resident_foreigners_norm["Territorio"].isin(territori_1)
    & (resident_foreigners_norm["Cittadinanza"] != "Mondo")
    # drop total (maschi + femmine) data
    & (resident_foreigners_norm["Sesso"] != "totale")
)
# renumber the surviving rows 0..n-1
resident_foreigners_norm = resident_foreigners_norm[keep_row].reset_index(drop=True)

In [35]:
# Normal form of the resident table: keep only the territories that survive in
# `resident_foreigners_norm` and drop the "totale" gender rows.
# NOTE: this cell reads the "Territorio" column of `resident_foreigners_norm`,
# so it must run before the cell that renames the columns to English.
resident_norm = resident.copy()

# territories present in the normalised foreigners table
territori_1 = list(set(resident_foreigners_norm["Territorio"]))

keep_row = (
    resident_norm["Territorio"].isin(territori_1)
    # drop total (maschi + femmine) data
    & (resident_norm["Sesso"] != "totale")
)
# renumber the surviving rows 0..n-1
resident_norm = resident_norm[keep_row].reset_index(drop=True)

In [36]:
# Translate the Italian column headers into English
resident_columns_en = {"Territorio": "Province", "Periodo": "Year", "Sesso": "Gender", "Numero": "Value"}
foreigners_columns_en = dict(resident_columns_en, Cittadinanza="Country")

resident_norm = resident_norm.rename(columns=resident_columns_en)
resident_foreigners_norm = resident_foreigners_norm.rename(columns=foreigners_columns_en)

# Translate the Italian gender labels as well
gender_en = {"maschi": "male", "femmine": "female"}
resident_foreigners_norm["Gender"] = resident_foreigners_norm["Gender"].replace(gender_en)
resident_norm["Gender"] = resident_norm["Gender"].replace(gender_en)

In [37]:
# preview the first five rows of the normalised foreigners table
resident_foreigners_norm.head(5)

Unnamed: 0,Province,Country,Year,Gender,Value
0,Biella,Andorra,2005,male,1
1,Biella,Andorra,2005,female,0
2,Novara,Andorra,2005,male,0
3,Novara,Andorra,2005,female,2
4,Alessandria,Andorra,2017,male,0


In [38]:
# preview the first five rows of the normalised resident table
resident_norm.head(5)

Unnamed: 0,Province,Year,Gender,Value
0,Torino,2003,male,1045816
1,Torino,2004,male,1054784
2,Torino,2005,male,1058774
3,Torino,2006,male,1060448
4,Torino,2007,male,1062274


Problem with the data coherence: before 2008 we have "Serbia and Montenegro", while afterwards we have "Serbia" and "Montenegro" as separate countries. Montenegro proclaimed its independence in 2006, after a referendum.

In [39]:
# number of distinct provinces in the normalised foreigners table
n_provinces = len(set(resident_foreigners_norm["Province"]))
print("We have %d different provinces." % n_provinces)

We have 110 different provinces.


In [40]:
# number of distinct provinces in the normalised resident table
n_provinces = len(set(resident_norm["Province"]))
print("We have %d different provinces." % n_provinces)

We have 110 different provinces.


In [41]:
# Write the normalised tables to disk as tab-separated files, without the index column
tsv_options = dict(sep="\t", index=False)

for target in ("Data/resident_norm.csv", "Data_final/resident_norm.csv"):
    resident_norm.to_csv(target, **tsv_options)

# NOTE(review): the foreigners table is written only under Data/, not under
# Data_final/ — presumably it is processed further elsewhere; confirm.
resident_foreigners_norm.to_csv("Data/resident_foreigners_norm.csv", **tsv_options)