# Census Data to Indicators

## Environment

### R Libraries
The relvant R libraries are imported in to the kernal:

In [1]:
# Load R libraries
 if(!require("pacman"))
     install.packages("pacman")
     library("pacman")

#p_load("dplyr", "sf", "purrr", "tidyverse")
p_load("sf", "dplyr")

print("Loaded Packages:")
p_loaded()

Loading required package: pacman



[1] "Loaded Packages:"


### Output directory

In [2]:
# create the pipeline directory if it does not exist
pipelineDir <- file.path("../..","2_pipeline","Milan","1a_CensusData","2021")
if(!dir.exists(pipelineDir)){
    dir.create(pipelineDir, recursive = TRUE)
    print(paste0(pipelineDir, " created"))
}

## Load Data

### Import the csv data
Italy census data from: http://dati-censimentopopolazione.istat.it/Index.aspx?lang=en and https://www.istat.it/it/

In [3]:
# Read the census data
censusData <- read.csv('../../0_data/Milan/ItalianCensus/2021/Milano_indicatori_2021_sezioni.csv', sep=",")

## Prepare data
We only require a subset of the census data for our purposes. We therefore need to extract the relevant data, then combine these to create our vulnerability indicators.

In addition, the raw data is not suitable for use within the vulnerabiltiy assessment. It needs to be normalised based on the number of people/households within each small area. Therefore, the data is converted to percentages based on the total persons/households within each small area.

### Supporting data

#### Code that uniquely identifies the census area

In [4]:
areaIDIndicators <- c('SEZ2011')
areaID <- censusData[, areaIDIndicators, drop = FALSE]
#areaID$SEZ2011 <- as.character(areaID$SEZ2011)

censusDataRows = nrow(censusData)
head(censusDataRows)

#### Population total

In [5]:
populationTotal <- censusData[, 'P1', drop = FALSE]
names(populationTotal)[1] <- 'populationTotal'
head(populationTotal)

Unnamed: 0_level_0,populationTotal
Unnamed: 0_level_1,<int>
1,13
2,16
3,17
4,11
5,1
6,6


#### Households / families total

In [6]:
householdsTotal <- censusData[, 'PF1', drop = FALSE]
names(householdsTotal)[1] <- 'householdsTotal'
head(householdsTotal)

Unnamed: 0_level_0,householdsTotal
Unnamed: 0_level_1,<int>
1,9
2,7
3,17
4,7
5,1
6,3


### Sensitivity dimension data

#### Age domain data

In [7]:
# Dimension:Sensitivity - Domain:Age

# Get indicators and convert into percentage
## 1) Boys under 5 years of age (Indicator P30)
youthMalePct <- (censusData['P30'] / populationTotal) * 100.0
names(youthMalePct)[1] <- 'youngMalePct'

## 2) Girls under 5 years of age (Indicator P67)
youthFemalePct <- (censusData['P67'] / populationTotal) * 100.0
names(youthFemalePct)[1] <- 'youngFemalePct'

## 3) Men over 75 years of age (Indicator P45)
oldMalePct <- (censusData['P45'] / populationTotal) * 100.0
names(oldMalePct)[1] <- 'oldMalePct'

## 4) Women over 75 years of age (Indicator P82)
oldFemalePct <- (censusData['P82'] / populationTotal) * 100.0
names(oldFemalePct)[1] <- 'oldFemalePct'

# Combine all these indicators into an array for this domain
ageDomainPct <- cbind(youthMalePct,
                      youthFemalePct,
                      oldMalePct,
                      oldFemalePct)

# Print the first six rows of the data to visually check it looks OK
head(ageDomainPct)

Unnamed: 0_level_0,youngMalePct,youngFemalePct,oldMalePct,oldFemalePct
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,15.38462,0.0
2,0,0,6.25,6.25
3,0,0,23.52941,0.0
4,0,0,18.18182,0.0
5,0,0,100.0,0.0
6,0,0,0.0,0.0


### Adaptive Capacity dimension data

#### Income domain data

In [8]:
# Dimension Adaptive:Capacity - Domain:Income

# Get indicators and convert into percentage
## 1) Dependants Rate (% people under 16) - using under 15 for the 2022 Italian census
dependantsRateFields <- c('P14', 'P15', 'P16')
dependantsRateData <- censusData[, dependantsRateFields, drop = FALSE]
dependantsRate <- rowSums(dependantsRateData, na.rm=TRUE)
dependantsRatePct <- (dependantsRate / populationTotal) * 100.0
names(dependantsRatePct)[1] <- 'dependantsRatePct'

## 2) Unemployment population
### Calculate total poluation for ages 15-64.
### Note: P101 = total employed people aged 15-64.
### Note: Sum of P17, P18, P19, P20, P21, P22, P23, P24, P25 and P26 is total poluation for ages 15-64.
### Note: to calculate unemployed we use: P101 - total poluation for ages 15-64.
### Note: this unemployed figure might also indicate students not working
totalPopulation_15to64_Indicators <- c('P17','P18','P19','P20','P21','P22','P23','P24','P25','P26')
totalPopulation_15to64_Data <- censusData[, totalPopulation_15to64_Indicators, drop = FALSE]
totalPopulation_15to64 <- rowSums(totalPopulation_15to64_Data, na.rm=TRUE)
unemployed <- totalPopulation_15to64 - censusData$P101
unemployedPct <- (unemployed / populationTotal) * 100.0
names(unemployedPct)[1] <- 'unemployedPct'

# Combine all these indicators into an array for this domain
incomeDomainPct <- cbind(dependantsRatePct,
                         unemployedPct)

# Print the first six rows of the data to visually check it looks OK
head(incomeDomainPct)

Unnamed: 0_level_0,dependantsRatePct,unemployedPct
Unnamed: 0_level_1,<dbl>,<dbl>
1,7.692308,15.384615
2,12.5,31.25
3,0.0,17.647059
4,9.090909,9.090909
5,0.0,0.0
6,0.0,0.0


#### Information Access/Use domain data

In [9]:
# Dimension Adaptive:Capacity - Domain:Information Access/Use

# Get indicators and convert into percentage
## 1) Population with NO higher eductation
### Note: P83	(total population age 9+) = P86 + P87 + P88 + P89 + P90 (without qualification + all primary to highest education levels)
### Note: P83 (total population age 9+) = P16 + P17+ P18 + P19 + P20 + P21 + P22 + P23 + P24 + P25 + P26 + P27 + P28 + P29 (age 10+)
### Note: using without qualification (P86) + primary/elementary schools qualification only (P87) 
noHigherEductationIndicators <- c('P86','P87')
noHigherEductation <- censusData$P86 + censusData$P87
noHigherEductationPct <- (noHigherEductation / populationTotal) * 100.0
names(noHigherEductationPct)[1] <- 'noHigherEductationPct'

# Combine all these indicators into an array for this domain
informationAccessUseDomainPct <- cbind(noHigherEductationPct)

# Print the first six rows of the data to visually check it looks OK
head(informationAccessUseDomainPct)

Unnamed: 0_level_0,noHigherEductationPct
Unnamed: 0_level_1,<dbl>
1,7.692308
2,18.75
3,0.0
4,0.0
5,0.0
6,0.0


#### Local knowledge domain

In [10]:
# Dimension:Adaptive Capacity - Domain:Local Knowledge

# Get indicators and convert into percentage
## 1) Percentage of foreign nationals (ST1)
foreignNationalsPct <- (censusData['ST1'] / populationTotal) * 100.0
names(foreignNationalsPct)[1] <- 'foreignNationalsPct'

# Combine all these indicators into an array for this domain
localKnowledgeDomainPct <- cbind(foreignNationalsPct)

# Print the first six rows of the data to visually check it looks OK
head(localKnowledgeDomainPct)

Unnamed: 0_level_0,foreignNationalsPct
Unnamed: 0_level_1,<dbl>
1,23.07692
2,31.25
3,0.0
4,18.18182
5,0.0
6,66.66667


#### Social Network domain data

In [11]:
# Dimension Adaptive:Capacity - Domain:Social Network

# Get indicators and convert into percentage
## 1) Primary School Age Children (Age 5-9) (Indicator P15)
primarySchoolPct <- (censusData['P15'] / populationTotal) * 100.0
names(primarySchoolPct)[1] <- 'primarySchoolPct'

## 2) Households with one person (Indicator PF3)
onePersonHouseholdPct <- (censusData['PF3'] / householdsTotal) * 100.0
names(onePersonHouseholdPct)[1] <- 'onePersonHouseholdPct'

# Combine all these indicators into an array for this domain
socialNetworkDomainPct <- cbind(primarySchoolPct,
                                onePersonHouseholdPct)

# Print the first six rows of the data to visually check it looks OK
head(socialNetworkDomainPct)

Unnamed: 0_level_0,primarySchoolPct,onePersonHouseholdPct
Unnamed: 0_level_1,<dbl>,<dbl>
1,0.0,66.66667
2,0.0,42.85714
3,0.0,100.0
4,9.090909,57.14286
5,0.0,100.0
6,0.0,66.66667


### Combine all data into one table

In [12]:
# Combine all data into one table
indicatorDataPct <- cbind(areaID,
                          ageDomainPct,
                          incomeDomainPct,
                          informationAccessUseDomainPct,
                          localKnowledgeDomainPct,
                          socialNetworkDomainPct)
head(indicatorDataPct)

Unnamed: 0_level_0,SEZ2011,youngMalePct,youngFemalePct,oldMalePct,oldFemalePct,dependantsRatePct,unemployedPct,noHigherEductationPct,foreignNationalsPct,primarySchoolPct,onePersonHouseholdPct
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,151460000000.0,0,0,15.38462,0.0,7.692308,15.384615,7.692308,23.07692,0.0,66.66667
2,151460000000.0,0,0,6.25,6.25,12.5,31.25,18.75,31.25,0.0,42.85714
3,151460000000.0,0,0,23.52941,0.0,0.0,17.647059,0.0,0.0,0.0,100.0
4,151460000000.0,0,0,18.18182,0.0,9.090909,9.090909,0.0,18.18182,9.090909,57.14286
5,151460000000.0,0,0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
6,151460000000.0,0,0,0.0,0.0,0.0,0.0,0.0,66.66667,0.0,66.66667


## Calculate Z-Score
The raw data is not suitable for use within the vulnerabiltiy assessment. It needs to be standardised. Therefore, the data is converted to z-scores. Z-scores are:

>"A statistical measurement of a score's relationship to the mean (average value) in a group of scores. A Z-score of 0 means the score is the same as the mean (average value). A Z-score can be positive or negative, indicating whether it is above or below the mean and by how many standard deviations. Z-score standardisation represents the deviation of a raw score from its mean in standard deviation units." (Kazmierczak et al., 2015)

### Calculate the Z-score

In [13]:
# Copy the data
indicatorDataZ <- indicatorDataPct

# Get the number of columns in the data
indicatorDataPctColLength = ncol(indicatorDataPct)

# Calculate the z scores for each of the relevant columns - starting at the 2nd column
for(col in names(indicatorDataZ)[2:indicatorDataPctColLength]) {
  indicatorDataZ[paste0(col, "_Z")] = scale(indicatorDataZ[col])
}

# Remove the original data to leave only the area identifier and the z scores
indicatorDataZ <- indicatorDataZ[-c(2:indicatorDataPctColLength)]

head(indicatorDataZ)

Unnamed: 0_level_0,SEZ2011,youngMalePct_Z,youngFemalePct_Z,oldMalePct_Z,oldFemalePct_Z,dependantsRatePct_Z,unemployedPct_Z,noHigherEductationPct_Z,foreignNationalsPct_Z,primarySchoolPct_Z,onePersonHouseholdPct_Z
Unnamed: 0_level_1,<dbl>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>","<dbl[,1]>"
1,151460000000.0,-1.230411,-1.064144,2.1196292,-1.3062152,-0.7801407,-0.5060886,-0.6075338,0.284861877,-1.458348,0.9333249
2,151460000000.0,-1.230411,-1.064144,0.2302043,-0.3189938,0.1197864,1.3419462,0.884988,0.768744204,-1.458348,-0.7026211
3,151460000000.0,-1.230411,-1.064144,3.8043177,-1.3062152,-2.220024,-0.2425543,-1.6458099,-1.081394102,-1.458348,3.2236494
4,151460000000.0,-1.230411,-1.064144,2.6982091,-1.3062152,-0.5183437,-1.2391934,-1.6458099,-0.004949997,1.865648,0.2789465
5,151460000000.0,-1.230411,-1.064144,19.6216703,-1.3062152,-2.220024,-2.2981224,-1.6458099,-1.081394102,-1.458348,3.2236494
6,151460000000.0,-1.230411,-1.064144,-1.0625601,-1.3062152,-2.220024,-2.2981224,-1.6458099,2.865567617,-1.458348,0.9333249


### Output the Z-score data

In [14]:
# remove the Pct_Z in the filed names
names(indicatorDataZ) <- gsub("Pct_Z","",names(indicatorDataZ))

# Output the z-score data as a csv file
outputFile <- file.path(pipelineDir, "censusDataZ.csv")
write.csv(indicatorDataZ, outputFile, row.names = FALSE)

**END**