/
03a_remove_year_outliers.R
76 lines (59 loc) · 3.08 KB
/
03a_remove_year_outliers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# ----------------------------------------------------------
# UNU_countries: Remove unreliable values (extremes) based on values of other years per Country and UNU_Key
# ----------------------------------------------------------
# ----------------------------------------------------------
# Outlier detection using Median Absolute Deviation (MAD).
#
# For this outlier detection method, the median of values is calculated
# Then, the difference is calculated between each of these values and this median.
# These differences are expressed as their absolute values, and a new median of these absolute values is calculated.
# This is median absolute deviation (MAD).
# If a value is a certain number of MAD away from the median of the residuals, that value is classified as an outlier.
# The threshold is 4 MAD for the calculations performed in this project.
# Big advantage of the MAD is that it works with heavily tailed data too, which we see a lot in e-waste statistics.
# ----------------------------------------------------------
# *** Collect the neccesary data ***
# 1 - mad kg per inhabitant
dfmad_kpi <- as.data.frame(as.table(by(UNU_countries[, "kpi"],
UNU_countries[, c("Country", "UNU_Key")],
mad, constant = 1, na.rm=TRUE)))
names(dfmad_kpi)[3] <- "mad_kpi"
# 2 - number of valid values used in mad calculation
dfvalidn <- count(UNU_countries[!is.na(UNU_countries$kpi), ][ , c("Country", "UNU_Key")])
names(dfvalidn)[3] <- "validn"
# 3 - the last value needed is the median.
dfmed_kpi <- as.data.frame(as.table(by(UNU_countries[, "kpi"],
UNU_countries[, c("Country", "UNU_Key")],
median, na.rm=TRUE)))
names(dfmed_kpi)[3] <- "med_kpi"
# merge the first two, removing the NA's of groups with no data
dfmad_kpi <- merge(dfmad_kpi, dfvalidn, by=c("Country", "UNU_Key"))
# merge the median with them
dfmad_kpi <- merge(dfmad_kpi, dfmed_kpi, by=c("Country", "UNU_Key"))
# now merge these values with the original data
UNU_countries <- merge(UNU_countries, dfmad_kpi, by=c("Country", "UNU_Key"), all.x = TRUE)
# Set our sorting order correct again
sortorder <- order(UNU_countries$UNU_Key, UNU_countries$Country, UNU_countries$Year)
UNU_countries <- UNU_countries[sortorder, ]
# *** The outlier calculation ***
# Highs
selection <- which(UNU_countries$flag <50 &
UNU_countries$validn >= 6 &
UNU_countries$kpi - UNU_countries$med_kpi > distance_mad * UNU_countries$mad_kpi)
UNU_countries[selection, "kpi"] <- NA
UNU_countries[selection, "ppi"] <- NA
UNU_countries[selection, "flag"] <- 5.
# and lows
selection <- which(UNU_countries$flag <50 &
UNU_countries$validn >= 6 &
UNU_countries$med_kpi - UNU_countries$kpi > distance_mad * UNU_countries$mad_kpi)
UNU_countries[selection, "kpi"] <- NA
UNU_countries[selection, "ppi"] <- NA
UNU_countries[selection, "flag"] <- 4.
# clean-up
UNU_countries$mad_kpi <-NULL
UNU_countries$validn <-NULL
UNU_countries$med_kpi <-NULL
rm(dfmad_kpi)
rm(dfmed_kpi)
rm(dfvalidn)