-
Notifications
You must be signed in to change notification settings - Fork 9
/
data_construction.R
111 lines (96 loc) · 3.97 KB
/
data_construction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
## Construction of dataset
##
## This is the code used to create the unicorn dataset,
## when you present this dataset as a genuine old timey dataset, it
## does not really work that well, with modern R code that produces the set.
##
## So maybe it was handed down through a time vortex by Antonie van Leeuwenhoek the time-traveler?
##
## This code is also cc-by-4.0. use remix and whatever you like.
library(tidyverse)
# ISOcodes package, contains groupings per country, but this is coded in a
# string in a row.
country_vec <- ISOcodes::UN_M.49_Regions %>%
filter(Name %in% c( "Western Europe") )%>%
mutate(countries = str_split(Children, ", ")) %>%
unnest() %>%
select(countries) %>%
left_join(ISOcodes::UN_M.49_Countries,
by = c("countries"="Code")) %>%
pull(Name) %>% str_trim()
# create country names and years
baseset <- crossing(
countryname = country_vec,
year = 1670:1680
) %>%
filter(!countryname %in% c("Liechtenstein","Monaco", "Luxembourg", "Belgium")) %>%
mutate(
exclude = case_when(
countryname == "Austria" & year == 1679 ~ TRUE,
TRUE ~ FALSE
),
random = rbernoulli(nrow(.),p = .1)
)
baseset %>% filter(year < 1675) %>% nrow()
baseset$exclude[baseset$year < 1674] %>% length() #20
baseset$exclude[baseset$year < 1674] <- rbernoulli(20, p = .2)
# some missing data from Spain in 76-85 , austria 1679 due to plagues
# Some more missing from early years
# random missing because I feel like it
# delete some countries because they do not exist yet. Liecht,
# Monaco and Luxembourg were part of france.
# Belgium did not really exist, sort of part of Netherlands (under Spanish rule?)
# Austria was not yet austria-hungary
# some size considerations
new_set <- baseset %>% filter(!exclude) %>% filter(!random) %>%
select(countryname,year)
## onward to create some data.
## ranges of population
## populations would be correlated with previous years.
## could use arima.
## Could draw a starting value from rnbinom, and add gausian (Normal) noise and round t
## nearest integer, ugly, but it would make more sense I guess.
create_pop <- function(n_values){
rep(rnbinom(1,20,.23), n_values) + rnorm(n_values, 0, 5) %>%
round(0)
}
### This took a lot of effort, but the trick is to make a function that
### works on a dataframe and returns a dataframe, if then use group_by and
### do, it works for everything. The real trick is, as is often the case,
### make it work on a subset, than apply on the total.
###
make_popcolumn <- . %>%
mutate(pop = create_pop(nrow(.)))
new_set %>%
group_by(countryname) %>%
do(make_popcolumn(.)) %>%
ggplot() +
geom_line(aes(year, pop, group = countryname))+
geom_point(aes(year, pop, color = countryname), alpha = .7)
final_set <- new_set %>%
group_by(countryname) %>%
do(make_popcolumn(.))
final_set %>%
ggplot() +
geom_line(aes(year, pop, group = countryname))+
geom_point(aes(year, pop, color = countryname), alpha = .7)
### creating unicycles set
### unicycles numbers and total price should be related.
### perhaps use some of the population as basis ?
### .70 percent can buy cycle but with wide margins because of outside
### influences? Average price, in dutch guilders.
### A unicycle in 2016 is approx 90 euros. rembrandt got approx 4000 scudi,
### 3200 gulden (guilders). .. I will use 90 guilders a piece, it being artisinal
### and all.
unicycles_price <- function(n_values){
round(rep(runif(1,80,90), n_values) + rt(n_values, 20, 5),1)
}
set2 <- final_set %>%
mutate(bikes = round(.7*pop + rt(1,df = 6),digits = 0)) %>%
mutate(total_turnover = round(bikes * unicycles_price(1),2)) %>%
mutate(name_of_country = str_to_upper(countryname) ) %>%
ungroup() %>%
select(-pop, -countryname)
set2 %>% write_csv("sales.csv")
final_set %>% write_csv("observations.csv")
## could make seperate currency translation. ~ 1 guilder is this in marks, table.