### Introduction

In [1]:
library(tidyverse)
library(datateachr)
library(repr)
library(digest)
library(infer)
library(grid)
library(gridExtra)
library(ggplot2)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.4 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.2      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine




In [2]:
# Load the raw temperature table (a datetime column plus one column per city)
# and preview its first rows.
temperature <- read.csv(file = "temperature.csv")
head(temperature)

Unnamed: 0_level_0,datetime,Vancouver,Portland,San.Francisco,Seattle,Los.Angeles,San.Diego,Las.Vegas,Phoenix,Albuquerque,⋯,Philadelphia,New.York,Montreal,Boston,Beersheba,Tel.Aviv.District,Eilat,Haifa,Nahariyya,Jerusalem
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2012-10-01 12:00:00,,,,,,,,,,⋯,,,,,,,309.1,,,
2,2012-10-01 13:00:00,284.63,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,⋯,285.63,288.22,285.83,287.17,307.59,305.47,310.58,304.4,304.4,303.5
3,2012-10-01 14:00:00,284.629,282.0833,289.475,281.7972,291.8682,291.5335,293.4031,296.6085,285.1546,⋯,285.6632,288.2477,285.8346,287.1861,307.59,304.31,310.4958,304.4,304.4,303.5
4,2012-10-01 15:00:00,284.627,282.0919,289.4606,281.7898,291.8628,291.5434,293.3922,296.6315,285.234,⋯,285.7568,288.3269,285.8478,287.2317,307.3915,304.2818,310.4115,304.4,304.4,303.5
5,2012-10-01 16:00:00,284.625,282.1005,289.4462,281.7824,291.8575,291.5532,293.3812,296.6545,285.3133,⋯,285.8504,288.4062,285.8609,287.2773,307.1452,304.238,310.3273,304.4,304.4,303.5
6,2012-10-01 17:00:00,284.6229,282.1091,289.4319,281.7751,291.8522,291.5631,293.3702,296.6774,285.3927,⋯,285.9441,288.4855,285.8741,287.3228,306.8989,304.1942,310.2431,304.4,304.4,303.5


In [3]:
# Data clean-up
# 1. Keep only the San Francisco, Denver and New York columns.
# 2. Drop every row in which any of the three cities has a missing reading.
#    drop_na() tests for real NA values; comparing the numeric columns with
#    the string "NA" only appeared to work because a comparison involving NA
#    evaluates to NA, which filter() happens to discard.
# 3. Convert the temperatures from Kelvin to Celsius.
temp_clean <- temperature %>%
  select(San.Francisco, Denver, New.York) %>%
  drop_na() %>%
  mutate(
    San.Francisco = San.Francisco - 273.15,
    Denver = Denver - 273.15,
    New.York = New.York - 273.15
  )
head(temp_clean)

ERROR: Error in head(temp): object 'temp' not found


In [None]:
# Number of rows in the cleaned data; every row holds one reading per city,
# so this is also the number of observations available for each city.
# NOTE(review): the original note recorded 4460 data points per city --
# re-run this cell and confirm the figure against the actual output.
nrow(temp_clean)

In [None]:
# Mean and standard deviation of the cleaned temperature for each city.
# Column suffixes: _s = San Francisco, _d = Denver, _n = New York.
# NOTE: the result is stored in a variable called `mean`, which shadows
# base::mean as a variable name; calls such as mean(x) still resolve to the
# function because R skips non-function objects when looking up a call.
mean <- summarise(
  temp_clean,
  mean_s = mean(San.Francisco),
  mean_d = mean(Denver),
  mean_n = mean(New.York),
  sd_s = sd(San.Francisco),
  sd_d = sd(Denver),
  sd_n = sd(New.York)
)
mean

In [None]:
# Split the cleaned data into three single-column data frames, one per city,
# so that each city can be sampled independently.
temp_s <- select(temp_clean, San.Francisco)
temp_d <- select(temp_clean, Denver)
temp_n <- select(temp_clean, New.York)

In [None]:
# Bootstrap samples for each city.
# The seed is fixed once and the draws below are made in a fixed order
# (San Francisco, Denver, New York) so the results are reproducible.
set.seed(1)

# Step 1: take a single sample of 400 observations from each city.
sample_s <- rep_sample_n(temp_s, size = 400)
sample_d <- rep_sample_n(temp_d, size = 400)
sample_n <- rep_sample_n(temp_n, size = 400)

# Step 2: resample each of those samples with replacement, 3000 times,
# keeping the resample size equal to the original sample size (400).
resampled_s <- rep_sample_n(sample_s, size = 400, reps = 3000, replace = TRUE)
resampled_d <- rep_sample_n(sample_d, size = 400, reps = 3000, replace = TRUE)
resampled_n <- rep_sample_n(sample_n, size = 400, reps = 3000, replace = TRUE)

# Preview the resampled data for each city.
head(resampled_s)
head(resampled_d)
head(resampled_n)

In [None]:
# Bootstrap distribution of the mean and its 90% percentile confidence interval
# San Francisco

# One mean and one standard deviation per bootstrap replicate.
distribution_s <- summarise(
  group_by(resampled_s, replicate),
  mean_s2 = mean(San.Francisco),
  sd_s2 = sd(San.Francisco)
)

# Centre of the bootstrap distribution (mean of the replicate means).
bmean_s <- mean(pull(distribution_s, mean_s2))

# 90% interval: the 5th and 95th percentiles of the replicate means.
ci_s <- summarise(
  distribution_s,
  ci_lower = quantile(mean_s2, 0.05),
  ci_upper = quantile(mean_s2, 0.95)
)

head(distribution_s)
bmean_s
ci_s




# Denver

# One mean and one standard deviation per bootstrap replicate.
distribution_d <- summarise(
  group_by(resampled_d, replicate),
  mean_d2 = mean(Denver),
  sd_d2 = sd(Denver)
)

# Centre of the bootstrap distribution (mean of the replicate means).
bmean_d <- mean(pull(distribution_d, mean_d2))

# 90% interval: the 5th and 95th percentiles of the replicate means.
ci_d <- summarise(
  distribution_d,
  ci_lower = quantile(mean_d2, 0.05),
  ci_upper = quantile(mean_d2, 0.95)
)

head(distribution_d)
bmean_d
ci_d



# New York

# One mean and one standard deviation per bootstrap replicate.
distribution_n <- summarise(
  group_by(resampled_n, replicate),
  mean_n2 = mean(New.York),
  sd_n2 = sd(New.York)
)

# Centre of the bootstrap distribution (mean of the replicate means).
bmean_n <- mean(pull(distribution_n, mean_n2))

# 90% interval: the 5th and 95th percentiles of the replicate means.
ci_n <- summarise(
  distribution_n,
  ci_lower = quantile(mean_n2, 0.05),
  ci_upper = quantile(mean_n2, 0.95)
)

head(distribution_n)
bmean_n
ci_n

In [None]:
# Visualise the bootstrap distribution of the mean for San Francisco:
# histogram of the replicate means, a red line at the bootstrap mean and a
# shaded band covering the 90% confidence interval.
s_plot <- ggplot(distribution_s) +
  geom_histogram(aes(x = mean_s2)) +
  geom_vline(xintercept = bmean_s, size = 1, colour = "red") +
  annotate(
    "rect",
    xmin = ci_s$ci_lower, xmax = ci_s$ci_upper,
    ymin = 0, ymax = Inf,
    fill = "deepskyblue", alpha = 0.3
  ) +
  labs(
    title = "bootstrap distribution of average temperature in San Francisco",
    x = "average temperature(celsius)"
  )
s_plot

# Bootstrap distribution of the mean for Denver: histogram of the replicate
# means, a red line at the bootstrap mean and a shaded 90% confidence band.
d_plot <- ggplot(distribution_d) +
  geom_histogram(aes(x = mean_d2)) +
  geom_vline(xintercept = bmean_d, size = 1, colour = "red") +
  annotate(
    "rect",
    xmin = ci_d$ci_lower, xmax = ci_d$ci_upper,
    ymin = 0, ymax = Inf,
    fill = "deepskyblue", alpha = 0.3
  ) +
  labs(
    title = "bootstrap distribution of average temperature in Denver",
    x = "average temperature(celsius)"
  )
d_plot

# Bootstrap distribution of the mean for New York: histogram of the replicate
# means, a red line at the bootstrap mean and a shaded 90% confidence band.
n_plot <- ggplot(distribution_n) +
  geom_histogram(aes(x = mean_n2)) +
  geom_vline(xintercept = bmean_n, size = 1, colour = "red") +
  annotate(
    "rect",
    xmin = ci_n$ci_lower, xmax = ci_n$ci_upper,
    ymin = 0, ymax = Inf,
    fill = "deepskyblue", alpha = 0.3
  ) +
  labs(
    title = "bootstrap distribution of average temperature in New York",
    x = "average temperature(celsius)"
  )
n_plot


In [None]:
# Boxplot comparing the bootstrap means of the three cities.
cities <- c("SF", "Denver", "NY")

# Give every city's bootstrap table the same column names and a city label
# so the three tables can be stacked into one long data frame.
bp_s <- distribution_s %>%
  rename(dist_mean = mean_s2, dist_sd = sd_s2) %>%
  mutate(city = cities[1])
bp_d <- distribution_d %>%
  rename(dist_mean = mean_d2, dist_sd = sd_d2) %>%
  mutate(city = cities[2])
bp_n <- distribution_n %>%
  rename(dist_mean = mean_n2, dist_sd = sd_n2) %>%
  mutate(city = cities[3])
bp_data <- rbind(bp_s, bp_d, bp_n)

# One box per city, filled by city.
ggplot(bp_data, aes(x = city, y = dist_mean, fill = city)) +
  geom_boxplot(alpha = 0.3) +
  ggtitle("Boxplot of Sampling distribution mean") +
  theme_classic()

# Methods
"good things about this report" <br>
Since our project involves three different cities, we will use a one-way ANOVA to test whether there is a statistically significant difference between their mean temperatures. <br> <br>
Our Hypothesis will be <br>
#### $H_0: \mu = \mu_s = \mu_d = \mu_n$ vs $H_A:$ at least one of the cities has a different mean temperature <br> <br>


### Vanessa's Personal Note
The idea behind ANOVA: if the average variation between groups is large enough compared to the average variation within groups, then you can conclude that at least one group mean is not equal to the others.
1. Between-group variation: compares the mean of each group with the overall mean of the data, so individual data points matter less than the group means.
2. Within-group variation: the variation of each observation from its own group mean.
# Assumptions of ANOVA
1. Independence of the observations: each subject should belong to only one group. There is no relationship between the observations in each group. Having repeated measures for the same participants is not allowed.
2. No significant outliers in any cell of the design: check by drawing a boxplot.
3. Normality: the data for each design cell should be approximately normally distributed.
4. Homogeneity of variances: the variance of the outcome variable should be equal in every cell of the design.


### Limitation
- A one-way ANOVA can only determine whether at least one group mean differs from the others; it does not specify which groups are different from one another.