`library(readr)
# Read the tab-separated file AmesHousing_1.txt, forcing the Pool QC column
# to be parsed as character.
houses <- read_tsv(
  "AmesHousing_1.txt",
  col_types = cols("Pool QC" = col_character())
)`

`# Population standard deviation of a numeric vector: the square root of the
# average squared deviation from the mean (denominator n, not n - 1).
standard_deviation <- function(vector) {
  centre <- mean(vector)
  squared_dev <- (vector - centre)^2
  sqrt(sum(squared_dev) / length(squared_dev))
}` # without Bessel correction

`# Population standard deviation of the sale prices.
st_dev <- standard_deviation(houses[["SalePrice"]])`

`# Mean sale price. The variable reuses the name of the mean() function;
# later calls written as mean(...) still reach the function.
mean <- mean(houses[["SalePrice"]])`

**Task**

We have provided the **kernel density plot** of the `SalePrice` variable below to show how far \$220,000 is from the mean.

`library(ggplot2)`

`# Kernel density of SalePrice with three vertical reference lines: the mean,
# the price 220,000, and the mean plus one standard deviation. Each line maps
# colour to a constant label so that it gets a legend entry, and
# scale_colour_manual() assigns the actual colours to those labels.
# Reads the global variables mean and st_dev.
# NOTE(review): ggplot2 >= 3.4.0 prefers linewidth over size for lines;
# switch once the minimum ggplot2 version for this material is confirmed.
ggplot(data = houses, aes(x = SalePrice)) +
  geom_density(alpha = 0.1, color = "blue", fill = "blue") +
  geom_vline(aes(xintercept = mean, color = "Mean"), size = 1.2) +
  geom_vline(aes(xintercept = 220000, color = "220,000"), size = 1.2) +
  geom_vline(
    aes(xintercept = mean + st_dev, color = "Standard deviation"),
    size = 1.2
  ) +
  scale_y_continuous(labels = scales::comma) +
  # The argument name limits is written in full instead of relying on
  # partial matching of lim.
  scale_x_continuous(
    labels = scales::comma,
    limits = range(houses$SalePrice)
  ) +
  scale_colour_manual(
    values = c(
      "Mean" = "black",
      "220,000" = "red",
      "Standard deviation" = "orange"
    ),
    name = ""
  ) +
  theme_bw() +
  theme(legend.position = "top") +
  xlab("Sale Price") +
  ylab("Density")`
    
![image.png](attachment:image.png)

* Examine the graph and figure out whether a price of \$220,000 is very expensive. 


**Answer**

`Not expensive`

**Task**

Find out how many standard deviations away from the mean a price of \$220,000 is in the distribution of the `SalePrice` variable.

**Answer**

`# Signed distance of 220,000 from the mean sale price ...
distance <- 220000 - mean(houses[["SalePrice"]])
# ... and the same distance in units of the population standard deviation.
st_devs_away <- distance / standard_deviation(houses[["SalePrice"]])`

**Task**

1. Write a function that takes a value and the vector the value belongs to, and returns the z-score of that value.
2. Compute the z-scores for `min_val`, `mean_val`, and `max_val`.

**Answer**

`# Z-score of value relative to vector: how many standard deviations value
# lies above (positive) or below (negative) the mean of vector.
#   value:  numeric value(s) to standardize.
#   vector: numeric vector that supplies the mean and standard deviation.
#   bessel: TRUE  -> sample standard deviation via sd() (n - 1 denominator,
#                    i.e. with Bessel's correction);
#           FALSE -> population standard deviation via standard_deviation()
#                    (n denominator). This is the default.
# Returns a numeric of the same length as value.
z_score <- function(value, vector, bessel = FALSE) {
  # bessel is a single flag, so a plain if/else is used rather than ifelse().
  st_dev <- if (bessel) sd(vector) else standard_deviation(vector)
  (value - mean(vector)) / st_dev
}`

`# Smallest, average and largest sale price.
min_val <- min(houses[["SalePrice"]])
mean_val <- mean(houses[["SalePrice"]])
max_val <- max(houses[["SalePrice"]])`

`# Z-score of each of the three values, using z_score() with its defaults.
min_z <- z_score(min_val, houses[["SalePrice"]])
mean_z <- z_score(mean_val, houses[["SalePrice"]])
max_z <- z_score(max_val, houses[["SalePrice"]])`


**Task**

Find out the location for which \$200,000 has the z-score closest to 0.

**Answer**

`target_neighborhoods <- c("NAmes", "CollgCr", "OldTown", "Edwards", "Somerst")`

`library(dplyr)`

`# Isolate the data for the five neighborhoods, compute for each neighborhood
# the absolute z-score of a 200,000 sale price within that neighborhood, and
# sort so the z-score closest to 0 comes first.
houses %>%
  filter(Neighborhood %in% target_neighborhoods) %>%
  group_by(Neighborhood) %>%
  summarise(zscore = abs(z_score(200000, SalePrice))) %>%
  ungroup() %>%
  arrange(zscore)`

`# Location with the z-score closest to 0 (hard-coded answer)
best_investment <- "College Creek"`

 **Task**
 
 Transform the distribution of the `SalePrice` variable into a distribution of `z-scores`.
 
 **Answer**
 
`library(dplyr)`

`# Add z_prices: each sale price centred on the mean sale price and scaled by
# the population standard deviation of the sale prices.
houses <- mutate(
  houses,
  z_prices = (SalePrice - mean(SalePrice)) / standard_deviation(SalePrice)
)`

**Task**

* Compute the **mean** of the `z_prices` column. 
* Compute the **standard deviation** of the `z_prices` column.

**Answer**

`# Mean of the standardized prices.
z_mean_price <- mean(houses[["z_prices"]])`

`# Population standard deviation of the standardized prices.
z_stdev_price <- standard_deviation(houses[["z_prices"]])`


**Task**

Standardize the population of values stored in the `population` variable and compute its mean and its standard deviation.

`# Four-value population used in this exercise.
population <- c(0, 8, 0, 8)`

**Answer**

`# Mean and population standard deviation of the raw values.
mean_pop <- mean(population)
stdev_pop <- standard_deviation(population)`

`# Standardize: centre on the mean, scale by the standard deviation.
standardized_pop <- (population - mean_pop) / stdev_pop`

`# Mean and population standard deviation of the standardized values.
# Note: mean_z was assigned earlier (z-score of mean_val) and is overwritten.
mean_z <- mean(standardized_pop)
stdev_z <- standard_deviation(standardized_pop)`

**Task**

Compute the **standard deviation** of `standardized_sample` using the sample standard deviation formula.

`# Four-value sample used in this exercise.
sample <- c(0, 8, 0, 8)`

`# Sample mean.
x_bar <- mean(sample)`

`# Sample standard deviation: sd() uses the n - 1 denominator.
s <- sd(sample)`

`# Standardize the sample with its own mean and sample standard deviation.
standardized_sample <- (sample - x_bar) / s`


**Answer**

`# Standardize the sample with its mean x_bar and sample standard deviation s.
standardized_sample <- (sample - x_bar) / s`

`# Standard deviation of the standardized values, again with the sample
# formula (sd() uses the n - 1 denominator).
stdev_sample <- sd(standardized_sample)`

**Task**

Calculate total means and standard deviations for each index: `mean_index_1`, `mean_index_2`, `sd_index_1`, and `sd_index_2`

**Answer**

`# Derive two indices from SalePrice. index_1 is set to NA on odd-numbered
# rows and index_2 on even-numbered rows, so the two indices never share a row.
houses <- houses %>%
  mutate(
    index_1 = SalePrice / 100000 + 37,
    index_1 = replace(index_1, row_number() %% 2 != 0, NA_real_),
    index_2 = SalePrice / 90000 - 2.8,
    index_2 = replace(index_2, row_number() %% 2 == 0, NA_real_)
  )`

`# Means over the non-missing values. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
mean_index_1 <- mean(houses$index_1, na.rm = TRUE)
mean_index_2 <- mean(houses$index_2, na.rm = TRUE)`

`# Population standard deviations over the non-missing values; the NAs are
# dropped first because standard_deviation() has no NA handling of its own.
sd_index_1 <- standard_deviation(na.omit(houses$index_1))
sd_index_2 <- standard_deviation(na.omit(houses$index_2))`


**Task**

Standardize the distributions of the `index_1` and `index_2` variables.

**Answer**

`# Standardize each index with its own mean and standard deviation. Rows where
# an index is NA stay NA in the matching z column.
houses <- houses %>%
  mutate(
    z_1 = (index_1 - mean_index_1) / sd_index_1,
    z_2 = (index_2 - mean_index_2) / sd_index_2
  )`

`# Show the first two rows of the two new columns.
houses %>%
  select(z_1, z_2) %>%
  head(2)`

**Task**

`# Stack houses_1 and houses_2 (both defined outside this excerpt), replace
# missing z_1 / z_2 values with 0, and sum the two columns into z_merged.
# NOTE(review): presumably each row has only one non-missing z-score, so the
# sum simply picks it; confirm against how houses_1 / houses_2 are built.
houses_merged <- bind_rows(houses_1, houses_2) %>%
  mutate(
    z_1 = tidyr::replace_na(z_1, 0),
    z_2 = tidyr::replace_na(z_2, 0),
    z_merged = z_1 + z_2
  )`

* Transform the standardized distribution of `z_merged` to a distribution with a 
mean of `μ=50` and a standard deviation of `σ=10` using the formula `x=zσ+μ`


**Answer**

`# Target mean and standard deviation of the transformed distribution.
# These overwrite the earlier global variables of the same names.
mean <- 50
st_dev <- 10`

`# Apply x = z * sigma + mu to every merged z-score.
# NOTE(review): presumably houses_merged has no columns named mean or st_dev,
# so both names resolve to the variables set above; confirm.
houses_merged <- mutate(
  houses_merged,
  transformed = z_merged * st_dev + mean
)`

`# Mean and population standard deviation of the transformed values.
mean_transformed <- mean(houses_merged[["transformed"]])
stdev_transformed <- standard_deviation(houses_merged[["transformed"]])`