In [16]:
# install.packages(c("ggplot2", "dplyr","car"))

In [17]:
library(ggplot2)
library(dplyr)
library(car)
library(reshape2)

# Part 1 - Data analysis

## 1. Load data

In [None]:
data <- read.csv("housing_data.csv")
head(data, 5)

Unnamed: 0_level_0,Crime.Rate,Average.Rooms,Public.Transport.Access,Number.of.Schools,Median.Home.Value
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<int>,<dbl>
1,,5.585324,10,3,47.90077
2,2.654339,5.395206,3,6,41.5391
3,4.619221,6.033965,9,4,48.51757
4,6.807575,5.418335,10,5,42.50757
5,2.414617,6.18932,2,4,51.39125


## 2. Summary statistics


In [None]:
# cat("Structur of data:\n\n")
# str(data)
# cat("\n-------------------------------------------------\n")
cat("Summary: ")
summary(data)

Summary: 

   Crime.Rate        Average.Rooms   Public.Transport.Access Number.of.Schools
 Min.   : 0.005305   Min.   :4.112   Min.   : 1.000          Min.   : 0.000   
 1st Qu.: 1.299938   1st Qu.:5.598   1st Qu.: 3.000          1st Qu.: 4.000   
 Median : 3.031481   Median :6.033   Median : 5.000          Median : 5.000   
 Mean   : 3.137415   Mean   :6.026   Mean   : 5.421          Mean   : 4.992   
 3rd Qu.: 4.584798   3rd Qu.:6.460   3rd Qu.: 8.000          3rd Qu.: 6.000   
 Max.   :12.631829   Max.   :7.801   Max.   :10.000          Max.   :10.000   
 NA's   :25          NA's   :15                                               
 Median.Home.Value
 Min.   :31.55    
 1st Qu.:43.23    
 Median :46.91    
 Mean   :47.10    
 3rd Qu.:50.85    
 Max.   :62.56    
                  

In [None]:
tail(data)

Unnamed: 0_level_0,Crime.Rate,Average.Rooms,Public.Transport.Access,Number.of.Schools,Median.Home.Value
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<int>,<dbl>
501,5.315444,6.626635,9,4,51.99554
502,,6.44462,4,6,53.57924
503,0.1,6.734687,4,3,49.65183
504,4.407423,5.625335,2,1,43.67766
505,1.373394,6.922176,6,3,50.66238
506,1.782187,6.13832,2,4,45.438


### 3. Get mean, mode and median, as well as the standard deviation for each variable

**Mean**

In [None]:
means <- function(df, column) {
    mean_value <- mean(df[[column]], na.rm = TRUE)
    return(mean_value)
}

for (col in colnames(data)) {
    cat(paste("Mean of", col, ":", means(data, col), "\n"))
}


Mean of Crime.Rate : 3.13741457390437 
Mean of Average.Rooms : 6.02573705745418 
Mean of Public.Transport.Access : 5.42094861660079 
Mean of Number.of.Schools : 4.99209486166008 
Mean of Median.Home.Value : 47.1035584244862 


**Mode**


In [None]:
mode <- function(df, column) {
    unique <- unique(df[[column]])
    mode_value <- unique[which.max(tabulate(match(df[[column]], unique)))]
    return(mode_value)
}

for (col in colnames(data)) {
    cat(paste("Mode of", col, ":", mode(data, col), "\n"))
}

Mode of Crime.Rate : 0.1 
Mode of Average.Rooms : NA 
Mode of Public.Transport.Access : 2 
Mode of Number.of.Schools : 5 
Mode of Median.Home.Value : 47.90076602 


**Median**

In [None]:
median_value <- function(df, column) {
  med <- median(df[[column]], na.rm = TRUE)
  return(med)
}

for (col in colnames(data)) {
  cat(paste("Median of", col, ":", median_value(data, col), "\n"))
}

Median of Crime.Rate : 3.031481002 
Median of Average.Rooms : 6.03317907 
Median of Public.Transport.Access : 5 
Median of Number.of.Schools : 5 
Median of Median.Home.Value : 46.91257421 


### 4. Compute the correlation between each pair of variables


In [None]:
corr_matrix <- cor(data, use = "pairwise.complete.obs")
corr_matrix

Unnamed: 0,Crime.Rate,Average.Rooms,Public.Transport.Access,Number.of.Schools,Median.Home.Value
Crime.Rate,1.0,0.109411375,0.0115047832,0.025079768,0.0879369585
Average.Rooms,0.10941138,1.0,-0.007422052,-0.008113311,0.8896695239
Public.Transport.Access,0.01150478,-0.007422052,1.0,0.014596282,0.0009097478
Number.of.Schools,0.02507977,-0.008113311,0.0145962817,1.0,-0.0072444918
Median.Home.Value,0.08793696,0.889669524,0.0009097478,-0.007244492,1.0


In [None]:
high_corr <- which(corr_matrix == max(corr_matrix[corr_matrix < 1]), arr.ind=TRUE)
print(high_corr)
cat(paste("The highest correlation is between columns: ",colnames(corr_matrix)[high_corr][1]," and ",colnames(corr_matrix)[high_corr][2]))


                  row col
Median.Home.Value   5   2
Average.Rooms       2   5
The highest correlation is between columns:  Median.Home.Value  and  Average.Rooms

In [None]:
cat("Na values: \n")
cat(paste("\nCrime.Rate:", colSums(is.na(data["Crime.Rate"]))))
cat(paste("\nAverage.Rooms:", colSums(is.na(data["Average.Rooms"]))))
cat(paste("\nPublic.Transport.Access:", colSums(is.na(data["Public.Transport.Access"]))))
cat(paste("\nNumber.of.Schools:", colSums(is.na(data["Number.of.Schools"]))))
cat(paste("\nMedian.Home.Value:", colSums(is.na(data["Median.Home.Value"]))))

# colSums(is.na(data))


Na values: 

Crime.Rate: 25
Average.Rooms: 15
Public.Transport.Access: 0
Number.of.Schools: 0
Median.Home.Value: 0

In [11]:
data_cleaned <- data %>% mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
data_cleaned

Crime.Rate,Average.Rooms,Public.Transport.Access,Number.of.Schools,Median.Home.Value
<dbl>,<dbl>,<int>,<int>,<dbl>
3.0314810,5.585324,10,3,47.90077
2.6543392,5.395206,3,6,41.53910
4.6192213,6.033965,9,4,48.51757
6.8075746,5.418335,10,5,42.50757
2.4146166,6.189320,2,4,51.39125
2.4146576,5.964833,6,4,49.64657
6.9480320,5.832736,7,4,48.76959
4.9185868,5.364705,9,4,38.21798
1.8263140,5.596260,3,6,44.69063
3.0314810,6.528774,10,4,52.59876


In [12]:
ggplot(cor_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile("test") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  labs(title = "Correlation Matrix Heatmap",
       x = "",
       y = "",
       fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ERROR: Error in eval(expr, envir, enclos): objet 'cor_matrix' introuvable


In [None]:
# descriptive_stats <- data %>%
#   summarise(across(everything(), list(
#     mean = ~mean(., na.rm = TRUE),
#     median = ~median(., na.rm = TRUE)
#   )))

# get_mode <- function(v) {
#   unique_v <- unique(v)
#   unique_v[which.max(tabulate(match(v, unique_v)))]
# }

# modes <- sapply(data, get_mode)

# descriptive_stats <- cbind(descriptive_stats, mode = modes)

# descriptive_stats


In [None]:
# # Compute correlations (ignoring NA values)
# cor_matrix <- cor(data, use = "pairwise.complete.obs")

# # Print correlation matrix
# print(cor_matrix)
