In [None]:
# Load the training set from the current working directory. The first CSV
# column is used as the row names of the data frame.
# (The former `setwd('.')` call was a no-op and has been removed.)
house_prices <- read.csv(file = "train.csv", row.names = 1)

In [None]:
library(psych)
library(corrplot)
library(reshape2)
library(ggplot2)
library(tidyr)
library(Hmisc)
library(cluster)
library(coefplot)
library(car)
library(caret)

In [None]:
# List every column available in the raw data frame.
names(house_prices)

In [None]:
# Flag the numeric columns and keep only those in a separate data frame.
# Note: this is a snapshot of `house_prices` at this point; transformations
# applied to `house_prices` afterwards are not reflected in it.
nums <- vapply(house_prices, is.numeric, logical(1))
house_prices_nums <- house_prices[, nums]

In [None]:
# One histogram per numeric column (psych package).
psych::multi.hist(house_prices_nums)

In [None]:
# Per-column summary of the full data frame.
summary(object = house_prices)

In [None]:
# Distribution of the raw sale price.
hist(house_prices[, "SalePrice"])

In [None]:
# Replace SalePrice by its natural logarithm in place, then inspect the
# transformed distribution as a histogram and as a kernel density estimate.
# Caution: the column is overwritten, so re-running this cell applies the
# logarithm a second time.
house_prices$SalePrice <- log(house_prices$SalePrice)
hist(house_prices[, "SalePrice"])
d <- density(house_prices$SalePrice)
plot(d, main = "Sale Price")

In [None]:
# Kernel density estimate of the raw lot area.
d <- density(x = house_prices$LotArea)
plot(x = d, main = "Lot Area")

In [None]:
# Kernel density estimate of the log-transformed lot area (the column itself
# is left untouched).
d <- density(x = log(house_prices$LotArea))
plot(x = d, main = "Lot Area")

In [None]:
# Preview the densities of the log-transformed surface features in a 2x2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(density(log(house_prices$X2ndFlrSF)))
plot(density(log(house_prices$X1stFlrSF)))
plot(density(log(house_prices$TotalBsmtSF)))
plot(density(log(house_prices$GrLivArea)))
par(old_par)

In [None]:
# Log-transform the surface features in place.
# log1p(x) = log(1 + x) is used instead of log(x): a surface of 0 (the notebook
# later selects rows with X2ndFlrSF == 0) would otherwise be stored as -Inf.
# For the large values typical of these columns log1p(x) is very close to
# log(x).
surface_cols <- c("X2ndFlrSF", "X1stFlrSF", "TotalBsmtSF", "GrLivArea")
house_prices[surface_cols] <- lapply(house_prices[surface_cols], log1p)

In [None]:

# Keep only the rows without any missing value, then compute and display the
# pairwise correlation matrix of the numeric features (upper triangle only).
house_prices_nums_nona <- drop_na(house_prices_nums)
corr <- cor(house_prices_nums_nona)
corrplot(corr, method = "color", type = "upper")

In [None]:
# Number of numeric columns and the position of the target column (SalePrice)
# among them; the column count is printed.
house_prices_ncol <- ncol(house_prices_nums)
obj_field <- which(names(house_prices_nums) == "SalePrice")
print(house_prices_ncol)

Scatter plots: SalePrice vs. every other feature

In [None]:
# Scatter plots of every numeric feature against SalePrice, at most nine
# facets per figure.
#
# The feature columns are every column except the target, so the loop no
# longer assumes that SalePrice is the last column, no longer shadows
# base::max(), and performs zero iterations when there is no feature column.
plots_per_page <- 9
feature_cols <- setdiff(seq_len(house_prices_ncol), obj_field)
col_chunks <- split(
  feature_cols,
  ceiling(seq_along(feature_cols) / plots_per_page)
)
for (chunk in col_chunks) {
  # Long format: one row per (observation, feature) pair, SalePrice kept as id.
  house_prices_nums2 <- melt(
    house_prices_nums_nona[, c(chunk, obj_field)],
    id.vars = "SalePrice"
  )
  print(
    ggplot(house_prices_nums2, aes(x = SalePrice, y = value)) +
      geom_point() +
      facet_wrap("variable", scales = "free")
  )
}

In [None]:
# Drop outlying observations: lots larger than 200000 or with a frontage
# above 250.
#
# A logical mask is used instead of `df[-which(cond), ]`: when no row matches,
# which() returns integer(0) and negative indexing with it would drop EVERY
# row instead of none. `%in% TRUE` treats an NA comparison as "not an outlier",
# which matches how which() ignores NA.
is_outlier <- house_prices_nums_nona$LotArea > 200000 |
  house_prices_nums_nona$LotFrontage > 250
house_prices_nums_nona <- house_prices_nums_nona[!(is_outlier %in% TRUE), ]

Correlation with sale price

In [None]:
# Absolute correlation of each numeric feature with SalePrice, strongest
# first. `corr` is the matrix computed earlier in the notebook.
sale_corr <- abs(corr[, "SalePrice"])
sort(x = sale_corr, decreasing = TRUE)

P values of the correlations

In [None]:
# Pearson correlation test between every pair of numeric features (Hmisc);
# keep the p-value matrix and list the p-values against SalePrice in
# increasing order.
corr_test <- rcorr(as.matrix(house_prices_nums_nona), type = "pearson")
pvalues <- corr_test$P
pvalues_price <- pvalues[, "SalePrice"]
sort(pvalues_price)

In [None]:
library(FactoMineR)
library(factoextra)

In [None]:
# Principal component analysis on all numeric columns (rows with missing
# values included).
prices_pca <- PCA(X = house_prices_nums)

In [None]:
# Variable plot of the PCA, restricted to the ten most contributing
# variables and coloured by contribution.
contrib_palette <- c("#00AFBB", "#E7B800", "#FC4E07")
fviz_pca_var(
  prices_pca,
  col.var = "contrib",
  gradient.cols = contrib_palette,
  select.var = list(contrib = 10),
  repel = TRUE  # avoid overlapping labels
)

In [None]:
# Biplot of the PCA: ten most contributing variables, 200 most contributing
# individuals, individuals coloured by neighbourhood.
# Neighborhood is converted explicitly to a factor so that it is treated as a
# grouping variable whatever type read.csv() produced (character under
# R >= 4.0), consistent with the cluster-coloured biplot further down.
fviz_pca_biplot(
  prices_pca,
  select.var = list(contrib = 10),
  select.ind = list(contrib = 200),
  label = "var",
  col.ind = factor(house_prices$Neighborhood)
)

In [None]:
# Within-cluster sum of squares for k = 1..24 (k-means), used to choose the
# number of clusters with the elbow method. The seed fixes the random starts.
set.seed(31)
elbow_plot <- fviz_nbclust(
  house_prices_nums_nona,
  kmeans,
  method = "wss",
  k.max = 24
)
elbow_plot + theme_minimal() + ggtitle("the Elbow Method")

In [None]:
# Agglomerative hierarchical clustering (Ward's method), dendrogram, and a cut
# into 7 groups.
cah.res <- agnes(house_prices_nums_nona, method = "ward")
# `which.plots` is spelled out in full instead of relying on partial matching.
plot(cah.res, which.plots = 2, cex = 0.6, main = "Dendrogramme")
# rect.hclust() and cutree() are written for "hclust" objects. An agnes object
# does not store its heights in merge order, which rect.hclust() relies on to
# place the rectangles, so convert it first.
cah.hc <- as.hclust(cah.res)
rect.hclust(cah.hc, k = 7)
grp <- cutree(cah.hc, k = 7)

In [None]:
# Bivariate cluster plot of the hierarchical groups stored in `grp`.
clusplot(
  house_prices_nums_nona,
  grp,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0,
  main = ''
)

In [None]:
# Standardise the complete-case numeric data, run k-means with 7 centres and
# 10 random starts on the standardised values, and plot the clusters.
# Note: the plot receives the unscaled data frame as `data`.
house.scaled <- as.data.frame(scale(house_prices_nums_nona))
kmeans.res <- kmeans(house.scaled, centers = 7, nstart = 10)
fviz_cluster(kmeans.res, data = house_prices_nums_nona)

In [None]:
# PCA restricted to the complete-case rows, so that individuals line up with
# the k-means cluster assignments.
houses.pca <- PCA(X = house_prices_nums_nona)

In [None]:
# Biplot of the complete-case PCA with individuals coloured by their k-means
# cluster; only variables are labelled.
cluster_factor <- factor(kmeans.res$cluster)
fviz_pca_biplot(
  houses.pca,
  label = "var",
  col.ind = cluster_factor,
  gradient.cols = c("darkblue", "red", "purple"),
  repel = TRUE
)

In [None]:
#boxplot(house_prices~)

In [None]:
# Remove the MSSubClass column from the standardised data before modelling.
# Re-running this cell fails once the column is gone.
house.scaled <- subset(house.scaled, select = -MSSubClass)

In [None]:
# Linear model of SalePrice on every remaining standardised feature, with its
# summary table and a coefficient plot.
house.lm <- lm(SalePrice ~ ., data = house.scaled)
summary(house.lm)
coefplot(house.lm)

In [None]:
# Model diagnostics: Anova table (car) and the first lm diagnostic plot.
# A variance-inflation check is kept here, disabled:
# vif(house.lm)
Anova(house.lm)
plot(house.lm, which = 1)

# Surface features

Separate houses with and without 2nd floor

In [None]:
# Split the standardised data into houses with and without a second floor.
# The test is done on the unscaled data, where "no second floor" is exactly
# X2ndFlrSF == 0, and applied row by row to the scaled frame, so both frames
# must have the same rows in the same order.
stopifnot(nrow(house.scaled) == nrow(house_prices_nums_nona))
# A logical mask is used instead of `df[-which(cond), ]`: if no house matched,
# negative indexing with integer(0) would return zero rows rather than all.
no_second_floor <- !is.na(house_prices_nums_nona$X2ndFlrSF) &
  house_prices_nums_nona$X2ndFlrSF == 0
house.scaled.2ndFlr <- house.scaled[!no_second_floor, ]
house.scaled.no2ndFlr <- house.scaled[no_second_floor, ]

Second floor

In [None]:
# SalePrice explained by the surface features, for houses with a second
# floor. The model is fitted once and reused for the coefficient plot and
# the summary.
fit_2nd_floor <- lm(
  SalePrice ~ X2ndFlrSF + X1stFlrSF + TotalBsmtSF + GrLivArea,
  data = house.scaled.2ndFlr
)
coefplot(fit_2nd_floor)
summary(fit_2nd_floor)

In [None]:
# Pairwise scatter plots and correlations (psych) between SalePrice and the
# surface features, for houses with a second floor.
surface_vars_2nd <- c(
  "SalePrice", "X2ndFlrSF", "X1stFlrSF", "TotalBsmtSF", "GrLivArea"
)
pairs.panels(house.scaled.2ndFlr[, surface_vars_2nd])

No second floor

In [None]:
# SalePrice explained by the surface features, for houses without a second
# floor (X2ndFlrSF is left out of the model). The model is fitted once and
# reused; the pairwise plot covers the same variables.
fit_no_2nd_floor <- lm(
  SalePrice ~ X1stFlrSF + TotalBsmtSF + GrLivArea,
  data = house.scaled.no2ndFlr
)
coefplot(fit_no_2nd_floor)
summary(fit_no_2nd_floor)
surface_vars_no2nd <- c("SalePrice", "X1stFlrSF", "TotalBsmtSF", "GrLivArea")
pairs.panels(house.scaled.no2ndFlr[, surface_vars_no2nd])

In [None]:
# First lm diagnostic plot for the model fitted on houses without a second
# floor.
resid_fit_no_2nd_floor <- lm(
  SalePrice ~ X1stFlrSF + TotalBsmtSF + GrLivArea,
  data = house.scaled.no2ndFlr
)
plot(resid_fit_no_2nd_floor, which = 1)