## Identify 10 Most Important Features

In [None]:
source('../src/util.r')
source('../src/multiplot.r')

In [None]:
# Load the Ames housing data into a data frame used by the rest of the notebook.
# NOTE(review): loadCleanAmesData() is not defined in this notebook; presumably
# it comes from one of the sourced ../src files — confirm.
ames_housing_df <- loadCleanAmesData()

In [None]:
# Keep every column except the prediction target, SalePrice.
features_df <- ames_housing_df[, !(colnames(ames_housing_df) %in% "SalePrice")]

In [None]:
# Show the (rows, columns) shape of the feature table.
dim(features_df)

In [None]:
# Preview the first rows of the feature table.
head(features_df)

#### Let's count the number of 0s in the columns. 

In [None]:
# Count, per column, how many entries are exactly 0. na.rm = TRUE keeps a
# single missing value from turning a column's whole count into NA.
zero_sums <- colSums(features_df == 0, na.rm = TRUE)
# Report only the columns that contain at least one zero.
zero_sums <- zero_sums[zero_sums > 0]
#zero_sums[zero_sums < 1460 * 0.50]
zero_sums

#### Zeros in these columns indicate that the feature is not available in those homes. If a feature is not available in half or more of the homes, we assume it is unlikely to affect the SalePrice prediction and remove it from consideration.

In [None]:
# Keep only the columns in which fewer than half of the rows are 0.
# The row count is taken from the data instead of a hard-coded 1460, missing
# values are ignored when counting zeros, and drop = FALSE keeps the result a
# data frame even if only one column survives the filter.
features_df <- features_df[
  ,
  colSums(features_df == 0, na.rm = TRUE) < nrow(features_df) * 0.50,
  drop = FALSE
]

In [None]:
# Show the shape of the feature table after the zero-count filter.
dim(features_df)

#### Since PoolArea was judged irrelevant above, pool quality (PoolQC) is irrelevant for our prediction as well and can be removed.

In [None]:
# Drop the pool-quality column (PoolQC) from the feature table.
features_df <- features_df[, !(colnames(features_df) %in% "PoolQC")]

In [None]:
# Show the shape of the feature table after removing PoolQC.
dim(features_df)

#### Split numerical and categorical features

In [None]:
# Partition the columns by storage type: numeric columns go to
# numerical_features, factor columns to categorical_features. Columns that are
# neither numeric nor factor (e.g. character) end up in neither table.
numerical_features <- features_df[vapply(features_df, is.numeric, logical(1))]
categorical_features <- features_df[vapply(features_df, is.factor, logical(1))]

In [None]:
# Show the shape of each partition: numeric columns first, then factor columns.
dim(numerical_features)
dim(categorical_features)

#### Log and scale the numerical features

In [None]:
# Log-transform every numeric feature as log(1 + x). log1p() is the dedicated
# base R function for this and stays accurate for values close to 0.
numerical_features <- log1p(numerical_features)

In [None]:
# Centre each column on its mean and divide by its standard deviation, then
# turn the matrix returned by scale() back into a data frame.
numerical_features <- data.frame(
  scale(numerical_features, center = TRUE, scale = TRUE)
)

#### One-hot encode the categorical features

In [None]:
# Install onehot only when it is not already available, so re-running the
# notebook does not download and reinstall the package every time.
if (!requireNamespace("onehot", quietly = TRUE)) {
  install.packages("onehot")
}

In [None]:
library("onehot")

In [None]:
# Build a one-hot encoder from the factor columns, then apply it to the same
# columns to obtain the encoded table.
# max_levels = 25 caps the number of levels per factor; see the onehot
# documentation for how factors above the cap are treated.
one_hot_encoding <- onehot(
  categorical_features,
  stringsAsFactors = FALSE,
  addNA = FALSE,
  max_levels = 25
)
encoded_categorical_features <- predict(one_hot_encoding, categorical_features)

#### Join the numerical features and one hot encoded categorical features

In [None]:
# Label the rows of both tables with the Id column of the source data so the
# two tables can be joined on their row names.
rownames(numerical_features) <- ames_housing_df[["Id"]]
rownames(encoded_categorical_features) <- ames_housing_df[["Id"]]

In [None]:
# Inner-join the numeric and one-hot encoded tables on their row names (by = 0).
# By default merge() sorts the result by the join key, and row names are
# compared as character strings ("1", "10", "100", ...), which scrambles the
# original row order. Join without sorting, then put the rows back in the order
# of numerical_features so later positional assignments stay aligned with the
# source data.
all_features <- merge(numerical_features, encoded_categorical_features,
                      by = 0, all = FALSE, sort = FALSE)
all_features <- all_features[
  na.omit(match(rownames(numerical_features),
                as.character(all_features$Row.names))),
  ,
  drop = FALSE
]
# Reset the row labels left over from the reordering to 1..n.
rownames(all_features) <- NULL

In [None]:
# Drop the Row.names and Id columns from the joined table.
all_features[["Row.names"]] <- NULL
all_features[["Id"]] <- NULL

In [None]:
# Show the shape of the combined feature table.
dim(all_features)

In [None]:
# Preview the first rows of the combined feature table.
head(all_features)

#### Let's find the correlation between each feature and the target, and choose the features with the highest correlation to the target, SalePrice

In [None]:
# Copy the feature table and append the target as a SalePrice column.
# The assignment is positional: it requires the rows of all_features to be in
# the same order as ames_housing_df — verify after any merge/sort step.
all_features_target <- all_features
all_features_target[["SalePrice"]] <- ames_housing_df[["SalePrice"]]

In [None]:
# Preview the table that was just built (features plus SalePrice); the plain
# feature table has already been previewed earlier.
head(all_features_target)

In [None]:
# Install mlbench only when it is not already available, so re-running the
# notebook does not download and reinstall the package every time.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}

In [None]:
# Install caret only when it is not already available, so re-running the
# notebook does not download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}

In [None]:
# Fix the random seed so any randomised step that follows is reproducible.
set.seed(7)
# Attach the mlbench and caret packages.
library(mlbench)
library(caret)

In [None]:
#install.packages("corrplot")

In [None]:
#library("Hmisc")
#library("corrplot")

In [None]:
#corr_result <- rcorr(as.matrix(all_features))
#head(corr_result)

**corr_result contains correlation coefficients and correlation p-values**

#### Let's plot the significant correlations (p ≤ 0.5) and leave the insignificant correlations (p > 0.5) blank

In [None]:
# Plot the upper triangle of the correlation matrix, ordered by hierarchical
# clustering; insignificant correlations (p-value above sig.level) are left
# blank.
# The cells that attach corrplot and build corr_result are commented out
# earlier in this notebook, so this cell builds corr_result itself when it is
# missing and calls corrplot through its namespace.
if (!exists("corr_result")) {
  corr_result <- local({
    feature_matrix <- as.matrix(all_features)
    n_obs <- nrow(feature_matrix)
    corr_r <- cor(feature_matrix)
    # Two-sided p-value of each Pearson correlation from the t distribution
    # with n - 2 degrees of freedom; pmax() guards against tiny negative
    # values of 1 - r^2 caused by rounding.
    t_stat <- corr_r * sqrt((n_obs - 2) / pmax(1 - corr_r^2, 0))
    list(
      r = corr_r,
      P = 2 * pt(abs(t_stat), df = n_obs - 2, lower.tail = FALSE)
    )
  })
}
corrplot::corrplot(corr_result$r, type = "upper", order = "hclust",
                   p.mat = corr_result$P, sig.level = 0.50, insig = "blank")