In [2]:
library(corrplot)
library(PerformanceAnalytics)
library(ggplot2)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(lattice)
library(cluster)
library(visreg)
library(car)
library(tidyr)
library(caret)
library(forcats) # fct_recode function
library(repr)    # for figure size
library(randomForest)

corrplot 0.84 loaded
Loading required package: xts
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 

Attaching package: ‘PerformanceAnalytics’

The following object is masked from ‘package:graphics’:

    legend

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Attaching package: ‘dplyr’

The following objects are masked from ‘package:xts’:

    first, last

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: carData

Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    

In [3]:
is.Date <- function(x) inherits(x, 'Date')

In [4]:
# library(summarytools)

# Traitement des features

## Variables pas vraiment numériques -> conversion en date

- YearBuilt
- YearRemodAdd
- GarageYrBlt
- MoSold
- YrSold

In [5]:
function date_conversion(dataframe){
    dataframe$YearBuilt <- as.Date(
        ISOdate(dataframe[, 'YearBuilt'], 01, 01), format = "%Y")
    dataframe$YearRemodAdd <- as.Date(
        ISOdate(dataframe[, 'YearRemodAdd'], 01, 01), format = "%Y")
    dataframe$GarageYrBlt <- as.Date(
        ISOdate(dataframe[, 'GarageYrBlt'], 01, 01), format = "%Y")
    dataframe$YrSold <- as.Date(
        ISOdate(dataframe[, 'YrSold'], dataframe[, 'MoSold'], 01), format = "%Y")

    dataframe = select(dataframe, -MoSold)
}

ERROR: Error in parse(text = x, srcfile = src): <text>:1:10: unexpected symbol
1: function date_conversion
             ^


## Variable non quantitative -> conversion en qualitative

In [None]:
trainfull$MSSubClass <- factor(trainfull$MSSubClass)

## Variables ordinales transformables :

|    Feature       |       Levels      | Score Possibilty  |
|:-----------------|:-----------------:|:-----------------:|
|LotShape                    |irregular -> regular        |1 to 4|
|LandContour                 |depression -> flat           |1 to 4|
|Utilities                   |electricity only -> all      |1 to 4|
|LandSlope                   |sever -> gentle              |1 to 3|
|ExterQual                   |poor -> excellent            |1 to 5|
|ExterCond                   |poor -> excellent            |1 to 5|
|BsmtQual                    |poor -> excellent            |1 to 5|
|BsmtCond                    |poor -> excellent            |1 to 5|
|BsmtExposure                |no -> good                   |1 to 4|
|BsmtFinType1                |unfinished -> good           |1 to 6|
|BsmtFinType2                |unfinished -> good           |1 to 6|    
|HeatingQC                   |poor -> excellent            |1 to 5|
|CentralAir                  |no / yes                     |0 / 1 |
|Electrical                  |poor -> standard             |1 to 5|
|KitchenQual                 |poor -> excellent            |1 to 5|
|Functional                  |salvage -> typical           |1 to 8|
|FireplaceQu                 |poor -> excellent            |1 to 5|
|GarageFinish                |unfinished -> finshed        |1 to 3|
|GarageQual                  |poor -> excellent            |1 to 5|
|GarageCond                  |poor -> excellent            |1 to 5|
|PavedDrive                  |dirt -> paved                |1 to 3|
|PoolQC                      |fair -> excellent            |1 to 4|
|Fence                       |none -> good                 |1 to 5|

In [None]:
function quantitative_conversion(dataframe){
    dataframe$LotShape <- dataframe$LotShape %>%
        fct_recode('1' = 'IR3', '2' = 'IR2', '3' = 'IR1', '4' = 'Reg')
    dataframe$LotShape <- as.numeric(as.character(dataframe$LotShape))
    
    dataframe$LandContour <- dataframe$LandContour %>%
        fct_recode('1' = 'Low', '2' = 'HLS', '3' = 'Bnk', '4' = 'Lvl')
    dataframe$LandContour <- as.numeric(as.character(dataframe$LandContour))
    
    dataframe$Utilities <- dataframe$Utilities %>%
        fct_recode('1' = 'ELO', '2' = 'NoSeWa', '3' = 'NoSewr', '4' = 'AllPub')
    dataframe$Utilities <- as.numeric(as.character(dataframe$Utilities))
    
    dataframe$LandSlope <- dataframe$LandSlope %>%
        fct_recode('1' = 'Sev', '2' = 'Mod', '3' = 'Gtl')
    dataframe$LandSlope <- as.numeric(as.character(dataframe$LandSlope))
    
    dataframe$ExterQual <- dataframe$ExterQual %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$ExterQual <- as.numeric(as.character(dataframe$ExterQual))
    
    dataframe$ExterCond <- dataframe$ExterCond %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$ExterCond <- as.numeric(as.character(dataframe$ExterCond))

    dataframe$BsmtQual <- dataframe$BsmtQual %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$BsmtQual <- as.numeric(as.character(dataframe$BsmtQual))

    dataframe$BsmtCond  <- dataframe$BsmtCond  %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$BsmtCond  <- as.numeric(as.character(dataframe$BsmtCond))

    dataframe$BsmtExposure  <- dataframe$BsmtExposure  %>%
        fct_recode('1' = 'No', '2' = 'Mn', '3' = 'Av', '4' = 'Gd')
    dataframe$BsmtExposure  <- as.numeric(as.character(dataframe$BsmtExposure))

    dataframe$BsmtFinType1  <- dataframe$BsmtFinType1  %>%
        fct_recode('1' = 'Unf', '2' = 'LwQ', '3' = 'Rec',
                   '4' = 'BLQ', '5' = 'ALQ', '6' = 'GLQ')
    dataframe$BsmtFinType1  <- as.numeric(as.character(dataframe$BsmtFinType1))

    dataframe$BsmtFinType2  <- dataframe$BsmtFinType2  %>%
        fct_recode('1' = 'Unf', '2' = 'LwQ', '3' = 'Rec',
                   '4' = 'BLQ', '5' = 'ALQ', '6' = 'GLQ')
    dataframe$BsmtFinType2  <- as.numeric(as.character(dataframe$BsmtFinType2))

    dataframe$HeatingQC  <- dataframe$HeatingQC  %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$HeatingQC  <- as.numeric(as.character(dataframe$HeatingQC))

    dataframe$CentralAir  <- dataframe$CentralAir  %>%
        fct_recode('0' = 'N', '1' = 'Y')
    dataframe$CentralAir  <- as.numeric(as.character(dataframe$CentralAir))

    dataframe$Electrical  <- dataframe$Electrical  %>%
        fct_recode('1' = 'Mix', '2' = 'FuseP', '3' = 'FuseF', '4' = 'FuseA', '5' = 'SBrkr')
    dataframe$Electrical  <- as.numeric(as.character(dataframe$Electrical))

    dataframe$KitchenQual  <- dataframe$KitchenQual  %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$KitchenQual  <- as.numeric(as.character(dataframe$KitchenQual))

    dataframe$Functional <- dataframe$Functional  %>%
        fct_recode('1' = 'Sal', '2' = 'Sev', '3' = 'Maj2', '4' = 'Maj1',
                   '5' = 'Mod', '6' = 'Min2', '7' = 'Min1', '8' = 'Typ')
    dataframe$Functional <- as.numeric(as.character(dataframe$Functional))

    #dataframe$FireplaceQu  <- dataframe$FireplaceQu  %>%
    #    fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    #dataframe$FireplaceQu  <- as.numeric(as.character(dataframe$FireplaceQu))

    #dataframe$GarageFinish  <- dataframe$GarageFinish  %>%
    #    fct_recode('1' = 'Unf', '2' = 'RFn', '3' = 'Fin')
    #dataframe$GarageFinish  <- as.numeric(as.character(dataframe$GarageFinish))

    dataframe$GarageQual  <- dataframe$GarageQual  %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$GarageQual  <- as.numeric(as.character(dataframe$GarageQual))

    dataframe$GarageCond  <- dataframe$GarageCond  %>%
        fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    dataframe$GarageCond  <- as.numeric(as.character(dataframe$GarageCond))

    dataframe$PavedDrive  <- dataframe$PavedDrive  %>%
        fct_recode('1' = 'N', '2' = 'P', '3' = 'Y')
    dataframe$PavedDrive  <- as.numeric(as.character(dataframe$PavedDrive))

    #dataframe$PoolQC  <- dataframe$PoolQC  %>%
    #    fct_recode('1' = 'Po', '2' = 'Fa', '3' = 'TA', '4' = 'Gd', '5' = 'Ex')
    #dataframe$PoolQC  <- as.numeric(as.character(dataframe$PoolQC))

    #dataframe$Fence  <- dataframe$Fence  %>%
    #    fct_recode('1' = 'MnWw', '2' = 'GdWo', '3' = 'MnPrv', '4' = 'GdPrv')
    #dataframe$Fence  <- as.numeric(as.character(dataframe$Fence))
}

## Modalités avec peu de données

In [None]:
nzv <- nearZeroVar(trainfull, saveMetrics = TRUE)
nzv[which(nzv$nzv == TRUE | nzv$zeroVar == TRUE),]

## Features avec beaucoup de 0 (3rd quartile = 0)

- BsmtFinSF2
- LowQualFinSF
- EnclosedPorch
- X3SsnPorch
- ScreenPorch
- PoolArea
- MiscVal

other nearzero alerts

- LandContour
- LandSlope
- BsmtCond
- BsmtFinType2
- KitchenAbvGr
- Functional
- GarageQual
- GarageCond

In [None]:
function many_zeros_delete(dataframe){
    dataframe <- select(dataframe,
                    - BsmtFinSF2,
                    - LowQualFinSF,
                    - EnclosedPorch,
                    - X3SsnPorch,
                    - ScreenPorch,
                    - PoolArea,
                    - MiscVal,
                    - LandContour,
                    - LandSlope,
                    - BsmtCond,
                    - BsmtFinType2,
                    - KitchenAbvGr,
                    - Functional,
                    - GarageQual,
                    - GarageCond
                   )
}

## Détails 


|    Feature       |       Modalité    |     Possibilty         |
|:-----------------|:-----------------:|:-----------------------|
|Street            |gravel = 6         |-> supprimer feature    |
|Utilities         |Mode = 1459        |-> supprimer feature    |
|LotConfig         |3frontages = 4     |-> supprimer lignes     |
|Condition2        |mode = 1445/1460   |-> supprimer feature    |
|RoofMatl          |Mode = 1434/1460   |-> RoofMatl 0/1         |
|Foundation        |Wood= 3, stone= 6  |-> supprimer lignes     |
|Heating           |Mode = 1428/1460   |-> convertir en 0/1     |
|Electrical        |Mix=1, fuseP=1     |-> supprimer lignes (finalement conservé car converti en numérique)    |
|GarageType        |2Types=6, carport=9|-> supprimer lignes     |
|MiscFeature       |Mode = 1406/1460   |-> convertir en 0/1     |
|SaleCondition     |AdjLand=4          |-> supprimer lignes     |




In [None]:
# trainfull <- select(trainfull, -Street, -Utilities, -Condition2, -MiscFeature)
trainfull <- select(trainfull, -Street, -Utilities, -Condition2)

In [None]:
#trainfull <- trainfull[-which(trainfull$LotConfig == 'FR3' |
#      trainfull$Foundation == 'Wood' | trainfull$Foundation == 'Stone' |
#      trainfull$GarageType == '2Types' | trainfull$GarageType == 'carport' |
#      trainfull$SaleCondition == 'AdjLand'),]

In [None]:
trainfull$Heating  <- trainfull$Heating  %>%
    fct_recode('0' = 'GasA',
               '1' = 'Floor', '1' = 'GasW', '1' = 'Grav',
               '1' = 'OthW', '1' = 'Wall')
trainfull$Heating  <- as.numeric(as.character(trainfull$Heating))

In [None]:
trainfull$RoofMatl  <- trainfull$RoofMatl  %>%
    fct_recode('0' = 'CompShg',
               '1' = 'ClyTile', '1' = 'Membran', '1' = 'Metal',
               '1' = 'Roll', '1' = 'Tar&Grv', '1' = 'WdShake', '1' = 'WdShngl')
trainfull$RoofMatl  <- as.numeric(as.character(trainfull$RoofMatl))

In [None]:
levels(trainfull$MiscFeature) <- c(levels(trainfull$MiscFeature), 'None')
trainfull$MiscFeature[which(is.na(trainfull$MiscFeature))] <- 'None'
trainfull$MiscFeature <- trainfull$MiscFeature  %>%
    fct_recode('0' = 'None',
               '1' = 'Elev', '1' = 'Gar2', '1' = 'Othr',
               '1' = 'Shed', '1' = 'TenC')
trainfull$MiscFeature <- as.numeric(as.character(trainfull$MiscFeature))

In [None]:
for (i in 1:ncol(trainfull)){
    na_number <- length(which(is.na(trainfull[, i])))
    if (na_number > 0){
        cat(na_number, colnames(select(trainfull, i)),'\n')
    }
}

## Données manquantes

|NA's  |Feature      | Possibility |
|-----:|:------------|:------------|
|  257 |LotFrontage  |-> supprimer lignes
| 1350 |Alley        |-> Créer modalité None
|    8 |MasVnrType   |-> supprimer
|    8 |MasVnrArea   |-> supprimer
|   37 |BsmtQual     |-> supprimer
|   36 |BsmtCond     |-> supprimer
|   37 |BsmtExposure |-> supprimer
|   36 |BsmtFinType1 |-> supprimer
|   37 |BsmtFinType2 |-> supprimer
|    1 |Electrical   |-> supprimer
|  673 |FireplaceQu  |-> Créer modalité None
|   78 |GarageType   |-> supprimer feature si corrélée à une autre
|   78 |GarageYrBlt  |-> mettre la valeur de YearBuilt
|   78 |GarageFinish |-> Créer modalité None
| 1430 |PoolQC       |-> Créer modalité None
| 1161 |Fence        |-> Créer modalité None

In [None]:
function missing_data_to_none(dataframe){
    levels(trainfull$Alley) <- c(levels(trainfull$Alley), 'None')
    trainfull$Alley[which(is.na(trainfull$Alley))] <- 'None'

    levels(trainfull$FireplaceQu) <- c(levels(trainfull$FireplaceQu), 'None')
    trainfull$FireplaceQu[which(is.na(trainfull$FireplaceQu))] <- 'None'

    levels(trainfull$GarageType) <- c(levels(trainfull$GarageType), 'None')
    trainfull$GarageType[which(is.na(trainfull$GarageType))] <- 'None'

    levels(trainfull$GarageFinish) <- c(levels(trainfull$GarageFinish), 'None')
    trainfull$GarageFinish[which(is.na(trainfull$GarageFinish))] <- 'None'

    levels(trainfull$PoolQC) <- c(levels(trainfull$PoolQC), 'None')
    trainfull$PoolQC[which(is.na(trainfull$PoolQC))] <- 'None'

    levels(trainfull$Fence) <- c(levels(trainfull$Fence), 'None')
    trainfull$Fence[which(is.na(trainfull$Fence))] <- 'None'

    #trainfull <- trainfull[which(!is.na(trainfull$LotFrontage)),]
    #trainfull <- trainfull[which(!is.na(trainfull$MasVnrType)),]
    #trainfull <- trainfull[which(!is.na(trainfull$MasVnrArea)),]
    #trainfull <- trainfull[which(!is.na(trainfull$BsmtQual)),]
    #trainfull <- trainfull[which(!is.na(trainfull$BsmtExposure)),]
    #trainfull <- trainfull[which(!is.na(trainfull$BsmtFinType1)),]
    #trainfull <- trainfull[which(!is.na(trainfull$Electrical)),]

    NA_lines = which(is.na(trainfull$GarageYrBlt))
    trainfull$GarageYrBlt[NA_lines] <- trainfull$YearBuilt[NA_lines]
    
    trainfull$MasVnrType[which(is.na(trainfull$MasVnrType))] <- 'None'
    
    
}

## Remarques :

Features avec queue de distribution -> possibilité passer en log (si pas de 0)

- BsmtFinSF1
- BsmtUNFSF
- GrLivArea
- GarageArea
- LotArea
- LotFrontage
- MasVnrArea
- OpenPorchSF
- SalePrice
- TotalBsmtSF
- WoodDeckSF
- X1stFlrSF
- X2ndFlrSF

"""
trainfull <- trainfull %>%
    #mutate(BsmtFinSF1=log(BsmtFinSF1)) %>%
    #mutate(BsmtUNFSF=log(BsmtUNFSF)) %>%
    mutate_if(GrLivArea=log(GrLivArea)) %>%
    mutate(GarageArea=log(GarageArea)) %>%
    mutate(LotArea=log(LotArea)) %>%
    mutate(LotFrontage=log(LotFrontage)) %>%
    mutate(MasVnrArea=log(MasVnrArea)) %>%
    mutate(OpenPorchSF=log(OpenPorchSF)) %>%
    mutate(SalePrice=log(SalePrice)) %>%
    mutate(TotalBsmtSF=log(TotalBsmtSF)) %>%
    mutate(WoodDeckSF=log(WoodDeckSF)) %>%
    mutate(X1stFlrSF=log(X1stFlrSF)) %>%
    mutate(X2ndFlrSF=log(X2ndFlrSF))
"""

Beaucoup de valeur à 0 qui semblent anormale -> mettre en NA / supprimer lignes / créer nouvelle feature ?
- BsmtFinSF1
- BsmtFinSF2
- MasVnrArea
- OpenPorchSF
- WoodDeckSF
- X2ndFlrSF