## I - Data Reading

In [1]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

## II - Exploratory Data Analysis

Steps:
- Distribution of variables (features and labels): outliers, imbalancement?
- Missing values
- Correlation
- Categorical variables: ordinal, nominal, many distinct values, method of encoding ?
- Numerical variables: agregation between variables (sum, mean, percentage ?)

In [None]:
print(train.shape) #1460 observations, 81 variables: features + label = SalePrice
print(test.shape) #1459 observations, 80 variables features

In [None]:
#import pandas as pd
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)


def analyse_data(data):
    
    display(data.head(n=5))
    
    liste_colonnes = data.columns
    print("Dimensions:", data.shape)
    print('\n')
    
    cat_non_numerical = []
    for i, col in enumerate(liste_colonnes) :
        print("Colonne n°{}: {}".format(i, col))
        print('-'*30)
        print("Type de la colonne:", data[col].dtype)
        print("Nb de valeurs uniques:", len(data[col].unique()))
        print("Comptage des valeurs:")
        print(data[col].value_counts(dropna=False))
        percentage_NaN = data[col].isna().sum()/len(data)*100
        print("Pourcentage de NaN: {}%".format(percentage_NaN))

        print('\n')
        print('*'*50)
        print('\n')
        
        if data[col].dtype == 'object':
            cat_non_numerical.append(col)
    
    #return cols_a_garder, pourcentage_cols_a_garder
    return cat_non_numerical

In [None]:
cat_non_numerical_train = analyse_data(train_data)

In [None]:
cat_non_numerical_test = analyse_data(test_data)

In [None]:
print(cat_non_numerical)

In [None]:
train_data['MoSold'].value_counts()

In [None]:
- MSSubClass: nominal variable
- MSZoning: nominal variable
- LotArea: 215k outlier ?
- Street: nominal variable, Enlever la variable ?
- Alley: nominal variable, Remplacer NaN par une valeur ? 
- LotShape: nominal variable
- LandContour: nominal variable
- Utilities: nominal variable, suppression de la variable ?
- LotConfig: nominal variable
- LandSlope: ordinal variable
- Neighborhood: nominal variable
- Condition1: nominal variable
- Condition2: nominal variable, déséquilibre distribution: suppression variable ? 
- BldgType: nominal variable
- HouseStyle: nominal variable     
- OverallQual: ordinal variable
- OverallCond:  ordinal variable
    regarder corrélation avec chi2, pearson avec OverallQual
YearBuilt: transformer en durée
YearRemodAdd: transformer en durée, corrélation avec YearBuilt
RoofStyle: nominal variable
RoofMatl: nominal variable
Exterior1st: nominal variable
Exterior2nd: nominal variable
MasVnrType: nominal variablen, Presence de None et de Nan, est-ce que la même chose ou c'est différent ?
MasVnrArea: numérique, Présence de NaN
ExterQual: ordinal variable
ExterCond:  ordinal variable
Foundation:  nominal variable
BsmtQual: ordinal variable, NaN
BsmtCond: ordinal variable, NaN
BsmtExposure: ordinal variable, NaN
BsmtFinType1: ordinal variable, NaN
BsmtFinSF1: continuous,  énormément de 0, distribution déséquilibrée
BsmtFinType2:  ordinal variable, NaN
BsmtFinSF2: continuous, énormément de 0, distribution déséquilibrée
BsmtUnfSF: continuous
TotalBsmtSF: continuous, à mettre ensemble avec BsmtUnfSF en pourcentage ?
Heating: nominal variable, distribution déséquilibrée
HeatingQC: ordinal variable
CentralAir: binary variable string, distribution déséquilibrée, label encoding ?
Electrical: ordinal or nominal variable ?, 1 seul NaN,  distribution déséquilibrée, label encoding
1stFlrSF: continuous
2ndFlrSF: continuous, distribution déséquilibrée
LowQualFinSF: continuous, distribution déséquilibrée, faire somme avec 1stFlrSF et 2ndFlrSF
GrLivArea: continuous, on peut ptet l'utiliser avec 1stFlrSF et 2ndFlrSF
BsmtFullBath: numérique
BsmtHalfBath: numérique, distribution déséquilibrée
FullBath: numérique
HalfBath: numérique
Bedroom: numerical variable
Kitchen: numerical variable
KitchenQual: ordinal variable
TotRmsAbvGrd: numerical variable
Functional: nominal variable
Fireplaces: numerical variable
FireplaceQu: ordinal variable, NaN   = pas de cheminée
GarageType:  nominal variable, NaN = No Garage
GarageYrBlt: transformer en durée, il y a des NaN
GarageFinish: nominal variable, NaN = no garage
GarageCars: continuous
GarageArea: continuous
GarageQual: ordinal, NaN, distribution déséquilibrée
GarageCond: ordinal, NaN, distribution déséquilibrée
PavedDrive: nominal,  distribution déséquilibrée
WoodDeckSF: continuous
OpenPorchSF: continuous,distribution déséquilibrée
EnclosedPorch : continuous,distribution déséquilibrée   
3SsnPorch: continuous, distribution déséquilibrée   
ScreenPorch: continuous, distribution déséquilibrée 
PoolArea:continuous, distribution déséquilibrée 
PoolQC: ordinal, bcp de NaN = No Pool
Fence: ordinal, bcp de NaN = No Fence
MiscFeature: nominal, bcp de NaN = None
MiscVal: continuous, bcp de 0
MoSold: nominal or ordinal (mois) à transformer en durée ?
YrSold: nominal or ordinal(année) à transformer en durée ?
SaleType: nominal, distribution déséquilibrée  
SaleCondition: nominal, distribution déséquilibrée 

In [None]:
#from pandas_profiling import ProfileReport

## III - Features Engineering

### 1) Ordinal Variables

In [None]:
#train['ExterQual'].value_counts()

In [None]:
#Label Encoding for ordinal variables
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#OrdinalFeature = 'ExterQual'#
#train[OrdinalFeature+ 'Encoded'] = le.fit_transform(train['ExterQual'])
#train2 = train.drop(OrdinalFeature, axis=1)

In [None]:
#le.classes_
#How LabelEncoder choose the order to encode the distinct values ?
#In the case of strings, LabelEncoder sorts alphabetically the distinct values and then it assign values from 0 to N

#With Label Encoding we have this following order: 
#['Ex', 'Fa', 'Gd', 'TA'] = [0,1,2,3]
#The correct order according to the data_description is:
#       Ex	Excellent
#       Gd	Good
#       TA	Average/Typical
#       Fa	Fair
#       Po	Poor
#so we must have Po < Fa < TA < Gd < Ex => ['Po', 'Fa', 'TA', 'Gd', 'Ex']  = [0,1,2,3,4]
#But Po doesn't exist in our data, so we have : 
#['Fa', 'TA', 'Gd', 'Ex'] = [0,1,2,3]

#Should we use the order of LabelEncoder or the order of the data description ?
#There is a difference for the algorithm ?
#If yes, so we have to create our own LabelEncoder for using the correct order.

In [2]:
train = train_data.copy()
test = test_data.copy()

In [None]:
print(train['ExterQual'].value_counts())
print(test['ExterQual'].value_counts())
print(train['ExterCond'].value_counts())
print(test['ExterCond'].value_counts())

In [None]:
from sklearn.preprocessing import OrdinalEncoder
#Same thing like LabelEncoder but we can apply for many features at the same time
#But it works only with at least 2 features
#We choose our own order by adding the argument categories and write all the categories in a list for each feature
OrdinalFeatures = ['ExterQual','ExterCond']
enc = OrdinalEncoder(dtype = 'int32', categories = [['Fa', 'TA', 'Gd', 'Ex'],['Po','Fa', 'TA', 'Gd', 'Ex']])
train[OrdinalFeatures] = enc.fit_transform(train[OrdinalFeatures])
test[OrdinalFeatures] = enc.transform(test[OrdinalFeatures])

In [None]:
#With automatic order
enc2 = OrdinalEncoder(dtype = 'int32')
train[OrdinalFeatures] = enc2.fit_transform(train[OrdinalFeatures])
test[OrdinalFeatures] = enc2.transform(test[OrdinalFeatures])

In [None]:
train_data[OrdinalFeatures].head(n=5)

In [None]:
train[OrdinalFeatures].head(n=5)

In [None]:
test[OrdinalFeatures].head(n=5)

## Algorithm ML

In [None]:
#X =  train.drop('SalePrice', axis = 1)
X = train[OrdinalFeatures]
y = train['SalePrice']

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_val_score

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)

scores = cross_val_score(clf, X, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
scores

In [None]:
clf.predict()

## Pipeline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder

OrdinalFeatures = ['ExterQual','ExterCond']
enc = OrdinalEncoder(dtype = 'int32', categories = [['Fa', 'TA', 'Gd', 'Ex'],['Po','Fa', 'TA', 'Gd', 'Ex']])
clf = tree.DecisionTreeClassifier()
estimators = [('enc ', enc), ('clf', clf )]
pipe = Pipeline(estimators)
train[OrdinalFeatures] = pipe.fit_transform(train[OrdinalFeatures])
test[OrdinalFeatures] = pipe.transform(test[OrdinalFeatures])

TypeError: Singleton array array(None, dtype=object) cannot be considered a valid collection.