In [None]:
import pandas as pd
import xgboost as xgb

%matplotlib notebook

In [None]:
# Render up to 300 columns when a DataFrame is displayed.
pd.options.display.max_columns = 300

In [None]:
# Read ./sources/train.csv into a DataFrame and show it as the cell output.
train_full = pd.read_csv(filepath_or_buffer='./sources/train.csv')
train_full

In [None]:
# Categorical columns whose missing values are replaced by the literal string
# 'None' (for several of them NaN carries meaning, e.g. "no pool").
# NOTE(review): for some entries (e.g. 'Electrical', 'MSZoning', 'Utilities')
# NaN may be genuinely missing data rather than "feature absent" — confirm
# against the dataset's data description.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# Replace NaN with 'None' in these columns.
# The filled block is assigned back to the frame instead of calling
# `train_full[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series (chained assignment), is deprecated, and leaves
# `train_full` untouched when pandas Copy-on-Write is active.
train_full[cols_fillna] = train_full[cols_fillna].fillna('None')

train_full

In [None]:
# Collect all categorical variables, i.e. the columns stored with object dtype.
train_cat = train_full.select_dtypes(include=['object'])
train_cat

Nous choisissons la feature "Neighborhood" en sus de la variable "Surface" <br />
On crée la variable "Surface":

In [None]:
# 'Surface' is the element-wise sum of the 'TotalBsmtSF' and 'GrLivArea' columns.
train_full['Surface'] = train_full['TotalBsmtSF'].add(train_full['GrLivArea'])

In [None]:
# Display the 'Surface' column as the cell output.
train_full['Surface']

On élimine les lignes avec des valeurs manquantes pour "Neighborhood"

In [None]:
# Keep only the rows that have a value in "Neighborhood", then show the result.
train_full = train_full[train_full['Neighborhood'].notna()]
train_full

In [None]:
# On plot le prix en fonction de la surface totale en colorant par "Neighborhood"

import seaborn as sns

In [None]:
# Scatter of 'SalePrice' against 'Surface', one colour per 'Neighborhood' value.
sns.scatterplot(data=train_full, x='Surface', y='SalePrice', hue='Neighborhood')

Suite au scatterplot, on choisit de supprimer les données dont la surface est supérieure ou égale à 7000

In [None]:
# Boolean mask of the rows whose 'Surface' is strictly below 7000; only those rows are kept.
kept_surface = train_full['Surface'].lt(7000)
train_full = train_full.loc[kept_surface]



Pour intégrer les variables catégorielles, on peut les "séparer" en autant de colonnes que de catégories par variable : chaque colonne contient `1` si la catégorie correspond et `0` sinon. <br />
On utilise pour cela la fonction `get_dummies` de Pandas.

In [None]:
# One-hot encode 'Neighborhood': the column is replaced by one indicator column per category.
train_full = pd.get_dummies(data=train_full, columns=['Neighborhood'])

In [None]:
# Display the current state of the frame as the cell output.
train_full

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible. The target array is the 'SalePrice' column.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_full,
    train_full.loc[:, 'SalePrice'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Model inputs: every column whose name starts with 'Neig' (the one-hot
# neighborhood indicators), followed by 'Surface'.
neighborhoods = [name for name in X_train.columns if name.startswith('Neig')]
feature_cols = neighborhoods + ['Surface']

# Same column selection for both splits; only the last expression is displayed.
X_train = X_train[feature_cols]
X_val = X_val[feature_cols]
X_val

On utilise Keras pour entrainer un modèle

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# A single Dense unit with no activation: a linear model on the input features.
model = Sequential([
    Dense(1, input_shape=X_train.shape[1:])
])


# Import the optimizer from the same `keras` package as Sequential/Dense so the
# model and its optimizer come from one implementation.
from keras.optimizers import SGD
loss = 'mse'
LEARNING_RATE = 0.01
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by recent Keras releases.
model.compile(loss=loss, optimizer=SGD(learning_rate=LEARNING_RATE))

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the validation features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Normalize Val set
X_val = scaler.transform(X_val)

BATCH_SIZE = X_train.shape[0]  # one batch = the whole training set
EPOCHS = 200  # number of passes over the whole training set
history = model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Predictions on the (scaled) training features, used for the training metrics.
Y_predict = model.predict(X_train)

In [None]:
# Training history returned by `model.fit`, as a DataFrame.
hist = pd.DataFrame(history.history)

# Loss curve: recorded 'loss' values against the DataFrame index.
sns.lineplot(data=hist, x=hist.index, y='loss')

import numpy as np
from sklearn import metrics

# Error of the predictions on the training set.
rmsle_train = np.sqrt(metrics.mean_squared_log_error(Y_train, Y_predict))
print('RMSLE : ', rmsle_train)
mae_train = metrics.mean_absolute_error(Y_train, Y_predict)
print('MAE : ', mae_train)

On validation set

In [None]:
# Predictions of the fitted model on the validation features.
Y_predict_val = model.predict(x=X_val)

In [None]:
# Error of the predictions on the validation set.
rmsle_val = np.sqrt(metrics.mean_squared_log_error(Y_val, Y_predict_val))
print('RMSLE : ', rmsle_val)
mae_val = metrics.mean_absolute_error(Y_val, Y_predict_val)
print('MAE : ', mae_val)

XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted regressor with the library's default hyper-parameters,
# fitted on the training features and targets.
xgbReg_model = xgb.XGBRegressor()
xgbReg_model.fit(X=X_train, y=Y_train)

In [None]:
# Predict with the fitted XGBoost model on both splits.
Y_predict = xgbReg_model.predict(X_train)
Y_predict_val = xgbReg_model.predict(X_val)

# Report the same two metrics for each split: training first, then validation.
for y_true, y_pred in ((Y_train, Y_predict), (Y_val, Y_predict_val)):
    print('RMSLE : ', np.sqrt(metrics.mean_squared_log_error(y_true, y_pred)))
    print('MAE : ', metrics.mean_absolute_error(y_true, y_pred))

With all features

In [None]:
# Partition the columns by dtype: object columns vs. everything else.
train_cat = train_full.select_dtypes(include=['object'])
train_num = train_full.select_dtypes(exclude=['object'])

# Shapes of the non-object part, the object part and the full frame.
print(train_num.shape, train_cat.shape, train_full.shape)

In [None]:
# Drop every row that contains at least one missing value and show the resulting shape.
train_full_drop = train_full.dropna(how='any')
train_full_drop.shape
