In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


Chargement train dataset

In [None]:
# Load the raw training set from the local Data directory.
train_csv_path = './Data/train.csv'
train_full = pd.read_csv(train_csv_path)

Enlèvement de la colonne Id

In [None]:
# Drop the Id column before feature selection.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
train_full = train_full.drop(columns="Id", errors="ignore")

Sélection des colonnes de features quantitatives

In [None]:
# Keep only the columns stored as 64-bit floats or 64-bit integers.
numeric_dtypes = ['float64', 'int64']
quant = train_full.select_dtypes(include=numeric_dtypes).columns
print(f"Features Quanti:\n {quant}")

In [None]:
# Experiment note: dropping ["TotRmsAbvGrd", "GarageCars", "TotalBsmtSF", "GarageYrBlt"]
# from `quant` gave worse predictions, so all quantitative columns are kept.
train_full_quant = train_full.loc[:, quant]

Vérification puis remplacement des NA

In [None]:
# Count the missing values in each quantitative column.
na_per_column = train_full_quant.isna().sum()
print(f"Nombre de NAs dans train_full_quant: \n {na_per_column}")

In [None]:
# Replace every missing value with 0.
train_full_quant = train_full_quant.fillna(value=0)

In [None]:
# Re-count missing values to confirm none remain after the fill.
remaining_na = train_full_quant.isna().sum()
print(f"Nombre de NAs dans train_full_quant: \n {remaining_na}")

Split du train_full dataset en train et val

In [None]:
from sklearn.model_selection import train_test_split
from numpy import random

SEED = 42
# Seed NumPy's global generator for any other NumPy-based randomness.
random.seed(SEED)
# Pass the seed explicitly so the 70/30 split does not depend on the current
# state of the global RNG (which changes if anything draws random numbers
# between seeding and splitting).
train, val = train_test_split(train_full_quant, test_size=0.3, random_state=SEED)

print("Train full shape: " + str(train_full_quant.shape))
print("Train shape: " + str(train.shape))
print("Val shape: " + str(val.shape))

Séparation des features en X (tout sauf SalePrice) et Y (SalePrice)

In [None]:
target_column = 'SalePrice'

# Features are every column except the target; the target is SalePrice.
# `.values` converts each selection to a NumPy array.
X_train = train.drop(columns=target_column).values
Y_train = train[target_column].values
X_val = val.drop(columns=target_column).values
Y_val = val[target_column].values

Standardisation des X

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then reuse the same
# statistics to transform the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
print(X_train_scaled.shape)

Transformation des Y en log

In [None]:
# Log-transform the target with log1p.
# The values are recomputed from the `train` / `val` frames rather than from
# Y_train / Y_val themselves, so re-running this cell does not apply the
# log a second time.
Y_train = np.log1p(train[target_column].values)
Y_val = np.log1p(val[target_column].values)

Redimensionnement des Y -> ajout d'une dimension

In [None]:
# Turn the targets into column vectors of shape (n_samples, 1).
Y_train_reshaped = Y_train.reshape(-1, 1)
print(Y_train_reshaped.shape)
Y_val_reshaped = Y_val.reshape(-1, 1)
print(Y_val_reshaped.shape)

Standardisation des Y log

In [None]:
# Standardise the log targets using statistics from the training targets only.
scaler2 = StandardScaler()
Y_train_scaled = scaler2.fit_transform(Y_train_reshaped)
Y_val_scaled = scaler2.transform(Y_val_reshaped)

Initialisation du modèle

In [None]:
from tensorflow.keras.models import Sequential

# Start from an empty Sequential model; its layers are added in the next cell.
model = Sequential()

Création et entrainement du modèle

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers

# Rebuild the model from scratch in this cell: calling model.add() on the
# model created earlier would keep stacking layers every time the cell is
# re-run.
model = Sequential()
# `shape` must be a tuple (one entry per feature axis), not a bare int.
model.add(Input(shape=(X_train_scaled.shape[1],)))
# Funnel-shaped MLP: 200 -> 100 -> 50 -> 25 -> 12 hidden units, ReLU each.
model.add(Dense(200, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(12, activation='relu'))
# Single linear output unit for the regression target.
model.add(Dense(1))
loss = 'mse'
LEARNING_RATE = 0.005
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
model.compile(loss=loss, optimizer=SGD(learning_rate=LEARNING_RATE))
# Batch size equals the number of training rows: one gradient step per epoch.
BATCH_SIZE = X_train_scaled.shape[0]
EPOCHS = 1000
history = model.fit(X_train_scaled, Y_train_scaled, validation_data=(X_val_scaled, Y_val_scaled), epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# Display a summary of the model's layers.
model.summary()

Affichage de la performance du modèle sur train et val

In [None]:
# Plot every metric recorded during training, one curve per column of
# history.history, against the epoch number.
ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
ax.grid(True)
ax.set_title('Model performance throughout training')
ax.set_ylabel('Loss')
ax.set_xlabel('epoch')
plt.show()

Application du modèle à la prédiction sur val

In [None]:
# Predict on the standardised validation features; the outputs are on the
# same standardised log scale as the training targets.
predict = model.predict(X_val_scaled)

Unscale des prédictions

In [None]:
# Map the standardised predictions back to the log-price scale.
# NOTE: `predict` is overwritten in place, so re-running this cell without
# re-running the prediction cell applies the inverse transform twice.
predict = scaler2.inverse_transform(predict)

Calcul de la RMSE sur les prédictions

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE between the log-transformed validation targets and the unscaled
# (log-scale) predictions.
mse = mean_squared_error(Y_val, predict)
rmse = np.sqrt(mse)
print(rmse)

Chargement du dataset de test

In [None]:
# Load the raw test set from the local Data directory.
test_csv_path = './Data/test.csv'
test_full = pd.read_csv(test_csv_path)

Sauvegarde de la colonne Id (utilisée comme index du fichier de soumission)

In [None]:
# Keep the Id column aside; it is used later as the index of the
# submission DataFrame. The column is not removed from test_full here.
test_full_Id = test_full['Id']

Enlèvement de SalePrice des features quantitatives sélectionnées sur le train_full

In [None]:
# Remove the target from the list of quantitative columns, since it is not
# selected from the test set.
# errors="ignore" keeps the cell idempotent: re-running it after SalePrice
# has already been removed no longer raises KeyError.
quant = quant.drop("SalePrice", errors="ignore")


Sélection des features quantitatives du test_full

In [None]:
# Select from the test set the same quantitative columns as in training.
test_full_quant = test_full.loc[:, quant]

Check et remplacement des NA du test_full

In [None]:
# Count the missing values in each quantitative test column.
test_na_per_column = test_full_quant.isna().sum()
print(f"Nombre de NAs dans test_full_quant: \n {test_na_per_column}")

In [None]:
# Replace every missing value with 0, as was done for the training data.
test_full_quant = test_full_quant.fillna(value=0)

In [None]:
# Re-count missing values to confirm none remain after the fill.
test_remaining_na = test_full_quant.isna().sum()
print(f"Nombre de NAs dans test_full_quant: \n {test_remaining_na}")

Standardisation du test_full

In [None]:
# Standardise the test features with the scaler fitted on the training set.
# The scaler was fitted on a plain NumPy array (X_train comes from `.values`),
# so pass an array here too: this matches how X_val is transformed and
# avoids scikit-learn's feature-name mismatch warning for DataFrame input.
test_scaled = scaler.transform(test_full_quant.values)

Application du modèle à la prédiction sur le test_full

In [None]:
# Predict on the standardised test features; the outputs are still on the
# standardised log scale used for the training targets.
predict_test = model.predict(test_scaled)

Unscale et unlog des prédictions sur le test_full

In [None]:
# Undo the target standardisation, returning to the log1p scale.
# NOTE: `predict_test` is overwritten in place; re-run the prediction cell
# before re-running this one.
predict_test = scaler2.inverse_transform(predict_test)
# The target was transformed with np.log1p, so the exact inverse is np.expm1.
# (np.e ** x only inverts a plain log and leaves every price too high by 1.)
predict_test = np.expm1(predict_test)

Mise en forme des prédictions en vue de la soumission à Kaggle

In [None]:
from pandas import DataFrame

# Build the submission frame: one row per test Id, holding the prediction.
df = DataFrame(predict_test, index=test_full_Id)

In [None]:
# Name the single prediction column SalePrice.
df.columns = ['SalePrice']

Création du csv de prédictions à envoyer à Kaggle

In [None]:
# Write the predictions (index included) to the submission file.
submission_path = 'prediction_full_quanti.csv'
df.to_csv(submission_path)