In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


### import datasets

In [None]:
# Load the raw training and test sets from the local Data directory.
train_full, test_full = (
    pd.read_csv(csv_path) for csv_path in ('./Data/train.csv', './Data/test.csv')
)

In [None]:
# Keep the test-set Ids aside: the column is dropped from the frame right after,
# but it is used again as the index of the final predictions frame.
test_full_Id = test_full['Id']

In [None]:
# Remove the "Id" column from both frames so it is not treated as a feature.
train_full = train_full.drop(columns="Id")
test_full = test_full.drop(columns="Id")

In [None]:
# Only the last expression of a cell is rendered, so both shapes are returned
# together as a tuple: (train shape, test shape).
train_full.shape, test_full.shape

In [None]:
# Inspect the dtype of every column; the dtypes drive the qualitative /
# quantitative split performed on train_full afterwards.
train_full.dtypes

### Identification des features qualitatives et quantitatives de train_full

In [None]:
# Split the column names by dtype: object columns are treated as qualitative,
# float64 / int64 columns as quantitative.
qual = train_full.select_dtypes(include=['object']).columns
quant = train_full.select_dtypes(include=['float64', 'int64']).columns

print(f"Features Quali:\n {qual}")
print(f"Features Quanti:\n {quant}")

### Création d'un dataset de features qualitatives et d'un dataset de features quantitatives à partir de train_full

In [None]:
# Build one frame per feature family and report their shapes
# (quantitative first, qualitative second).
train_full_quant, train_full_qual = train_full[quant], train_full[qual]
print(train_full_quant.shape, train_full_qual.shape, sep="\n")

### Check et remplacement des NA dans la partie quantitative de train_full

In [None]:
# Per-column count of missing values in the quantitative features.
print(f"Nombre de NAs dans train_full[quant]: \n {train_full_quant.isna().sum()}")

In [None]:
# Replace every missing quantitative value with 0.
train_full_quant = train_full_quant.fillna(0)

In [None]:
# Same per-column count once the missing values have been filled.
print(f"Nombre de NAs dans train_full_quant: \n {train_full_quant.isna().sum()}")

### Détection et suppression automatique des outliers des parties quantitatives et qualitatives de train_full

In [None]:
from sklearn.ensemble import IsolationForest

# Share of rows the isolation forest is asked to flag as anomalous.
anomalies_ratio = 0.1
clf = IsolationForest(contamination=anomalies_ratio, max_samples=200, random_state=42)

# Fit on the quantitative features and label every row:
# rows labelled 1 are kept, rows labelled -1 are counted as outliers.
y_noano = pd.DataFrame(clf.fit(train_full_quant).predict(train_full_quant), columns=['Top'])
inlier_rows = y_noano.index[y_noano['Top'] == 1].values

# Apply the same positional row selection to both feature families so that
# they stay aligned, then renumber the rows from 0.
train_full_quant = train_full_quant.iloc[inlier_rows].reset_index(drop=True)
train_full_qual = train_full_qual.iloc[inlier_rows].reset_index(drop=True)

print("Number of Outliers:", (y_noano['Top'] == -1).sum())
print("Number of rows without outliers:", len(train_full_quant))

### Comptage du nombre de features quantitatives et qualitatives

In [None]:
# Number of columns of each family, read from the width of the dtype-filtered frame.
num_quant_train_full = train_full.select_dtypes(include=['int64', 'float64']).shape[1]
num_qual_train_full = train_full.select_dtypes(include=['object']).shape[1]

print(f"{num_quant_train_full} features numériques & {num_qual_train_full} features catégorielles")

### Matrice de corrélation de la partie quantitative de train_full

In [None]:
# Pairwise correlations between all quantitative features, drawn as a heatmap
# on an explicitly created axes.
f, ax = plt.subplots(figsize=(12, 9))
corrmat = train_full_quant.corr()
sns.heatmap(corrmat, vmax=.8, square=True, linewidths=.5, cmap="YlGnBu", ax=ax);

### Sélection des 10 plus grosses corrélations avec SalePrice parmi les features quantitatives

In [None]:
# Names of the 11 rows with the highest correlation to SalePrice. SalePrice
# itself (correlation 1 with itself) is one of them, which leaves 10 features.
cols = corrmat.nlargest(11,'SalePrice').index
cols

### Matrice de corrélation des 10 features quantitatives les plus corrélées

In [None]:
corrmat2 = train_full_quant[cols].corr()
f, ax = plt.subplots(figsize=(12, 9))
# Hide the redundant upper triangle (diagonal included). The mask is built from
# the matrix shape rather than from its values, so a coefficient that happens
# to be exactly 0 is still hidden.
sns.heatmap(corrmat2, vmax=1, square=True, linewidths=.5, cmap="YlGnBu", annot=True,
            mask=np.triu(np.ones_like(corrmat2, dtype=bool)), ax=ax);

### Élimination des features quantitatives les plus corrélées entre elles <-- matrice de corrélation

In [None]:
# Drop the quantitative features judged redundant with another selected feature.
train_full_quant = train_full_quant.drop(
    columns=["TotRmsAbvGrd", "GarageCars", "TotalBsmtSF", "GarageYrBlt"]
)

### Matrice de corrélation des features quantitatives restantes, puis sélection des 10 plus grosses corrélations avec SalePrice

In [None]:
# Recompute the correlations on the remaining features and keep the 11 names
# most correlated with SalePrice (SalePrice itself included).
corrmat3 = train_full_quant.corr()
cols3 = corrmat3['SalePrice'].nlargest(11).index
cols3

### Matrice de corrélation des 10 features quantitatives sélectionnées

In [None]:
corrmat4 = train_full_quant[cols3].corr()
f, ax = plt.subplots(figsize=(12, 9))
# Hide the redundant upper triangle (diagonal included). The mask is built from
# the matrix shape rather than from its values, so a coefficient that happens
# to be exactly 0 is still hidden.
sns.heatmap(corrmat4, vmax=1, square=True, linewidths=.5, cmap="YlGnBu", annot=True,
            mask=np.triu(np.ones_like(corrmat4, dtype=bool)), ax=ax);

### Affichage des correlations avec SalePrice des 10 features quantitatives sélectionnées

In [None]:
# Correlation of each selected column with SalePrice.
corrmat4['SalePrice']

### Sélection des 10 features quantitatives dans train_full

In [None]:
# Restrict the quantitative frame to the selected columns; cols3 still contains
# SalePrice at this point, so the target stays in the frame.
train_full_quant = train_full_quant[cols3]

### Traitement des features quali

In [None]:
# Shapes of the quantitative and qualitative parts, one per line.
print(train_full_quant.shape, train_full_qual.shape, sep="\n")

### Transformation des features qualitatives en quantitatives

In [None]:
# One-hot encode the qualitative columns with pandas' get_dummies.
train_full_qual = pd.get_dummies(train_full_qual)
train_full_qual

In [None]:
# Display the selected quantitative columns (SalePrice included).
train_full_quant

### Réunion des features qualitatives et quantitatives

In [None]:
# Put the quantitative columns and the encoded qualitative columns side by side;
# both frames were re-indexed from 0 after the outlier removal, so rows line up.
train_full_concat = pd.concat((train_full_quant, train_full_qual), axis="columns")
train_full_concat

### Split du train_full_concat en train et val

In [None]:
from sklearn.model_selection import train_test_split
from numpy import random

SEED = 42
# Seed NumPy's global generator, as the original cell did.
random.seed(SEED)
# Pass the seed to the splitter explicitly as well, so the 70/30 split is
# reproducible on its own and does not depend on the state of the global
# generator at the moment the cell runs.
train, val = train_test_split(train_full_concat, test_size=0.3, random_state=SEED)

print("Train full shape: " + str(train_full_concat.shape))
print("Train shape: " + str(train.shape))
print("Val shape: " + str(val.shape))

### Séparation de X_train et Y_train sur train et val

In [None]:
# Separate the target column from the features, as NumPy arrays, for both
# the training and the validation subsets.
target_column = 'SalePrice'
X_train, Y_train = train.drop(columns=target_column).values, train[target_column].values
X_val, Y_val = val.drop(columns=target_column).values, val[target_column].values

### Normalisation X

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler statistics are learned on the training features only and then
# applied to both the training and the validation features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)
print(X_train_scaled.shape)

### Transfo SalePrice en log

In [None]:
# Move both targets to the log(1 + x) scale.
Y_train, Y_val = np.log1p(Y_train), np.log1p(Y_val)

### Normalisation Y

In [None]:
# Turn the 1-D target vectors into single-column 2-D arrays.
Y_train_reshaped = Y_train.reshape(-1, 1)
Y_val_reshaped = Y_val.reshape(-1, 1)
print(Y_train_reshaped.shape, Y_val_reshaped.shape, sep="\n")

In [None]:
# Standardise the log target: statistics come from the training target only
# and are reused for the validation target.
scaler2 = StandardScaler().fit(Y_train_reshaped)

Y_train_scaled, Y_val_scaled = (
    scaler2.transform(Y_train_reshaped),
    scaler2.transform(Y_val_reshaped),
)

### Initialisation modèle Sequential

In [None]:
from tensorflow.keras.models import Sequential

# Empty sequential model; its layers are defined in the following cell.
model = Sequential()

### Création et entrainement du modèle

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers

# Start from a fresh model so that re-running this cell does not stack a second
# copy of the layers on top of an already-built network.
model = Sequential()
# Input shape must be a tuple: one entry per feature column.
model.add(Input(shape=(X_train_scaled.shape[1],)))
model.add(Dense(504, activation='elu'))
model.add(Dense(504, activation='elu'))
model.add(Dense(252, activation='elu'))
model.add(Dense(126, activation='elu'))
# Linear output: the target is standardised (zero mean, unit variance), and an
# ELU output is bounded below by -1, so it could never reach targets lower
# than one standard deviation under the mean.
model.add(Dense(1, activation='linear'))
loss = 'mse'
LEARNING_RATE = 0.001
# `learning_rate` is the supported argument name; `lr` is deprecated and
# rejected by recent Keras versions.
model.compile(loss=loss, optimizer=SGD(learning_rate=LEARNING_RATE))
# Full-batch gradient descent: one update per epoch on the whole training set.
BATCH_SIZE = X_train_scaled.shape[0]
EPOCHS = 5000
history = model.fit(X_train_scaled, Y_train_scaled, validation_data=(X_val_scaled, Y_val_scaled), epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

### Affichage de la performance du modèle sur train et val

In [None]:
# One curve per entry recorded in the training history, indexed by epoch.
loss_ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
loss_ax.grid(True)
loss_ax.set_title('Model performance throughout training')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('epoch')
plt.show()

### Prédiction sur X_val

In [None]:
# Validation predictions; the network was trained on the standardised log
# target, so these values are on that same scale.
predict = model.predict(X_val_scaled)

### Calcul de l'erreur sur la prédiction (RMSE, sur l'échelle log de SalePrice)

In [None]:
# Undo the target standardisation: predictions are now on the log1p(SalePrice) scale.
predict = scaler2.inverse_transform(predict)

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error between the validation target and the predictions,
# both expressed on the log1p scale at this point.
mse = mean_squared_error(Y_val, predict)
rmse = np.sqrt(mse)
print(rmse)

### Reconversion du SalePrice

In [None]:
# The target was transformed with np.log1p, whose exact inverse is np.expm1
# (exp(x) - 1); a plain exponential would leave every price too high by 1.
predict = np.expm1(predict)

### Choix des colonnes sur le test_full

Suppression de SalePrice de la liste des features quantitatives issue de train_full

In [None]:
# SalePrice is the target, not an input feature: remove it from the column list.
# errors="ignore" keeps the cell re-runnable once the label is already gone.
cols3 = cols3.drop("SalePrice", errors="ignore")
cols3

In [None]:
# Qualitative column names identified earlier on train_full.
qual

### Séparation des colonnes des features quantitatives et qualitatives de test_full

In [None]:
# Extract from the test set the quantitative columns retained on train and
# the qualitative columns identified on train.
test_quant, test_qual = test_full[cols3], test_full[qual]
print(test_qual)

In [None]:
# Shape of the one-hot encoded train part next to the still-raw test part.
train_full_qual.shape, test_qual.shape

### Transformation des features qualitatives de test en quantitatives

In [None]:
# One-hot encode the test qualitative columns. The encoding is computed on the
# test data alone, so its columns are aligned with the train ones afterwards.
test_qual = pd.get_dummies(test_qual)

In [None]:
# Preview the first rows of the encoded test columns.
test_qual.head()

### Identification des colonnes manquantes entre les parties qualitatives transformées de train et test, et mise en forme de celle de test

In [None]:
# Indicator columns that exist in the encoded train frame but not in the test one.
missing_cols = set(train_full_qual.columns) - set(test_qual.columns)
# Align test on the train layout in a single step: missing indicators are
# created and filled with 0, test-only indicators are dropped, and the column
# order matches train exactly.
test_qual = test_qual.reindex(columns=train_full_qual.columns, fill_value=0)

In [None]:
# Display the test indicator columns once aligned on the train layout.
test_qual

### Check et remplacement des NA de la partie quantitative de test_full

In [None]:
# Report the per-column missing-value counts before filling them. Only the
# last expression of a cell is rendered, so the counts are printed explicitly.
print(test_quant.isna().sum())
# Same strategy as on train: missing quantitative values become 0.
test_quant = test_quant.fillna(0)

In [None]:
# Per-column missing-value counts once the fill has been applied.
test_quant.isna().sum()

### Réunion des parties quantitatives et qualitatives transformées de test_full

In [None]:
# Same layout as the training features: quantitative columns first, then the
# indicator columns.
test = pd.concat([test_quant, test_qual], axis = 1)

In [None]:
# Display the assembled test features.
test

### Standardisation des données de test_full

In [None]:
# The scaler was fitted on a plain NumPy array, so it is given one here too
# (as for X_train / X_val). Passing the DataFrame directly would trigger
# scikit-learn's "fitted without feature names" warning.
test_scaled = scaler.transform(test.values)

### Application du modèle entrainé sur le train à la prédiction sur le test

In [None]:
# Test predictions, on the standardised log scale the network was trained on.
predict_test = model.predict(test_scaled)

### Unscale et unlog des prédictions sur le test

In [None]:
# Undo the target standardisation first: values are back on the log1p scale.
predict_test = scaler2.inverse_transform(predict_test)
# Then undo the log: the target was transformed with np.log1p, whose exact
# inverse is np.expm1 (exp(x) - 1), not a plain exponential.
predict_test = np.expm1(predict_test)

### Mise en forme des prédictions en vue de la soumission à Kaggle

In [None]:
from pandas import DataFrame

# Wrap the predictions in a frame indexed by the Ids saved from the test set.
df = DataFrame(predict_test, index=test_full_Id)

df


In [None]:
# Name the single prediction column "SalePrice" and display the result.
df.columns = ['SalePrice']
df

### Création du csv de prédictions à envoyer à Kaggle

In [None]:
# Write the predictions to disk; the frame index (the test Ids) is written too,
# since to_csv is called without index=False.
df.to_csv('prediction_quanti_quali.csv')