In [None]:
from pandas import DataFrame, read_csv, Series
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from numpy import transpose
import numpy as np
from math import sqrt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import SGD,RMSprop,Adam
from time import time
import seaborn as sns
# Uncomment to have TensorFlow log the device each operation is placed on.
#tf.debugging.set_log_device_placement(True)
# Display the TensorFlow version in use.
print(tf.__version__)

In [None]:
from tensorflow.python.client import device_lib

# Enumerate the compute devices TensorFlow can see and display them.
local_devices = device_lib.list_local_devices()
print(local_devices)

#a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
#b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#c = tf.matmul(a, b)

#print(c)

In [None]:
# Read every split from disk; the first CSV column is used as the row index.
X_train_log, X_val_log, test_log, y_train_log, y_val_log = (
    read_csv(csv_name, index_col=0)
    for csv_name in (
        'X_train.csv',
        'X_val.csv',
        'X_test.csv',
        'y_train.csv',
        'y_val.csv',
    )
)

In [None]:
# Dimensions (rows, columns) of the training targets.
y_train_log.shape

In [None]:
# Dimensions (rows, columns) of the training features.
X_train_log.shape

### Output scaling

In [None]:
# Standardise the targets: the scaler is fitted on the training targets only
# and that same fitted scaler is then applied to both splits.
scaler_Y = StandardScaler().fit(y_train_log)
y_train, y_val = (
    scaler_Y.transform(targets) for targets in (y_train_log, y_val_log)
)

In [None]:
# Convert each feature frame into a plain float NumPy array.
X_train, X_val, test = (
    frame.values.astype(float) for frame in (X_train_log, X_val_log, test_log)
)

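Select the n best features according to XGBoost (the code keeps the first `nb_features` columns, so the columns are assumed to be already ordered by importance).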

In [None]:
nb_features = 75  # 24
# Truncate every split to its leading `nb_features` columns.
X_train, X_val, test = (
    features[:, :nb_features] for features in (X_train, X_val, test)
)

In [None]:
# Dimensions of the feature matrix after keeping the first `nb_features` columns.
X_train.shape

In [None]:
from tensorflow.keras.backend import sigmoid, tanh, maximum
def custom(x):
    """Activation returning the element-wise maximum of ``x`` and ``tanh(x)``."""
    squashed = tanh(x)
    return maximum(x, squashed)

Modèle de régression linéaire.

In [None]:
def get_reg_model(input_dim=None):
    """Build an uncompiled linear-regression network: a single Dense unit.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the column count of the
        notebook-level ``X_train`` array is used (looked up at call time).

    Returns
    -------
    Sequential
        Model mapping ``input_dim`` features to one output with no activation.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    return Sequential([
        # `shape` is documented as a tuple; a bare int is rejected by newer Keras.
        Input(shape=(input_dim,)),
        Dense(1),
    ])

Modèle avec couches cachées.

In [None]:
def get_other_model(nb_neurons=32, input_dim=None):
    """Build an uncompiled network with one tanh hidden layer and a single output.

    Both the hidden layer's kernel and the tanh activations carry an L1
    penalty of 0.001.

    Parameters
    ----------
    nb_neurons : int, default 32
        Width of the hidden layer.
    input_dim : int, optional
        Number of input features. When omitted, the column count of the
        notebook-level ``X_train`` array is used (looked up at call time).

    Returns
    -------
    Sequential
        Model mapping ``input_dim`` features to one output with no final activation.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    return Sequential([
        # `shape` is documented as a tuple; a bare int is rejected by newer Keras.
        Input(shape=(input_dim,)),
        Dense(nb_neurons, kernel_regularizer=regularizers.l1(0.001)),
        Activation('tanh', activity_regularizer=regularizers.l1(0.001)),
        # Optional second hidden layer (disabled):
        # Dense(32, kernel_regularizer=regularizers.l1(0.001)),
        # Activation('tanh', activity_regularizer=regularizers.l1(0.001)),
        Dense(1),
    ])

# Modèle simple

In [None]:
# Training objective (mean squared error) and optimizer step size.
loss, LEARNING_RATE = 'mse', 0.01

In [None]:
model = get_reg_model()
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and is rejected by recent Keras releases.
model.compile(loss=loss, optimizer=SGD(learning_rate=LEARNING_RATE))
BATCH_SIZE = X_train.shape[0] # computing the loss over the whole dataset
EPOCHS = 750 # how many iterations over the whole dataset
t_0 = time()
#with tf.device('/device:GPU:0'):
history = model.fit(X_train, y_train,  validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)

# Wall-clock training time in seconds.
print(time()-t_0, 's')

In [None]:
# One curve per key recorded in the training history, on a log-scaled y axis.
ax = DataFrame(history.history).plot(figsize=(8, 5), logy=True)
ax.grid(True)
ax.set_title('Model performance throughout training')
ax.set_ylabel('Loss')
ax.set_xlabel('epoch')
plt.show()

In [None]:
# Validation split: the scaled predictions are kept 2-D for `inverse_transform`
# (scikit-learn validates its input as a 2-D array and raises on 1-D input),
# and only the un-scaled result is flattened to 1-D.
y_p_scaled = model.predict(X_val)
y_p_val = scaler_Y.inverse_transform(y_p_scaled).ravel()

RMSLE = sqrt(mean_squared_error(y_val_log.SalePrice, y_p_val))
print('Validation RMSLE:', RMSLE)

# Training split: same procedure.
y_p_scaled = model.predict(X_train)
y_p_train = scaler_Y.inverse_transform(y_p_scaled).ravel()

RMSLE = sqrt(mean_squared_error(y_train_log.SalePrice, y_p_train))
print('Train RMSLE:', RMSLE)

In [None]:
# Side-by-side distributions of the prediction error (predicted minus actual).
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases.
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(15, 8))
train_error = y_p_train - y_train_log.SalePrice
val_error = y_p_val - y_val_log.SalePrice
sns.distplot(train_error, axlabel='error on training set', ax=ax_train)
sns.distplot(val_error, axlabel='error on validation set', ax=ax_val)
plt.show()

Save the test predictions of the linear model.

In [None]:
# Predict on the test features, undo the target scaling, then exponentiate
# and flatten to one value per test row before writing the CSV.
test_pred_log_scaled = model.predict(test)
test_pred_log = scaler_Y.inverse_transform(test_pred_log_scaled)
n_test_rows = test_pred_log.shape[0]
test_pred = np.exp(test_pred_log).reshape(n_test_rows)
submission = DataFrame({"SalePrice": test_pred}, index=test_log.index)
submission.to_csv('test-prediction-keras-lin.csv')

# Hidden Layer Model

In [None]:
# Instantiate the hidden-layer network with its default width and print its
# layer/parameter summary.
model2 = get_other_model()
model2.summary()

In [None]:
LEARNING_RATE=0.05
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and is rejected by recent Keras releases.
model2.compile(loss=loss, optimizer=SGD(learning_rate=LEARNING_RATE))
# Alternative optimizers (disabled):
#model2.compile(loss=loss, optimizer=RMSprop(learning_rate=LEARNING_RATE))
#model2.compile(loss=loss, optimizer=Adam(learning_rate=LEARNING_RATE))

In [None]:
# Full-batch training: each gradient step sees the whole training set.
BATCH_SIZE = X_train.shape[0]
# Number of passes over the training set.
EPOCHS = 1000
t_0 = time()
#with tf.device('/device:GPU:0'):
history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=0,
)
elapsed = time() - t_0
print(elapsed)

In [None]:
# One curve per key recorded in the training history, on a log-scaled y axis.
ax = DataFrame(history.history).plot(figsize=(8, 5), logy=True)
ax.grid(True)
ax.set_title('Model performance throughout training')
ax.set_ylabel('Loss')
ax.set_xlabel('epoch')
plt.show()

In [None]:
# Validation split: the scaled predictions are kept 2-D for `inverse_transform`
# (scikit-learn validates its input as a 2-D array and raises on 1-D input),
# and only the un-scaled result is flattened to 1-D.
y_p_scaled = model2.predict(X_val)
y_p_val = scaler_Y.inverse_transform(y_p_scaled).ravel()

RMSLE = sqrt(mean_squared_error(y_val_log.SalePrice, y_p_val))
print('Validation RMSLE:', RMSLE)

# Training split: same procedure.
y_p_scaled = model2.predict(X_train)
y_p_train = scaler_Y.inverse_transform(y_p_scaled).ravel()

RMSLE = sqrt(mean_squared_error(y_train_log.SalePrice, y_p_train))
print('Train RMSLE:', RMSLE)

## Error distribution

In [None]:
# Side-by-side distributions of the prediction error (predicted minus actual).
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases.
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(15, 8))
train_error = y_p_train - y_train_log.SalePrice
val_error = y_p_val - y_val_log.SalePrice
sns.distplot(train_error, axlabel='error on training set', ax=ax_train)
sns.distplot(val_error, axlabel='error on validation set', ax=ax_val)
plt.show()

In [None]:
# Validation targets as a flat array (a column's `to_numpy()` is already 1-D).
y = y_val_log.SalePrice.to_numpy()
print(y_p_train.shape)
# Exponentiate targets and predictions before plotting one against the other.
# NOTE(review): the `_log` naming suggests these are log prices — confirm.
df = DataFrame(
    data={'value': np.exp(y), 'predicted': np.exp(y_p_val)},
    index=y_val_log.index,
)
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x='value', y='predicted', data=df, ax=ax)
plt.show()

In [None]:
# Index label of the validation row with the largest predicted value.
anomaly_idx = df['predicted'].idxmax()
print(anomaly_idx)

In [None]:
# Feature values of that row, taken from the validation feature frame.
anomaly = X_val_log.loc[anomaly_idx]

In [None]:
# Show only the features whose absolute value exceeds 1.5 for that row.
# NOTE(review): the 1.5 cut-off presumably assumes standardised features — confirm.
anomaly.loc[np.abs(anomaly)>1.5]

Test prediction

In [None]:
# Predict with the hidden-layer network (`model2`). The original cell used
# `model`, so the linear model's predictions were written to this file.
test_pred_log_scaled = model2.predict(test)
test_pred_log = scaler_Y.inverse_transform(test_pred_log_scaled)
# Size the flattened output from this cell's own predictions rather than from
# a `test_pred` left over from an earlier cell.
test_pred = np.exp(test_pred_log).reshape(test_pred_log.shape[0])

submission = DataFrame({"SalePrice": test_pred}, index=test_log.index)
submission.to_csv('test-prediction-keras-1hiddenLayer.csv')