# House Price Prediction with Images and Tabular Data

## Import Data

### Tabular Data

In [3]:
import pandas as pd
import numpy as np
# Raw challenge tables: features for train/test plus the training target.
X_train_raw = pd.read_csv('./Challenge/Data/X_train.csv')
y_train_raw = pd.read_csv('./Challenge/Data/y_train.csv')
X_test = pd.read_csv('./Challenge/Data/X_test.csv')
y_random_raw = pd.read_csv('./Challenge/Data/y_random.csv')

# Attach the target to the training features, matching rows on the listing id.
train = X_train_raw.merge(y_train_raw, how="outer", on="id_annonce")


### Image Data

In [None]:
import os
from PIL import Image
from scipy import ndimage
import cv2
import skimage.measure 
# Root folders holding one "ann_<id>" sub-folder of pictures per listing.
_images_root = './Challenge/Data/reduced_images_ILB/reduced_images'
path_train = _images_root + '/train'
path_test = _images_root + '/test'

# Listing ids, in table order, used to walk the image folders.
train_annonce = np.array(train["id_annonce"])
test_annonce = np.array(X_test["id_annonce"])

# Every listing folder contains a few pictures (1 to 6 are expected). Each
# picture is converted to grayscale, resized to TILE_SIZE x TILE_SIZE and pasted
# into a GRID_ROWS x GRID_COLS mosaic, so a listing is described by a single
# 140 x 210 image plus the mean Shannon entropy of its tiles.
TILE_SIZE = 70
GRID_ROWS, GRID_COLS = 2, 3


def _load_image_features(root, annonce_ids):
    """Build the per-listing image table for the folders found under *root*.

    Parameters
    ----------
    root : str
        Directory that contains one ``ann_<id>`` sub-folder per listing.
    annonce_ids : iterable
        Listing ids; one output row is produced per id, in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_annonce``, ``Entropy`` (mean Shannon entropy of the
        resized tiles, NaN when the folder is empty) and ``Image`` (a
        ``uint8`` mosaic of shape ``(GRID_ROWS*TILE_SIZE, GRID_COLS*TILE_SIZE)``).
    """
    max_tiles = GRID_ROWS * GRID_COLS
    ids = []
    entropies = []
    mosaics = []
    for annonce in annonce_ids:
        folder = os.path.join(root, f'ann_{annonce}')
        # os.listdir gives no ordering guarantee: sort so that tile placement is
        # reproducible, and ignore anything beyond what the mosaic can hold.
        file_names = sorted(os.listdir(folder))[:max_tiles]

        # uint8 keeps the full 0..255 pixel range; a signed 8-bit buffer cannot
        # represent values above 127.
        mosaic = np.zeros(
            (GRID_ROWS * TILE_SIZE, GRID_COLS * TILE_SIZE), dtype=np.uint8
        )
        tile_entropies = []
        for slot, file_name in enumerate(file_names):
            row, col = divmod(slot, GRID_COLS)
            # The context manager releases the file handle once pixels are read.
            with Image.open(os.path.join(folder, file_name)) as picture:
                gray = np.asarray(picture.convert('L'), dtype='uint8')
            tile = cv2.resize(gray, (TILE_SIZE, TILE_SIZE))
            tile_entropies.append(skimage.measure.shannon_entropy(tile))
            mosaic[
                row * TILE_SIZE:(row + 1) * TILE_SIZE,
                col * TILE_SIZE:(col + 1) * TILE_SIZE,
            ] = tile

        ids.append(annonce)
        mosaics.append(mosaic)
        entropies.append(
            float(np.mean(tile_entropies)) if tile_entropies else float('nan')
        )

    return pd.DataFrame(
        data={'id_annonce': ids, 'Entropy': entropies, 'Image': mosaics}
    )


#TRAIN IMAGES
train_images = _load_image_features(path_train, train_annonce)

#TEST IMAGES
test_images = _load_image_features(path_test, test_annonce)

## Preprocessing

In [12]:
# Remove the 'city' column from both tables.
del train['city']
del X_test['city']

# Columns handled as categorical: mode imputation, then target encoding.
columns_str = np.array([
    'id_annonce',
    'property_type',
    'energy_performance_category',
    'ghg_category',
    'exposition',
    'postal_code',
])
# Snapshot of the training columns, taken before indicator columns are added.
columns_train = train.columns.to_numpy(copy=True)




# Impute the most frequent category and add a missing-value indicator column.
def impute_nan_add_vairable(DataFrame,ColName):
    """Fill missing values of a column with its mode, in place.

    A new column ``<ColName>_Imputed`` is added to ``DataFrame``; it holds 1
    where the original value was missing and 0 elsewhere. Missing values of
    ``ColName`` are then replaced by the column's most frequent value.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Table to modify in place.
    ColName : str
        Name of the column to impute.

    Returns
    -------
    None
    """
    missing = DataFrame[ColName].isnull()

    # 1. Indicator column: 1 where the value was missing, 0 otherwise.
    DataFrame[ColName+"_Imputed"] = np.where(missing,1,0)

    # 2. Most frequent category (first one when several are tied).
    modes = DataFrame[ColName].mode()
    if modes.empty:
        # The column holds no observed value at all: nothing to impute with.
        return

    # 3. Assign the filled column back instead of calling fillna(inplace=True)
    #    on the selected column, which does not reliably update the parent
    #    frame (chained assignment).
    DataFrame[ColName] = DataFrame[ColName].fillna(modes.iloc[0])
# Impute every column of the training table: categorical columns get their
# mode plus an "<column>_Imputed" indicator, all other columns their median.
for col in columns_train:
    if col not in columns_str:
        col_median = np.nanmedian(train[[col]].to_numpy().flatten())
        train[[col]] = train[[col]].fillna(col_median)
        continue
    impute_nan_add_vairable(train, col)

# Same treatment for the test table, using the test table's own statistics.
columns_X_test = np.array(X_test.columns)
for col in columns_X_test:
    if col not in columns_str:
        col_median = np.nanmedian(X_test[[col]].to_numpy().flatten())
        X_test[[col]] = X_test[[col]].fillna(col_median)
        continue
    impute_nan_add_vairable(X_test, col)
        
        
# Target encoding of the categorical columns.

from category_encoders import TargetEncoder

# A single encoder object is reused for every column.
encoder = TargetEncoder()

for col in columns_str:
    col_name = f'{col}'
    # Fit on the training column against the price, then apply the fitted
    # mapping to the same column of the test table.
    train[[col_name]] = encoder.fit_transform(train[col_name], train['price'])
    X_test[[col_name]] = encoder.transform(X_test[col_name])



## Creating the train and test sets combining tabular data and images

In [18]:
# Join the image features onto the tabular rows. A left join keeps the rows in
# the order of the left table; an outer join is documented to sort the join
# keys, which would break the positional write-back of the predictions into
# X_test later on. The image tables are built from the same ids, so no row is
# lost by joining from the left.
Complete_train = pd.merge(train, train_images, how="left", on="id_annonce")
Complete_test = pd.merge(X_test, test_images, how="left", on="id_annonce")

# Split the image mosaics from the tabular features.
image_data = Complete_train.pop('Image')
image_data_test = Complete_test.pop('Image')
# The listing id is an identifier, not a feature.
Complete_train.pop('id_annonce')
Complete_test.pop('id_annonce')
# Regression target.
y_train = Complete_train.pop('price')


In [19]:
# Stack the per-listing mosaics into one 3-D array and scale pixels to [0, 1].
# The data is viewed as unsigned 8-bit first: a signed 8-bit cast would turn
# every pixel above 127 into a negative value (and, if the mosaics were stored
# as int8, the unsigned cast restores the original 0..255 intensities).
image_data = np.stack(list(image_data)).astype(np.uint8) / 255

In [16]:
# Same treatment as the training images: stack the mosaics into one 3-D array,
# view the pixels as unsigned 8-bit (so intensities above 127 are not negative)
# and scale them to [0, 1].
image_data_test = np.stack(list(image_data_test)).astype(np.uint8) / 255

# Concatenation of a CNN and an MLP with Keras

In [20]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [21]:
# Normalization layer for the tabular features, adapted on the training table.

training_matrix = Complete_train.to_numpy()
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(training_matrix)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

In [23]:
def create_mlp(regress=False):
    """Build the MLP branch that processes the tabular features.

    The network starts with the module-level ``normalizer`` layer, followed by
    Dense layers of 32, 32 and 4 ReLU units.

    Parameters
    ----------
    regress : bool, optional
        When True, a final single-unit linear layer is appended so the model
        can be used as a stand-alone regressor.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The (uncompiled) MLP.
    """
    # Sequential expects a *list* of layers; a bare layer is not accepted by
    # every Keras version.
    model = Sequential([normalizer])
    model.add(Dense(32, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(4, activation="relu"))
    # check to see if the regression node should be added
    if regress:
        model.add(Dense(1, activation="linear"))
    # return our model
    return model

In [24]:
def create_cnn(width, height, depth, filters=(16, 32, 64), regress=False):
    """Build the CNN branch that processes the image mosaics.

    Parameters
    ----------
    width, height, depth : int
        Size of the input images; the model input has shape
        ``(height, width, depth)`` (channels-last).
    filters : sequence of int, optional
        Number of filters of each successive CONV => RELU => BN => POOL stage.
        An empty sequence skips the convolutional stages altogether.
    regress : bool, optional
        When True, a final single-unit linear layer is appended so the model
        can be used as a stand-alone regressor.

    Returns
    -------
    tensorflow.keras.models.Model
        The (uncompiled) CNN; without ``regress`` its output has 4 units, the
        same as the MLP branch.
    """
    # initialize the input shape and channel dimension, assuming
    # TensorFlow/channels-last ordering
    inputShape = (height, width, depth)
    chanDim = -1
    # define the model input
    inputs = Input(shape=inputShape)
    # start from the input so that `x` is defined even when `filters` is empty
    x = inputs
    # loop over the number of filters
    for f in filters:
        # CONV => RELU => BN => POOL
        x = Conv2D(f, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
    # flatten the volume, then FC => RELU => BN => DROPOUT
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Dropout(0.5)(x)
    # apply another FC layer, this one to match the number of nodes
    # coming out of the MLP
    x = Dense(4)(x)
    x = Activation("relu")(x)
    # check to see if the regression node should be added
    if regress:
        x = Dense(1, activation="linear")(x)
    # construct the CNN
    model = Model(inputs, x)
    # return the CNN
    return model

In [25]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import concatenate
# create the MLP and CNN models
mlp = create_mlp(regress=False)
# The CNN input must have the size of the images actually fed to the model:
# image_data holds one (height, width) grayscale mosaic per listing, not a
# single 70x70 tile.
image_height, image_width = image_data.shape[1:3]
cnn = create_cnn(image_width, image_height, 1, regress=False)
# create the input to our final set of layers as the *output* of both
# the MLP and CNN
combinedInput = concatenate([mlp.output, cnn.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

In [None]:
# Minimise the mean absolute percentage error with Adam (learning rate 1e-3).
opt = Adam(learning_rate=1e-3)
model.compile(optimizer=opt, loss="mean_absolute_percentage_error")
# train the model on both inputs, holding out 20% of the samples for validation
print("[INFO] training model...")
model.fit(
    x=[Complete_train, image_data],
    y=y_train,
    batch_size=8,
    epochs=500,
    validation_split=0.2,
)

# Results

In [20]:
import matplotlib.pyplot as plt
def plot_loss(history):
  """Plot the training and validation loss curves stored in *history*.

  *history* must expose a ``history`` mapping with the keys ``'loss'`` and
  ``'val_loss'``; each series is drawn against the epoch index.
  """
  for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
  plt.xlabel('Epoch')
  plt.ylabel('Error [price]')
  plt.legend()
  plt.grid(True)

In [None]:
# Draw the loss curves recorded on the model during training.
plot_loss(history=model.history)

In [30]:
# Predict prices for the test listings from their tabular and image inputs.
y_pred = model.predict(x=[Complete_test, image_data_test])



In [31]:
# Write the predictions back next to the listing ids (row order is positional)
# and keep only the two columns required for the submission.
X_test['price'] = y_pred.reshape(len(y_pred))
submission = X_test.loc[:, ['id_annonce', 'price']]
submission

Unnamed: 0,id_annonce,price
0,35160615,1.792943e+05
1,35830639,1.019728e+06
2,36016657,2.517493e+05
3,35759225,9.008312e+04
4,35252229,3.012731e+05
...,...,...
9334,36052217,6.182704e+05
9335,35823719,3.572681e+05
9336,35793053,2.975501e+05
9337,36049283,5.380710e+05


In [32]:
# Save the submission with a header row and without the index column.
submission.to_csv('Submission.csv', index=False, header=True)