## Mount the Drive, and Change to Google Drive Folder

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True)

# %cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
# %ls

## Import Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, losses
from keras.wrappers import scikit_learn
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.utils import np_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import random
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

## Count the Number of Files, and Take Random Samples from the Image Files

In [3]:
# !ls street_view
# count how many files and write the filenames into a file
# !ls street_view -1 | wc -l 
# !ls street_view/*.jpg > flist.txt
flist = list(pd.read_csv('flist.txt', header = None)[0])

# Set seed so sample is reproducible 
random.seed(99)  # set this to an integer value!!!
nsamp = 100
flist_sub = random.sample(flist, nsamp)
flist = flist_sub

# print(flist)

## Overview of the `properties` Dataset

Read the `properties` dataset first, and make sure that `property type` is a categorical variable.

In [4]:
properties = pd.read_csv('properties.csv')
properties.propertyType = properties.propertyType.astype('category')
properties.head()

Unnamed: 0.1,Unnamed: 0,address,propertyType,bedrooms,detailUrl,location_lat,location_lng,property_id
0,0,"12, Gorsey Brigg, Dronfield Woodhouse, Dronfie...",Terraced,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29986,-1.49446,60d9dd15-c5a0-4d9c-a341-a1d47add49d5
1,0,"5, Highgate Lane, Dronfield, Derbyshire S18 1UB",Detached,4.0,https://www.rightmove.co.uk/house-prices/detai...,53.29135,-1.45975,4a586e80-181a-4b82-b5c3-2d789436bb14
2,0,"125, Gosforth Lane, Dronfield, Derbyshire S18 1RB",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29763,-1.47573,93680b6c-237e-44d3-8f40-959a14b80cad
3,0,"80, Shakespeare Crescent, Dronfield, Derbyshir...",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29259,-1.45644,5d49758b-f148-4d06-bbae-3eb23f5c68fb
4,0,"21, Gainsborough Road, Dronfield, Derbyshire S...",Detached,,https://www.rightmove.co.uk/house-prices/detai...,53.2974,-1.48503,4645f5eb-de7c-474f-8d7e-b59fa8c55f19


Basic information of the dataset is shown as follows.

In [5]:
properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17550 entries, 0 to 17549
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    17550 non-null  int64   
 1   address       17550 non-null  object  
 2   propertyType  17550 non-null  category
 3   bedrooms      11505 non-null  float64 
 4   detailUrl     17550 non-null  object  
 5   location_lat  17550 non-null  float64 
 6   location_lng  17550 non-null  float64 
 7   property_id   17550 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 977.2+ KB


Descriptive statistics of continuous variables are shown as follows.

In [6]:
properties.describe()

Unnamed: 0.1,Unnamed: 0,bedrooms,location_lat,location_lng
count,17550.0,11505.0,17550.0,17550.0
mean,0.0,2.871186,52.912264,-2.330492
std,0.0,1.010339,1.83383,1.262468
min,0.0,0.0,50.61708,-4.26895
25%,0.0,2.0,51.23283,-3.06729
50%,0.0,3.0,53.095885,-2.658955
75%,0.0,3.0,53.84676,-1.71275
max,0.0,6.0,55.91054,0.71999


Frequencies of each level of the variable `property type` are obtained as follows.

In [7]:
properties.propertyType.value_counts()

Detached         4134
Semi-Detached    4056
Unknown          3900
Terraced         3666
Flat             1794
Name: propertyType, dtype: int64

## A Subset of the `properties` Dataset

As random samples of images have been obtained previously, a subset of the whole `properties` dataset could hence be formulated by selecting the rows of the whole `properties` dataset corresponding to the selected samples.

In [8]:
flist_id = list(map(lambda string: string[16 : -4], flist))
properties_sub = pd.DataFrame(properties.loc[properties['property_id'].isin(flist_id)])

Dictionaries are created to link `property ID` with our variables of interest.

In [28]:
dic_propID_imgArray = dict(zip(flist_id, list(map(lambda x: np.array(Image.open(x)), flist)))) # dictionary of RGB values in each pixel
dic_propID_propType = dict(zip(flist_id, properties_sub.propertyType)) # dictionary of property types
dic_address_propID = dict(zip(properties_sub.address, flist_id))

The original data should be splitted into training and testing sets, and the testing set contains 30% of the original data.

In [10]:
Img_array_train, Img_array_test, propertyType_train, propertyType_test = train_test_split(
    np.array(list(map(dic_propID_imgArray.get, flist_id))), # RGB values in each pixel
    np.array(list(map(dic_propID_propType.get, flist_id))), # property types
    test_size = 0.3)

For categorical variables, one-hot encoder is introduced.

In [11]:
def onehot(variable):

    '''
    Constructs one-hot encoder for a specified categorical variable.

    Parameter:
    variable: A categorical variable.
    
    Returns:
    Dummy encoding of the categorical variable.
    '''

    onehot_encoder = LabelEncoder()
    onehot_encoder.fit(variable)
    encoded_variable = onehot_encoder.transform(variable)
    dummy_variable = np_utils.to_categorical(encoded_variable)
    return dummy_variable

In [20]:
dummy_propertyType_train = pd.get_dummies(propertyType_train)
dummy_propertyType_test = pd.get_dummies(propertyType_test)
propertyType_test_fac = np.argmax(np.array(dummy_propertyType_test), axis = 1) 

## Multi-Class Classification Using Neural Network

### Multi-Layer Perceptron (MLP) model

In [14]:
def mlp(output_dim):

    '''
    Creates a multi-layer perceptron neural network model without hidden layers.

    Parameter:
    output_dim (int): The number of output classes.
    
    Returns:
    A compiled Keras model.
    '''

    model = Sequential()
    model.add(Rescaling(1. / 255))
    model.add(Flatten())
    # model.add(Dense(100, activation = tf.nn.leaky_relu))
    model.add(Dense(output_dim, activation = tf.nn.softmax))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [15]:
MLP = mlp(dummy_propertyType_train.shape[1])
MLP.fit(Img_array_train, dummy_propertyType_train, epochs = 100, batch_size = 30)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1ba467bbdf0>

In [16]:
propertyType_test_pred = np.argmax(MLP.predict(Img_array_test), axis = 1) 



In [21]:
confusion_matrix(propertyType_test_fac, propertyType_test_pred)

array([[ 5,  3, 22, 13, 20],
       [ 5,  0, 10,  9,  7],
       [ 9,  1, 19, 27, 17],
       [ 6,  3, 24, 22, 23],
       [ 5,  0, 14, 15, 21]], dtype=int64)

## Convolutional Neural Network

In [22]:
def cnn(output_dim):

    '''
    Creates a convolutional neural network model without hidden layers.

    Parameter:
    output_dim (int): The number of output classes.
    
    Returns:
    A compiled Keras model.
    '''

    model = Sequential()
    model.add(Rescaling(1. / 255))
    model.add(Conv2D(16, 3, padding = 'same', activation = tf.nn.relu))
    model.add(MaxPooling2D())   
    model.add(Flatten())
    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(output_dim, activation = tf.nn.softmax))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [25]:
CNN = cnn(dummy_propertyType_train.shape[1])
CNN.fit(Img_array_train, dummy_propertyType_train, epochs = 10, batch_size = 100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ba4fa0df60>