## Mount the Drive, and Change to Google Drive Folder

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True)

# %cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
# %ls

## Import Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, losses
from keras.wrappers import scikit_learn
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

import random
import pandas as pd
import numpy as np
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt

## Count the Number of Files, and Take Random Samples from the Image Files

In [3]:
# !ls street_view
# count how many files and write the filenames into a file
# !ls street_view -1 | wc -l 
# !ls street_view/*.jpg > flist.txt
flist_old = list(pd.read_csv('flist.txt', header = None)[0])
flist = []
change_names = list(map(lambda x: flist.append(f'street_view/{x}'), flist_old))

# Set seed so sample is reproducible 
# random.seed(99)  # set this to an integer value!!!
# nsamp = 100
# flist_sub = random.sample(flist, nsamp)
# flist = flist_sub

# print(flist)

## Overview of the `properties` Dataset

Read the `properties` dataset first, and make sure that `property type` is a categorical variable.

In [4]:
properties = pd.read_csv('properties.csv')
properties_juny12 = pd.read_csv('properties_juny12.csv')
properties_full = pd.concat([properties, properties_juny12])
properties = properties_full
properties.propertyType = properties.propertyType.astype('category')
properties.head()

Unnamed: 0.1,Unnamed: 0,address,propertyType,bedrooms,detailUrl,location_lat,location_lng,property_id
0,0,"12, Gorsey Brigg, Dronfield Woodhouse, Dronfie...",Terraced,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29986,-1.49446,60d9dd15-c5a0-4d9c-a341-a1d47add49d5
1,0,"5, Highgate Lane, Dronfield, Derbyshire S18 1UB",Detached,4.0,https://www.rightmove.co.uk/house-prices/detai...,53.29135,-1.45975,4a586e80-181a-4b82-b5c3-2d789436bb14
2,0,"125, Gosforth Lane, Dronfield, Derbyshire S18 1RB",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29763,-1.47573,93680b6c-237e-44d3-8f40-959a14b80cad
3,0,"80, Shakespeare Crescent, Dronfield, Derbyshir...",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29259,-1.45644,5d49758b-f148-4d06-bbae-3eb23f5c68fb
4,0,"21, Gainsborough Road, Dronfield, Derbyshire S...",Detached,,https://www.rightmove.co.uk/house-prices/detai...,53.2974,-1.48503,4645f5eb-de7c-474f-8d7e-b59fa8c55f19


Basic information of the dataset is shown as follows.

In [5]:
properties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37402 entries, 0 to 19851
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    37402 non-null  int64   
 1   address       37402 non-null  object  
 2   propertyType  37402 non-null  category
 3   bedrooms      24486 non-null  float64 
 4   detailUrl     37402 non-null  object  
 5   location_lat  37402 non-null  float64 
 6   location_lng  37402 non-null  float64 
 7   property_id   37402 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 2.3+ MB


Descriptive statistics of continuous variables are shown as follows.

In [6]:
properties.describe()

Unnamed: 0.1,Unnamed: 0,bedrooms,location_lat,location_lng
count,37402.0,24486.0,37402.0,37402.0
mean,0.0,2.875194,52.868188,-2.089771
std,0.0,0.993353,1.874818,1.502522
min,0.0,0.0,0.0,-7.55716
25%,0.0,2.0,51.408743,-3.06669
50%,0.0,3.0,52.650855,-2.18756
75%,0.0,3.0,53.775813,-0.93863
max,0.0,12.0,60.16273,1.61824


Frequencies of each level of the variable `property type` are obtained as follows.

In [7]:
properties.propertyType.value_counts()

Semi-Detached    8783
Terraced         8535
Detached         8369
Unknown          6647
Flat             5068
Name: propertyType, dtype: int64

## A Subset of the `properties` Dataset

As random samples of images have been obtained previously, a subset of the whole `properties` dataset could hence be formulated by selecting the rows of the whole `properties` dataset corresponding to the selected samples.

In [8]:
flist_id = list(map(lambda string: string[16 : -4], flist))
properties_sub = pd.DataFrame(properties.loc[properties['property_id'].isin(flist_id)])
properties_sub = properties_sub.drop_duplicates(['location_lat', 'location_lng'])
flist_id = list(properties_sub.property_id)
flist_new = []
change_names_new = list(map(lambda x: flist_new.append(f'street_view/gsv_{x}.jpg'), flist_id))

The original data should be splitted into training and testing sets, and the testing set contains 30% of the original data.

In [9]:
flist_train, flist_test, propertyType_train, propertyType_test = train_test_split(
    flist_new, # image directories
    properties_sub.propertyType, # property types
    test_size = 0.3)

In [10]:
Img_list_train = list(map(lambda x: np.asarray(Image.open(x).resize((32, 32), Image.LANCZOS)), flist_train))
Img_list_test = list(map(lambda x: np.asarray(Image.open(x).resize((32, 32), Image.LANCZOS)), flist_test))
Img_array_train = np.asarray(Img_list_train)
Img_array_test = np.asarray(Img_list_test)

For categorical variables, one-hot encoder is introduced.

In [11]:
dummy_propertyType_train = pd.get_dummies(propertyType_train)
dummy_propertyType_test = pd.get_dummies(propertyType_test)
propertyType_test_fac = np.argmax(np.array(dummy_propertyType_test), axis = 1) 

## Multi-Class Classification Using Neural Network

### Multi-Layer Perceptron (MLP) model

In [12]:
def mlp(output_dim):

    '''
    Creates a multi-layer perceptron neural network model without hidden layers.

    Parameter:
    output_dim (int): The number of output classes.
    
    Returns:
    A compiled Keras model.
    '''

    model = Sequential()
    model.add(Rescaling(1. / 255))
    model.add(Flatten())
    model.add(Dense(128, activation = tf.nn.leaky_relu))
    model.add(Dense(output_dim, activation = tf.nn.softmax))
    loss = keras.losses.CategoricalCrossentropy(from_logits = False)
    weights = np.array(len(propertyType_train) / propertyType_train.value_counts())
    loss.weighted = weights
    model.compile(loss = loss, optimizer = 'adam', metrics = ['accuracy'])
    return model

In [13]:
MLP = mlp(dummy_propertyType_train.shape[1])
MLP.fit(ImageDataGenerator().flow(Img_array_train, dummy_propertyType_train, batch_size = 256),
        epochs = 32, batch_size = 64)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x237d9e800d0>

In [14]:
propertyType_test_pred_MLP = np.argmax(MLP.predict(Img_array_test), axis = 1) 
MLP.evaluate(Img_array_test, dummy_propertyType_test)



[1.5130411386489868, 0.32070598006248474]

In [15]:
confusion_matrix(propertyType_test_fac, propertyType_test_pred_MLP)

array([[643, 113,  49, 155,  10],
       [195, 297,  25, 139,  11],
       [696, 184, 111, 271,  14],
       [418, 283,  58, 417,  11],
       [260,  88,  36, 140,  22]], dtype=int64)

## Convolutional Neural Network

In [16]:
def cnn(output_dim):

    '''
    Creates a convolutional neural network model without hidden layers.

    Parameter:
    output_dim (int): The number of output classes.
    
    Returns:
    A compiled Keras model.
    '''

    model = Sequential()
    model.add(Rescaling(1. / 255))
    model.add(Conv2D(4, 2, padding = 'same', activation = tf.nn.leaky_relu))
    model.add(MaxPooling2D())  
    model.add(Conv2D(8, 2, padding = 'same', activation = tf.nn.leaky_relu))
    model.add(MaxPooling2D()) 
    model.add(Flatten())
    model.add(Dense(16, activation = tf.nn.leaky_relu))
    model.add(Dense(output_dim, activation = tf.nn.softmax))
    loss = keras.losses.CategoricalCrossentropy(from_logits = False)
    weights = np.array(len(propertyType_train) / propertyType_train.value_counts())
    loss.weighted = weights
    model.compile(loss = loss, optimizer = 'adam', metrics = ['accuracy'])
    return model

In [17]:
CNN = cnn(dummy_propertyType_train.shape[1])
CNN.fit(ImageDataGenerator().flow(Img_array_train, dummy_propertyType_train, batch_size = 256), 
        epochs = 32, batch_size = 64)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x237e241d390>

In [18]:
propertyType_test_pred_CNN = np.argmax(CNN.predict(Img_array_test), axis = 1) 
CNN.evaluate(Img_array_test, dummy_propertyType_test)



[1.4357295036315918, 0.3964700698852539]

In [19]:
confusion_matrix(propertyType_test_fac, propertyType_test_pred_CNN)

array([[343,  41, 387, 154,  45],
       [ 80, 186, 160, 212,  29],
       [245,  53, 684, 230,  64],
       [153,  74, 370, 529,  61],
       [ 94,  55, 190, 107, 100]], dtype=int64)

In [20]:
# import dill
# dill.dump_session('Presetting.pkl')
# # dill.load_session('Presetting.pkl')