## Mount the Drive, and Change to Google Drive Folder

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True)

# %cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
# %ls

## Import Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, losses, optimizers, regularizers
from keras.wrappers import scikit_learn
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.utils import np_utils, image_dataset_from_directory
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

import random
import pandas as pd
import numpy as np
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt

## Count the Number of Files, and Take Random Samples from the Image Files

In [3]:
# !ls street_view
# count how many files
# !ls street_view -1 | wc -l
flist = list(pd.read_csv('flist.txt', header = None)[0])

## Overview of the `properties` Dataset

Read the `properties` dataset first, and make sure that `property type` is a categorical variable.

In [4]:
properties = pd.read_csv('properties.csv')
properties_juny12 = pd.read_csv('properties_juny12.csv')
properties_full = pd.concat([properties, properties_juny12])
properties = properties_full
properties.propertyType = properties.propertyType.astype('category')
properties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37402 entries, 0 to 19851
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    37402 non-null  int64   
 1   address       37402 non-null  object  
 2   propertyType  37402 non-null  category
 3   bedrooms      24486 non-null  float64 
 4   detailUrl     37402 non-null  object  
 5   location_lat  37402 non-null  float64 
 6   location_lng  37402 non-null  float64 
 7   property_id   37402 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 2.3+ MB


## A Subset of the `properties` Dataset

As random samples of images have been obtained previously, a subset of the whole `properties` dataset could hence be formulated by selecting the rows of the whole `properties` dataset corresponding to the selected samples.

In [5]:
flist_id = list(map(lambda string: string[-40 : -4], flist))
properties_sub = pd.DataFrame(properties.loc[properties['property_id'].isin(flist_id)])
properties_sub = properties_sub.drop_duplicates(['location_lat', 'location_lng'])
# properties_sub = pd.read_csv('properties_sub.csv')
properties_sub.propertyType = properties_sub.propertyType.astype('category')
flist_id = list(properties_sub.property_id)
properties_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15484 entries, 0 to 19851
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    15484 non-null  int64   
 1   address       15484 non-null  object  
 2   propertyType  15484 non-null  category
 3   bedrooms      10967 non-null  float64 
 4   detailUrl     15484 non-null  object  
 5   location_lat  15484 non-null  float64 
 6   location_lng  15484 non-null  float64 
 7   property_id   15484 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 983.1+ KB


In [6]:
properties_sub.propertyType.value_counts(sort = False)

Detached         3337
Flat             2252
Semi-Detached    4062
Terraced         4124
Unknown          1709
Name: propertyType, dtype: int64

The original data should be splitted into training and testing sets, and the testing set contains 30% of the original data.

In [7]:
directory = 'street_view/'

height = 64
width = 64
batch = 32

training = image_dataset_from_directory(
  directory,
  validation_split = 0.3,
  subset = 'training',
  seed = 123,
  image_size = (height, width),
  batch_size = batch,
  label_mode = 'categorical')

validation = image_dataset_from_directory(
  directory,
  validation_split = 0.3,
  subset = 'validation',
  seed = 123,
  image_size = (height, width),
  batch_size = batch,
  label_mode = 'categorical')

# training = training.cache().prefetch(buffer_size = tf.data.AUTOTUNE)
# validation = validation.cache().prefetch(buffer_size = tf.data.AUTOTUNE)

Found 15484 files belonging to 5 classes.
Using 10839 files for training.
Found 15484 files belonging to 5 classes.
Using 4645 files for validation.


In [8]:
loss = losses.CategoricalCrossentropy()
propertyType_train_fac = np.argmax(np.asarray(list(training.unbatch().map(lambda x, y: y))), axis = 1)
propertyType_validation_fac = np.argmax(np.asarray(list(validation.unbatch().map(lambda x, y: y))), axis = 1)
labels = pd.Series(propertyType_train_fac).astype('category')
weights = sum(labels.value_counts()) / labels.value_counts(sort = False)
loss.weighted = weights

In [9]:
Callbacks = [
             EarlyStopping(monitor = 'val_accuracy', patience = 3), 
             ReduceLROnPlateau(monitor = 'val_accuracy', factor = .01, patience = 0, cooldown = 0)
            ]

## Multi-Class Classification Using Neural Network

### Multi-Layer Perceptron (MLP) model

In [10]:
# mlp = Sequential([
#                   Rescaling(1. / 255, input_shape = (height, width, 3)),
#                   Flatten(),
#                   Dense(128, activation = tf.nn.leaky_relu),
#                   Dense(
#                         len(labels.cat.categories), 
#                         activation = tf.nn.softmax, 
#                         kernel_initializer = 'ones',
#                         kernel_regularizer = regularizers.L1(.01),
#                         activity_regularizer = regularizers.L1(.01)
#                        )
#                 ])
# mlp.compile(loss = loss, optimizer = keras.optimizers.Adam(), metrics = ['accuracy'])

In [11]:
# mlp.summary()

In [12]:
# mlp.fit(training, validation_data = validation, epochs = 64, batch_size = 64, callbacks = Callbacks)

In [13]:
# propertyType_validation_pred_MLP = np.argmax(mlp.predict(validation), axis = 1)

In [14]:
# confusion_matrix(propertyType_validation_fac, propertyType_validation_pred_MLP)

## Convolutional Neural Network

In [15]:
cnn = Sequential([
                  Rescaling(1. / 255, input_shape = (height, width, 3)),
                  Conv2D(8, 4, padding = 'same', activation = tf.nn.leaky_relu),
                  MaxPooling2D(),
                  Conv2D(4, 4, padding = 'same', activation = tf.nn.leaky_relu),
                  MaxPooling2D(),
                  Flatten(),
                  Dense(16, activation = tf.nn.leaky_relu),
                  Dropout(.25),
                  Dense(
                        len(labels.cat.categories), 
                        activation = tf.nn.softmax, 
                        kernel_initializer = 'ones',
                        kernel_regularizer = regularizers.L1(.1),
                        activity_regularizer = regularizers.L1(.1)
                       )
                ])
cnn.compile(loss = loss, optimizer = keras.optimizers.Adam(), metrics = ['accuracy'])
cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling (Rescaling)       (None, 64, 64, 3)         0         
                                                                 
 conv2d (Conv2D)             (None, 64, 64, 8)         392       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 32, 32, 8)        0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 32, 32, 4)         516       
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 16, 16, 4)        0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 1024)              0

In [16]:
cnn.fit(training, validation_data = validation, epochs = 64, batch_size = 64, callbacks = Callbacks)

Epoch 1/64


Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64


<keras.callbacks.History at 0x1f4637d9990>

In [17]:
propertyType_validation_pred_CNN = np.argmax(cnn.predict(validation), axis = 1)



In [18]:
confusion_matrix(propertyType_validation_fac, propertyType_validation_pred_CNN)

array([[157,  86, 314, 431,   0],
       [125,  69, 208, 293,   0],
       [192, 109, 387, 535,   0],
       [194, 110, 416, 510,   0],
       [ 90,  38, 164, 217,   0]], dtype=int64)

In [19]:
# import dill
# dill.dump_session('Presetting.pkl')
# # dill.load_session('Presetting.pkl')