## Mount the Drive, and Change to Google Drive Folder

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

%cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
%ls

Mounted at /content/drive
/content/drive/MyDrive/MSc.-Dissertations/1/Files
 classification.ipynb                   properties_sub.csv
 flist_images.txt                       property_transactions_juny12.csv
 flist.txt                              randomsample.ipynb
'load_google_streetview_db (3).ipynb'   Robin.ipynb
 properties.csv                         [0m[01;34mstreet_view[0m/
 properties_juny12.csv


## Import Libraries

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, losses, optimizers
from keras.wrappers import scikit_learn
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.utils import np_utils, image_dataset_from_directory
from keras.preprocessing.image import ImageDataGenerator

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

import random
import pandas as pd
import numpy as np
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt

## Count the Number of Files, and Take Random Samples from the Image Files

In [None]:
# Optional shell checks (left disabled): list the image folder and count its files.
#   !ls street_view
#   !ls street_view -1 | wc -l

# 'flist.txt' has no header row; its first column is read as the list of
# sampled image file names.
flist_table = pd.read_csv('flist.txt', header=None)
flist = flist_table[0].tolist()

## Overview of the `properties` Dataset

Read the two `properties` CSV files, concatenate them into a single dataset, and make sure that the `propertyType` column is stored as a categorical variable.

In [None]:
# Load both property exports and stack them row-wise. The original row labels of
# each file are kept, so the combined index contains repeated values.
properties = pd.read_csv('properties.csv')
properties_juny12 = pd.read_csv('properties_juny12.csv')
properties_full = pd.concat([properties, properties_juny12])

# From here on `properties` is the combined table (same object as `properties_full`).
properties = properties_full

# Store the property type as a categorical column.
properties['propertyType'] = properties['propertyType'].astype('category')

properties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37402 entries, 0 to 19851
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    37402 non-null  int64   
 1   address       37402 non-null  object  
 2   propertyType  37402 non-null  category
 3   bedrooms      24486 non-null  float64 
 4   detailUrl     37402 non-null  object  
 5   location_lat  37402 non-null  float64 
 6   location_lng  37402 non-null  float64 
 7   property_id   37402 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 2.3+ MB


## A Subset of the `properties` Dataset

Since a random sample of images has already been drawn, the matching subset of the `properties` dataset is built by keeping only the rows whose `property_id` corresponds to one of the sampled image files.

In [None]:
# Recover the property id embedded in each file name: the slice keeps the
# 36 characters that precede the last 4 characters of the name.
# NOTE(review): presumably a 36-character id followed by a 4-character extension
# such as '.jpg' - confirm against flist.txt.
flist_id = [file_name[-40:-4] for file_name in flist]

# Keep only the properties that have a sampled image ...
has_image = properties['property_id'].isin(flist_id)
properties_sub = pd.DataFrame(properties.loc[has_image])

# ... and keep a single row per (latitude, longitude) pair.
properties_sub = properties_sub.drop_duplicates(['location_lat', 'location_lng'])
# properties_sub = pd.read_csv('properties_sub.csv')

properties_sub['propertyType'] = properties_sub['propertyType'].astype('category')

# From here on `flist_id` holds the ids of the rows that survived the filtering.
flist_id = properties_sub['property_id'].tolist()

properties_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15484 entries, 0 to 19851
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    15484 non-null  int64   
 1   address       15484 non-null  object  
 2   propertyType  15484 non-null  category
 3   bedrooms      10967 non-null  float64 
 4   detailUrl     15484 non-null  object  
 5   location_lat  15484 non-null  float64 
 6   location_lng  15484 non-null  float64 
 7   property_id   15484 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 983.1+ KB


In [None]:
# Number of sampled properties per type, listed in category order rather than by frequency.
properties_sub['propertyType'].value_counts(sort=False)

Detached         3337
Flat             2252
Semi-Detached    4062
Terraced         4124
Unknown          1709
Name: propertyType, dtype: int64

The image data are split into a training set and a validation set (used here as the testing set); the validation set holds 30% of the images.

In [None]:
# Root folder of the images; the loader reports one class per sub-folder.
directory = 'street_view/'

# Images are resized to height x width and served in batches of `batch`.
height = 32
width = 32
batch = 32

# Options shared by both subsets. Using the same seed and validation_split in
# both calls is what makes the two subsets complementary.
split_options = dict(
    validation_split=0.3,
    seed=123,
    image_size=(height, width),
    batch_size=batch,
    label_mode='categorical',
)

training = image_dataset_from_directory(directory, subset='training', **split_options)
validation = image_dataset_from_directory(directory, subset='validation', **split_options)

# Cache decoded batches and overlap loading with computation.
# A cached dataset replays the same elements in the same order on every pass
# once the cache has been filled, which keeps label vectors extracted from
# `validation` aligned with predictions made on it later.
autotune = tf.data.AUTOTUNE
training = training.cache().prefetch(buffer_size=autotune)
validation = validation.cache().prefetch(buffer_size=autotune)

Found 15484 files belonging to 5 classes.
Using 10839 files for training.
Found 15484 files belonging to 5 classes.
Using 4645 files for validation.


In [None]:
loss = losses.CategoricalCrossentropy()


def _class_indices(dataset):
    """Return the integer class index of every example in a batched dataset.

    `dataset` must yield `(images, one_hot_labels)` batches. Labels are
    converted batch-by-batch, which avoids materialising one tensor per example.
    The order of the result follows the iteration order of `dataset`.
    """
    return np.concatenate(
        [np.argmax(one_hot, axis = 1) for _, one_hot in dataset.as_numpy_iterator()]
    )


# True class indices, in the iteration order of the (cached) datasets.
propertyType_train_fac = _class_indices(training)
propertyType_validation_fac = _class_indices(validation)

# Inverse-frequency weight per class, computed on the training labels:
# total number of training examples divided by the class count.
labels = pd.Series(propertyType_train_fac).astype('category')
class_counts = labels.value_counts(sort = False)
weights = class_counts.sum() / class_counts

# The same weights as a plain {class_index: weight} dict, which is the form
# accepted by `Model.fit(..., class_weight = class_weight)`.
class_weight = {int(index): float(weight) for index, weight in weights.items()}

# NOTE(review): this only stores the weights as an ad-hoc attribute on the loss
# object; it does not appear to change what CategoricalCrossentropy computes,
# so training is currently unweighted. Kept for backward compatibility - pass
# `class_weight` to `fit` if weighted training is intended.
loss.weighted = weights

## Multi-Class Classification Using Neural Network

### Multi-Layer Perceptron (MLP) model

In [None]:
# Baseline multi-layer perceptron: rescale pixels to [0, 1], flatten the image,
# one hidden layer of 128 leaky-ReLU units, softmax over the property types.
mlp_output_units = len(labels.cat.categories)

mlp = Sequential()
mlp.add(Rescaling(1. / 255, input_shape=(height, width, 3)))
mlp.add(Flatten())
mlp.add(Dense(128, activation=tf.nn.leaky_relu))
mlp.add(Dense(mlp_output_units, activation=tf.nn.softmax))

mlp.compile(
    optimizer=keras.optimizers.Adam(),
    loss=loss,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer structure and parameter counts of the MLP.
mlp.summary()

In [None]:
# `training` and `validation` are already-batched tf.data datasets (their batch
# size was fixed by `batch` when they were built), so `batch_size` is not passed
# here: Keras documents that it must not be specified for dataset inputs, which
# generate their own batches.
mlp.fit(training, validation_data = validation, epochs = 32)

In [None]:
# Predicted class = index of the largest softmax probability for each validation image.
mlp_validation_probabilities = mlp.predict(validation)
propertyType_validation_pred_MLP = mlp_validation_probabilities.argmax(axis=1)

# Loss and accuracy of the MLP on the validation set.
mlp.evaluate(validation)

In [None]:
# Rows are the true classes, columns the classes predicted by the MLP.
confusion_matrix(
    y_true=propertyType_validation_fac,
    y_pred=propertyType_validation_pred_MLP,
)

## Convolutional Neural Network

In [None]:
# Small convolutional network: two (2x2 convolution + max-pooling) stages with
# 4 and 8 filters, a 16-unit dense layer and a softmax over the property types.
cnn_output_units = len(labels.cat.categories)

cnn = Sequential()
cnn.add(Rescaling(1. / 255, input_shape=(height, width, 3)))
cnn.add(Conv2D(4, 2, padding='same', activation=tf.nn.leaky_relu))
cnn.add(MaxPooling2D())
cnn.add(Conv2D(8, 2, padding='same', activation=tf.nn.leaky_relu))
cnn.add(MaxPooling2D())
cnn.add(Flatten())
cnn.add(Dense(16, activation=tf.nn.leaky_relu))
# cnn.add(Dropout(.5))  # dropout is currently disabled
cnn.add(Dense(cnn_output_units, activation=tf.nn.softmax))

cnn.compile(
    optimizer=keras.optimizers.Adam(),
    loss=loss,
    metrics=['accuracy'],
)
cnn.summary()

In [None]:
# `training` and `validation` are already-batched tf.data datasets (their batch
# size was fixed by `batch` when they were built), so `batch_size` is not passed
# here: Keras documents that it must not be specified for dataset inputs, which
# generate their own batches.
cnn.fit(training, validation_data = validation, epochs = 16)

In [None]:
# Predicted class = index of the largest softmax probability for each validation image.
cnn_validation_probabilities = cnn.predict(validation)
propertyType_validation_pred_CNN = cnn_validation_probabilities.argmax(axis=1)

# Loss and accuracy of the CNN on the validation set.
cnn.evaluate(validation)

In [None]:
# Rows are the true classes, columns the classes predicted by the CNN.
confusion_matrix(
    y_true=propertyType_validation_fac,
    y_pred=propertyType_validation_pred_CNN,
)

In [None]:
# import dill
# dill.dump_session('Presetting.pkl')
# # dill.load_session('Presetting.pkl')