## Mount the Drive, and Change to Google Drive Folder

In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True)

# %cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
# %ls

## Import Libraries

In [299]:
import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, losses
from keras.wrappers import scikit_learn
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

import random
import pandas as pd
import numpy as np
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt

## Count the Number of Files, and Take Random Samples from the Image Files

In [None]:
# !ls street_view
# count how many files and write the filenames into a file
# !ls street_view -1 | wc -l 
# !ls street_view/*.jpg > flist.txt
# Read the image file names listed in `flist.txt` (no header row; first column only).
flist_old = list(pd.read_csv('flist.txt', header = None)[0])
# Prefix each name with the image folder so the paths resolve from the working directory.
flist = [f'street_view/{x}' for x in flist_old]

# Set seed so sample is reproducible 
# random.seed(99)  # set this to an integer value!!!
# nsamp = 100
# flist_sub = random.sample(flist, nsamp)
# flist = flist_sub

# print(flist)

## Overview of the `properties` Dataset

Read the `properties` dataset first, and make sure that the `propertyType` column is stored as a categorical variable.

In [None]:
# Load both property exports and stack them into one table.
properties = pd.read_csv('properties.csv')
properties_juny12 = pd.read_csv('properties_juny12.csv')
# Each CSV is read with its own default 0..n-1 index, so without `ignore_index`
# the combined table would contain duplicated row labels.
properties_full = pd.concat([properties, properties_juny12], ignore_index = True)
properties = properties_full
# Bracket assignment always targets the column; attribute assignment only works
# when the column already exists and does not clash with a DataFrame attribute.
properties['propertyType'] = properties['propertyType'].astype('category')
properties.head()

Basic information of the dataset is shown as follows.

In [None]:
# Column names, non-null counts, dtypes and memory usage of the combined table.
properties.info()

Descriptive statistics of continuous variables are shown as follows.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments only the numeric columns are summarised.
properties.describe()

The frequency of each level of the variable `propertyType` is obtained as follows.

In [None]:
# Number of rows per property type, most frequent first.
properties['propertyType'].value_counts()

## A Subset of the `properties` Dataset

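Since the list of image files (optionally a random sample of it) has been obtained above, the matching subset of the `properties` dataset is formed by selecting the rows whose `property_id` corresponds to one of those image files. Rows that share the same latitude and longitude are then de-duplicated, and the list of image paths is rebuilt so that it lines up with the remaining rows.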

In [None]:
# Image paths have the form 'street_view/gsv_<property_id>.jpg'. Naming the fixed
# prefix and extension replaces the magic slice [16 : -4] and keeps the path
# parsing and the path rebuilding below consistent with each other.
IMG_PREFIX = 'street_view/gsv_'
IMG_SUFFIX = '.jpg'
flist_id = [path[len(IMG_PREFIX) : -len(IMG_SUFFIX)] for path in flist]

# Keep only the properties that have an image, then a single row per
# (latitude, longitude) pair.
properties_sub = properties.loc[properties['property_id'].isin(flist_id)]
properties_sub = properties_sub.drop_duplicates(['location_lat', 'location_lng'])

# Rebuild the ids and image paths from the filtered table so that both lists are
# aligned row-by-row with `properties_sub`.
flist_id = list(properties_sub.property_id)
flist_new = [f'{IMG_PREFIX}{x}{IMG_SUFFIX}' for x in flist_id]

The data are split into a training set and a testing set, with the testing set containing 30% of the samples.

In [None]:
# Hold out 30% of the samples for testing. Fixing `random_state` makes the split
# itself reproducible across kernel restarts; without it every run trains and
# evaluates on a different partition.
flist_train, flist_test, propertyType_train, propertyType_test = train_test_split(
    flist_new, # image paths
    np.array(properties_sub.propertyType), # property types
    test_size = 0.3,
    random_state = 99)

In [347]:
def load_images(paths, size = (32, 32)):

    '''
    Loads image files as fixed-size RGB pixel arrays.

    Parameters:
    paths (iterable of str): Paths of the image files to read.
    size (tuple of int): Target (width, height) passed to `Image.resize`.

    Returns:
    A list with one array per image. Every array has shape
    (size[1], size[0], 3), because each image is converted to RGB first.
    '''

    images = []
    for path in paths:
        # The context manager closes the file as soon as the pixels are copied out.
        with Image.open(path) as img:
            # Converting to RGB guarantees three channels, so a greyscale, palette
            # or CMYK file cannot make the stacked array ragged.
            resized = img.convert('RGB').resize(size, Image.LANCZOS)
            images.append(np.asarray(resized))
    return images

Img_list_train = load_images(flist_train)
Img_list_test = load_images(flist_test)
# Stack the per-image arrays into a single array per split.
Img_array_train = np.asarray(Img_list_train)
Img_array_test = np.asarray(Img_list_test)

The categorical target variable `propertyType` is converted to a one-hot encoding.

In [None]:
# One-hot encode both splits against the same sorted list of class labels.
# Encoding each split independently would produce different (and misaligned)
# columns whenever a class happens to be absent from one of the splits.
propertyType_levels = sorted(
    pd.Series(np.concatenate([propertyType_train, propertyType_test])).dropna().unique())
dummy_propertyType_train = pd.get_dummies(
    pd.Series(pd.Categorical(propertyType_train, categories = propertyType_levels)))
dummy_propertyType_test = pd.get_dummies(
    pd.Series(pd.Categorical(propertyType_test, categories = propertyType_levels)))
# Integer class index of every test sample (column position of its one-hot 1).
propertyType_test_fac = np.argmax(np.array(dummy_propertyType_test), axis = 1)

## Multi-Class Classification Using Neural Network

### Multi-Layer Perceptron (MLP) model

In [316]:
def mlp(output_dim):

    '''
    Builds and compiles a multi-layer perceptron classifier with one hidden layer.

    The inputs are multiplied by 1/255, flattened, passed through a dense
    hidden layer of 100 leaky-ReLU units and finally through a softmax
    output layer.

    Parameter:
    output_dim (int): The number of output classes.

    Returns:
    A Keras Sequential model compiled with categorical cross-entropy loss,
    the Adam optimizer and accuracy as the reported metric.
    '''

    network = Sequential([
        Rescaling(1. / 255),
        Flatten(),
        Dense(100, activation = tf.nn.leaky_relu),
        Dense(output_dim, activation = tf.nn.softmax),
    ])
    network.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy'])
    return network

In [317]:
MLP = mlp(dummy_propertyType_train.shape[1])
# The batch size is set once, on the iterator returned by `flow`. Keras' `fit`
# documentation says not to pass `batch_size` when the data is a generator or
# `keras.utils.Sequence`, so the misleading second value has been removed.
MLP.fit(ImageDataGenerator().flow(Img_array_train, dummy_propertyType_train, batch_size = 64), epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1183c0dc2e0>

In [318]:
# Predicted class index per test image: the position of the largest output score.
propertyType_test_pred_MLP = MLP.predict(Img_array_test).argmax(axis = 1)
# Loss and accuracy of the MLP on the held-out test set.
MLP.evaluate(Img_array_test, dummy_propertyType_test)



[1.8216115236282349, 0.28024107217788696]

In [319]:
# Rows are the true classes, columns the classes predicted by the MLP.
confusion_matrix(y_true = propertyType_test_fac, y_pred = propertyType_test_pred_MLP)

array([[  74,    3,  848,    7,   79],
       [  21,   47,  517,   16,   85],
       [  42,    6, 1047,   13,   82],
       [  34,   22,  988,   43,  156],
       [  13,   10,  393,    9,   91]], dtype=int64)

## Convolutional Neural Network

In [373]:
def cnn(output_dim):

    '''
    Builds and compiles a small convolutional neural network classifier.

    The inputs are multiplied by 1/255 and passed through one convolutional
    layer (4 filters, kernel size 2, 'same' padding, leaky ReLU), a max-pooling
    layer with default settings, a flatten step, a dense hidden layer of
    16 leaky-ReLU units and finally a softmax output layer.

    Parameter:
    output_dim (int): The number of output classes.

    Returns:
    A Keras Sequential model compiled with categorical cross-entropy loss,
    the Adam optimizer and accuracy as the reported metric.
    '''

    network = Sequential([
        Rescaling(1. / 255),
        Conv2D(4, 2, padding = 'same', activation = tf.nn.leaky_relu),
        MaxPooling2D(),
        Flatten(),
        Dense(16, activation = tf.nn.leaky_relu),
        Dense(output_dim, activation = tf.nn.softmax),
    ])
    network.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy'])
    return network

In [380]:
CNN = cnn(dummy_propertyType_train.shape[1])
# The batch size is set once, on the iterator returned by `flow`. Keras' `fit`
# documentation says not to pass `batch_size` when the data is a generator or
# `keras.utils.Sequence`, so the misleading second value has been removed.
CNN.fit(ImageDataGenerator().flow(Img_array_train, dummy_propertyType_train, batch_size = 256), epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1186ead2e00>

In [381]:
# Predicted class index per test image: the position of the largest output score.
propertyType_test_pred_CNN = CNN.predict(Img_array_test).argmax(axis = 1)
# Loss and accuracy of the CNN on the held-out test set.
CNN.evaluate(Img_array_test, dummy_propertyType_test)



[1.4578146934509277, 0.3715023696422577]

In [382]:
# Rows are the true classes, columns the classes predicted by the CNN.
confusion_matrix(y_true = propertyType_test_fac, y_pred = propertyType_test_pred_CNN)

array([[417,  49, 418, 112,  15],
       [109, 208, 175, 183,  11],
       [325,  46, 630, 171,  18],
       [216, 116, 449, 445,  17],
       [113,  59, 232,  86,  26]], dtype=int64)

In [None]:
# import dill
# dill.dump_session('Presetting.pkl')
# # dill.load_session('Presetting.pkl')