## Mount the Drive, and Change to Google Drive Folder

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True)

# %cd /content/drive/MyDrive/MSc.-Dissertations/1/Files
%ls

 Volume in drive C has no label.
 Volume Serial Number is E858-006A

 Directory of c:\Users\Robin\Desktop\MSc.-Dissertations\1\Files

01/06/2023  21:06    <DIR>          .
01/06/2023  21:07    <DIR>          ..
01/06/2023  21:06            28,474 classification.ipynb
01/06/2023  21:06           983,970 flist.txt
01/06/2023  21:06         4,693,058 properties.csv
01/06/2023  21:06             4,527 randomsample.ipynb
01/06/2023  21:06            30,793 Robin.ipynb
01/06/2023  21:07    <DIR>          street_view
               5 File(s)      5,740,822 bytes
               3 Dir(s)  396,901,695,488 bytes free


## Import Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Count the Number of Files, and Take Random Samples from the Image Files

In [3]:
# !ls street_view
# count how many files and write the filenames into a file
# !ls street_view -1 | wc -l 
# !ls street_view/*.jpg > flist.txt
flist = list(pd.read_csv('flist.txt', header = None)[0])

# Set seed so sample is reproducible 
random.seed(99)  # set this to an integer value!!!
nsamp = 10
flist_sub = random.sample(flist, nsamp)
flist = flist_sub

# print(flist)

## Overview of the `properties` Dataset

In [4]:
properties = pd.read_csv('properties.csv')
properties.head()

Unnamed: 0.1,Unnamed: 0,address,propertyType,bedrooms,detailUrl,location_lat,location_lng,property_id
0,0,"12, Gorsey Brigg, Dronfield Woodhouse, Dronfie...",Terraced,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29986,-1.49446,60d9dd15-c5a0-4d9c-a341-a1d47add49d5
1,0,"5, Highgate Lane, Dronfield, Derbyshire S18 1UB",Detached,4.0,https://www.rightmove.co.uk/house-prices/detai...,53.29135,-1.45975,4a586e80-181a-4b82-b5c3-2d789436bb14
2,0,"125, Gosforth Lane, Dronfield, Derbyshire S18 1RB",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29763,-1.47573,93680b6c-237e-44d3-8f40-959a14b80cad
3,0,"80, Shakespeare Crescent, Dronfield, Derbyshir...",Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,53.29259,-1.45644,5d49758b-f148-4d06-bbae-3eb23f5c68fb
4,0,"21, Gainsborough Road, Dronfield, Derbyshire S...",Detached,,https://www.rightmove.co.uk/house-prices/detai...,53.2974,-1.48503,4645f5eb-de7c-474f-8d7e-b59fa8c55f19


In [5]:
properties.propertyType = properties.propertyType.astype('category')
properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17550 entries, 0 to 17549
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Unnamed: 0    17550 non-null  int64   
 1   address       17550 non-null  object  
 2   propertyType  17550 non-null  category
 3   bedrooms      11505 non-null  float64 
 4   detailUrl     17550 non-null  object  
 5   location_lat  17550 non-null  float64 
 6   location_lng  17550 non-null  float64 
 7   property_id   17550 non-null  object  
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 977.2+ KB


In [6]:
properties.describe()

Unnamed: 0.1,Unnamed: 0,bedrooms,location_lat,location_lng
count,17550.0,11505.0,17550.0,17550.0
mean,0.0,2.871186,52.912264,-2.330492
std,0.0,1.010339,1.83383,1.262468
min,0.0,0.0,50.61708,-4.26895
25%,0.0,2.0,51.23283,-3.06729
50%,0.0,3.0,53.095885,-2.658955
75%,0.0,3.0,53.84676,-1.71275
max,0.0,6.0,55.91054,0.71999


In [7]:
properties.propertyType.value_counts()

Detached         4134
Semi-Detached    4056
Unknown          3900
Terraced         3666
Flat             1794
Name: propertyType, dtype: int64

## A Subsample of the `properties` Dataset

In [8]:
flist_id = list(map(lambda string: string[16 : -4], flist))
properties_sub = pd.DataFrame(properties.loc[properties['property_id'].isin(flist_id)])
Img_mat = np.array(list(map(lambda x: np.array(Image.open(x)).reshape(-1), flist))) / 255
Img_list = list(Img_mat)
properties_sub = properties_sub.assign(Images = Img_list)
properties_sub.head()

Unnamed: 0.1,Unnamed: 0,address,propertyType,bedrooms,detailUrl,location_lat,location_lng,property_id,Images
2550,0,"64, Queens Crescent, Livingston, West Lothian ...",Unknown,1.0,https://www.rightmove.co.uk/house-prices/detai...,55.89959,-3.54215,7b947570-ae42-418b-a43f-f571b120d93c,"[0.7450980392156863, 0.7647058823529411, 0.780..."
4740,0,"11, Meadow Drive, Aughton, Ormskirk, Lancashir...",Detached,4.0,https://www.rightmove.co.uk/house-prices/detai...,53.55281,-2.90198,bcbdc03c-25e2-4fb7-9af7-5bd030592b45,"[0.8, 0.8549019607843137, 0.8901960784313725, ..."
5282,0,"13, Queens Crescent, Stoke-sub-hamdon, Somerse...",Semi-Detached,2.0,https://www.rightmove.co.uk/house-prices/detai...,50.95956,-2.74961,621bbc77-62df-40fd-af32-09c779be9958,"[0.5372549019607843, 0.5450980392156862, 0.698..."
9038,0,"1, Beckenham Terrace, North Street, Westbourne...",Terraced,,https://www.rightmove.co.uk/house-prices/detai...,50.8644,-0.92514,2afe400f-f2e3-4b1a-88fe-5567c7dc84d7,"[0.6980392156862745, 0.796078431372549, 0.9137..."
9040,0,"24, Kelsey Avenue, Southbourne, Emsworth, West...",Semi-Detached,3.0,https://www.rightmove.co.uk/house-prices/detai...,50.85245,-0.9068,bb9597b6-655e-448e-b6a6-79fbe1c26c43,"[0.6705882352941176, 0.6078431372549019, 0.505..."


## PCA

In [9]:
pca = PCA()
pc_Img_mat = pca.fit_transform(Img_mat)
pc_Img_mat

array([[-2.58328081e+01,  2.90601485e+01,  3.65923368e+01,
        -3.04087024e+01,  5.74587132e+01, -3.37174087e+01,
        -1.66891835e+01, -2.33158429e+01,  1.83982706e+00,
         3.59487343e-13],
       [-1.49505521e+01,  1.33623161e+00, -6.35797349e+01,
        -1.21680330e+01,  2.73326787e+00, -1.30182083e+01,
         5.49936171e+01, -2.64699100e+01,  5.30753824e+00,
         3.59487343e-13],
       [ 3.95614674e+01, -2.68853089e+01, -5.77472796e+00,
         3.70627971e+01, -2.57518145e+01, -4.82558144e+01,
        -2.07685901e+01, -1.63727586e+01, -3.14047684e+01,
         3.59487343e-13],
       [ 4.96397227e+01,  1.00905468e+02, -4.68221767e+00,
         5.63545190e+00, -2.19064700e+01,  1.39429506e+01,
        -4.15793433e+00,  8.96629860e+00, -1.00726227e+00,
         3.59487343e-13],
       [ 9.81425117e+01, -4.33599182e+01,  2.56012576e+01,
        -3.20109967e+01,  5.47051707e+00,  3.83480364e+01,
         6.60300241e+00, -1.64555939e+01,  3.51823889e+00,
         3.