In [2]:
from data_cleaner import get_cleaned_data, flatten_data
import pandas as pd
import numpy as np
import torch

### Data Cleaning
- `get_cleaned_data()` removes bad characters and outlier data, including blanks and data for single-room listings.
- `flatten_data()` flattens the building and unit amenities so that each individual amenity gets its own column.

In [3]:
# Fetch the cleaned listing data via data_cleaner.get_cleaned_data (what it
# reads and how it cleans is defined in that module, not in this notebook).
cleaned_data = get_cleaned_data()
# Flatten the cleaned data; per the notebook notes this expands the building
# and unit amenities into one column per amenity -- TODO confirm against
# data_cleaner.flatten_data.
flattened_data = flatten_data(cleaned_data)
# Wrap the flattened result in a DataFrame; `df` is the frame the following
# cells inspect and compute statistics on.
df = pd.DataFrame(flattened_data)

In [4]:
# Show the column index under a heading; sep="\n" puts the heading and the
# index on separate lines, exactly as two consecutive print calls would.
print("Printing columns:", df.columns, sep="\n")

Printing columns:
Index(['Building', 'Address', 'Listing', 'Bed', 'Bath', 'SqFt', 'Price',
       'Pets', 'Latitude', 'Longitude', 'Balcony', 'In Unit Laundry',
       'Air Conditioning', 'High Ceilings', 'Furnished', 'Hardwood Floor',
       'Controlled Access', 'Fitness Center', 'Swimming Pool', 'Roof Deck',
       'Storage', 'Residents Lounge', 'Outdoor Space'],
      dtype='object')


In [5]:
# Preview the first two rows under a heading; sep="\n" keeps the heading on
# its own line, matching the output of two consecutive print calls.
print("Printing first 2 rows:", df.head(2), sep="\n")

Printing first 2 rows:
             Building                                  Address    Listing  \
0  20 Samuel Wood Way  20 Samuel Wood Way, Toronto, ON M9B 0C8     Studio   
1  20 Samuel Wood Way  20 Samuel Wood Way, Toronto, ON M9B 0C8  1 Bedroom   

   Bed  Bath  SqFt  Price  Pets  Latitude  Longitude  ...  High Ceilings  \
0    0   1.0   370   2225     1   43.6959    -79.552  ...              0   
1    1   1.0   540   2625     1   43.6959    -79.552  ...              0   

   Furnished  Hardwood Floor  Controlled Access  Fitness Center  \
0          0               0                  1               1   
1          0               0                  1               1   

   Swimming Pool  Roof Deck  Storage  Residents Lounge  Outdoor Space  
0              0          0        1                 1              1  
1              0          0        1                 1              1  

[2 rows x 23 columns]


### Standardize the Data
We use standard scaling, $z = (x - \mu) / \sigma$, to standardize the values before passing them to the model.

In [6]:
from constants import TableHeaders

In [7]:
# Column label for square footage, read from the shared TableHeaders enum.
SQFT = TableHeaders.SQFT.value

# Extract the column as a NumPy array and compute the statistics needed for
# standard scaling.
np_sqft = df[SQFT].to_numpy()
sqft_mean = np.mean(np_sqft)
sqft_std = np.std(np_sqft)  # NumPy default ddof=0, i.e. population standard deviation

print("Mean SQFT:", sqft_mean)
print("STDEV SQFT:", sqft_std)

Mean SQFT: 849.9989795918367
STDEV SQFT: 563.6767772248846
