# Cleaning Dataset

In [1]:
# Importing of dependencies required for analysis
import pandas as pd
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
import requests
import urllib.parse
import datetime
import time
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw (uncleaned) inactive-listings export into a DataFrame
df = pd.read_csv(filepath_or_buffer='Resources_Uncleaned/Inactive_Listings_Raw.csv')
# Preview the loaded frame
df

Unnamed: 0,LSC,EC,St#,Street Name,Abbr,Dir,Municipality,Community,List Price,Sold Price,...,Fam,Kit,Gar Type,(A/C),Heat,Contract Date,Sold Date,List Brokerage,Co-Op Brokerage,MLS #
0,Sld,,793 Adelaide St W,,,,Toronto C00,Niagara,"$699,999","$780,000",...,N,1.0,,Central Air,Gas,1/13/2021,1/21/2021,ROYAL LEPAG...,CENTURY 21 HE...,C5083064
1,Sld,,75 Major St,,,,Toronto C01,University,"$799,000","$1,225,000",...,Y,2.0,,,Gas,5/29/2020,6/4/2020,ROYAL LEPAG...,RE/MAX REALTR...,C4773721
2,Sld,,81 Major St,,,,Toronto C01,University,"$799,000","$1,203,000",...,N,2.0,,,Gas,11/5/2020,11/6/2020,CENTURY 21 ...,"RIFE REALTY, ...",C4979474
3,Sld,N,784 Adelaide St W,,,,Toronto C01,Niagara,"$899,000","$959,000",...,Y,1.0,,Central Air,Gas,9/22/2020,10/1/2020,RE/MAX WEST...,HOMELIFE CULT...,C4923168
4,Sld,,16 Whitaker Ave,,,,Toronto C01,Niagara,"$899,000","$1,073,000",...,N,1.0,,Central Air,Gas,11/10/2020,11/17/2020,KELLER WILL...,"THE AGENCY, B...",C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31912,Sld,,8 Wardlaw Cres,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",...,N,1.0,Attached,Central Air,Gas,10/3/2021,12/28/2021,CENTURY 21 ...,RE/MAX PARAMO...,W5391840
31913,Sld,,156 Thistle Down Blvd,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",...,N,1.0,Attached,Central Air,Gas,11/12/2021,11/20/2021,SUTTON GROU...,INTERNATIONAL...,W5430335
31914,Sld,,11 Woodlot Cres,,,,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",...,Y,1.0,Attached,Central Air,Gas,12/17/2021,1/3/2022,HOMELIFE CU...,CENTURY 21 GR...,W5458498
31915,Sld,,16 Forest Path Crt,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",...,Y,1.0,Attached,Central Air,Gas,12/2/2021,12/23/2021,WORLD CLASS...,WORLD CLASS R...,W5447590


In [3]:
# Count the missing values in every column of the raw frame
df.isna().sum()

LSC                    0
EC                 24463
St#                    0
Street Name        31917
Abbr               31917
Dir                31917
Municipality           0
Community              0
List Price             0
Sold Price             0
Type                   0
Style                 44
BR                    52
(+)                13163
Wr                     0
Fam                   48
Kit                   50
Gar Type              39
(A/C)                 49
Heat                  50
Contract Date          0
Sold Date              0
List Brokerage         0
Co-Op Brokerage        0
MLS #                  0
dtype: int64

In [4]:
# Two steps, chained:
#   1. a missing Extra Room / (+) value becomes 0 instead of NaN
#   2. LSC, EC, Street Name, Abbr, Dir, both brokerage columns and Sold Date are removed.
#      Sold Date is dropped because it does not exist in the active listings dataset
#      (the data we want predictions on), so it cannot be used as a feature.
df = df.fillna({'(+)': 0}).drop(
    columns=[
        'LSC',
        'EC',
        'Street Name',
        'Abbr',
        'Dir',
        'List Brokerage',
        'Co-Op Brokerage',
        'Sold Date',
    ]
)

In [5]:
# Re-count the missing values per column now that the unused columns are gone
df.isna().sum()

St#                0
Municipality       0
Community          0
List Price         0
Sold Price         0
Type               0
Style             44
BR                52
(+)                0
Wr                 0
Fam               48
Kit               50
Gar Type          39
(A/C)             49
Heat              50
Contract Date      0
MLS #              0
dtype: int64

In [6]:
# Out of almost 32,000 data points only some rows still contain NaN values,
# so every row with at least one missing value is discarded
x_df = df.dropna(axis=0, how='any')
# Confirm that no column has missing values any more
x_df.isna().sum()

St#               0
Municipality      0
Community         0
List Price        0
Sold Price        0
Type              0
Style             0
BR                0
(+)               0
Wr                0
Fam               0
Kit               0
Gar Type          0
(A/C)             0
Heat              0
Contract Date     0
MLS #             0
dtype: int64

In [7]:
# Now there are no NaNs and total rows are 31862.
# The index still carries the original row labels here; it is reset in a later cell.
x_df

Unnamed: 0,St#,Municipality,Community,List Price,Sold Price,Type,Style,BR,(+),Wr,Fam,Kit,Gar Type,(A/C),Heat,Contract Date,MLS #
0,793 Adelaide St W,Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,1/13/2021,C5083064
1,75 Major St,Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,5/29/2020,C4773721
2,81 Major St,Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,11/5/2020,C4979474
3,784 Adelaide St W,Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,9/22/2020,C4923168
4,16 Whitaker Ave,Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,11/10/2020,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31912,8 Wardlaw Cres,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,10/3/2021,W5391840
31913,156 Thistle Down Blvd,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,11/12/2021,W5430335
31914,11 Woodlot Cres,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/17/2021,W5458498
31915,16 Forest Path Crt,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/2/2021,W5447590


In [8]:
# Give the abbreviated / symbol-laden raw headers clearer names.
# NOTE(review): the 'Contract Date ' key contains a trailing space - presumably the raw
# CSV header carries one (later cells successfully use 'Contract_Date'); confirm before changing.
x_df = x_df.rename(
    columns={
        'St#': 'Address',
        '(+)': 'ER',
        'Wr': 'WR',
        'Fam': 'FR',
        'Kit': 'Kitchen',
        'Contract Date ': 'Contract_Date',
        'Gar Type': 'Gar_Type',
        '(A/C)': 'AC',
        'MLS #': 'MLS_ID',
    }
)

In [9]:
# Suffix every street address with ", Toronto" so that the lat/lng lookup further down
# searches for the address within Toronto rather than for a same-named street elsewhere
x_df = x_df.assign(Address=lambda frame: frame['Address'].astype(str) + ', Toronto')

In [10]:
# Rebuild a contiguous 0..n-1 index; the old labels (which have gaps after dropna) are discarded
x_df = x_df.reset_index(drop=True)
x_df

Unnamed: 0,Address,Municipality,Community,List Price,Sold Price,Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,1/13/2021,C5083064
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,5/29/2020,C4773721
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,11/5/2020,C4979474
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,9/22/2020,C4923168
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,11/10/2020,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,"8 Wardlaw Cres, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,10/3/2021,W5391840
31858,"156 Thistle Down Blvd, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,11/12/2021,W5430335
31859,"11 Woodlot Cres, Toronto",Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/17/2021,W5458498
31860,"16 Forest Path Crt, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/2/2021,W5447590


In [11]:
# Parse the contract dates into datetime64 values and give the two price columns
# underscore-separated names that carry the currency unit
x_df = x_df.assign(Contract_Date=pd.to_datetime(x_df['Contract_Date'])).rename(
    columns={'List Price': 'List_Price($)', 'Sold Price': 'Sold_Price($)'}
)
# Inspect the resulting column dtypes
x_df.dtypes

Address                  object
Municipality             object
Community                object
List_Price($)            object
Sold_Price($)            object
Type                     object
Style                    object
BR                      float64
ER                      float64
WR                        int64
FR                       object
Kitchen                 float64
Gar_Type                 object
AC                       object
Heat                     object
Contract_Date    datetime64[ns]
MLS_ID                   object
dtype: object

In [12]:
# Preview the frame after the date conversion and the price-column renames
x_df

Unnamed: 0,Address,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,"8 Wardlaw Cres, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840
31858,"156 Thistle Down Blvd, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335
31859,"11 Woodlot Cres, Toronto",Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498
31860,"16 Forest Path Crt, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590


In [13]:
# Use the street addresses to look up Lat/Lng with the public Nominatim (OpenStreetMap)
# search API. Results are collected in three lists:
#   data -> one {"Address", "Lat", "Lng"} record per input address, in input order
#           (Lat/Lng are NaN when the lookup failed)
#   s    -> addresses that were geocoded successfully
#   f    -> addresses for which no coordinates could be obtained
NOMINATIM_SEARCH_URL = "https://nominatim.openstreetmap.org/search"
# The Nominatim usage policy requires an identifying User-Agent and at most 1 request/second
NOMINATIM_USER_AGENT = "toronto-listings-cleaning-notebook/1.0"
NOMINATIM_MIN_INTERVAL_S = 1.0
NOMINATIM_TIMEOUT_S = 30

data = []
s = []
f = []
with requests.Session() as session:
    session.headers.update({"User-Agent": NOMINATIM_USER_AGENT})
    for address in x_df['Address']:
        try:
            # Passing the query through `params` lets requests URL-encode the address
            # (spaces, apostrophes, '#', '&', ...) instead of concatenating it raw
            reply = session.get(
                NOMINATIM_SEARCH_URL,
                params={"q": address, "format": "json", "addressdetails": 1, "limit": 1},
                timeout=NOMINATIM_TIMEOUT_S,
            )
            reply.raise_for_status()
            best_match = reply.json()[0]  # IndexError when nothing was found
            lat, lng = best_match["lat"], best_match["lon"]
        except (requests.RequestException, ValueError, IndexError, KeyError):
            # Network/HTTP error, non-JSON body, empty result list or malformed record:
            # keep the row with missing coordinates and remember the address as failed.
            # Only these expected failures are caught, so Ctrl-C still interrupts the loop.
            data.append({"Address": address, "Lat": np.nan, "Lng": np.nan})
            f.append(address)
        else:
            data.append({"Address": address, "Lat": lat, "Lng": lng})
            s.append(address)
        # Throttle to respect the service's rate limit (one request per second means
        # this cell runs for several hours on the full ~32k addresses)
        time.sleep(NOMINATIM_MIN_INTERVAL_S)

In [14]:
# Report how many lookups succeeded (first line) and how many failed (second line)
print(len(s), len(f), sep="\n")

30718
1144


In [15]:
# Turn the list of geocoding records into a DataFrame ...
data_df = pd.DataFrame(data=data)
# ... and count how many addresses ended up without coordinates
data_df.isna().sum()

Address       0
Lat        1144
Lng        1144
dtype: int64

In [16]:
# Preview the geocoding results (one row per address with its Lat/Lng)
data_df

Unnamed: 0,Address,Lat,Lng
0,"793 Adelaide St W, Toronto",43.64380671428571,-79.4094258
1,"75 Major St, Toronto",43.659648485714285,-79.40345567142857
2,"81 Major St, Toronto",43.65977672857143,-79.40350535714286
3,"784 Adelaide St W, Toronto",43.6441171,-79.409249425
4,"16 Whitaker Ave, Toronto",43.643947983509534,-79.40645687386683
...,...,...,...
31857,"8 Wardlaw Cres, Toronto",43.739634607486565,-79.56562064679817
31858,"156 Thistle Down Blvd, Toronto",43.74098727668199,-79.55054547910042
31859,"11 Woodlot Cres, Toronto",43.73290208585937,-79.61406852542748
31860,"16 Forest Path Crt, Toronto",43.7419279,-79.5556235


In [17]:
# Display every address for which the lat/lng lookup failed
# (this prints the whole list of failures, not just a sample)
f

['131 Beatrice St W, Toronto',
 '23 St Patricks Sq, Toronto',
 "253 St Helen's Ave, Toronto",
 "189 St Helen's Ave, Toronto",
 "224 St Helen's Ave, Toronto",
 "211 St Helen's Ave, Toronto",
 "218 St Helen's Ave, Toronto",
 "247 St Helen's Ave, Toronto",
 '5 Mc Master Ave, Toronto',
 '26 Mc Master Ave, Toronto',
 '45 Gloucester Grve, Toronto',
 '26 Gloucester Grve, Toronto',
 '51 Gloucester Grve, Toronto',
 '17 Connaught Circ, Toronto',
 '570 Arlington Ave W, Toronto',
 '11 Connaught Circ, Toronto',
 '194 Gloucester Grve, Toronto',
 '103 Gloucester Grve, Toronto',
 '324 Glenholme Ave N, Toronto',
 '363 Lauder Ave E, Toronto',
 '61 Gloucester Grve, Toronto',
 '391 Winona Dr W, Toronto',
 '539 Vaughan Rd E, Toronto',
 '151 Gloucester Grve, Toronto',
 '548 Old Orchard Grve, Toronto',
 '368 Old Orchard Grve, Toronto',
 '585 Old Orchard Grve, Toronto',
 '430 Old Orchard Grve, Toronto',
 '387 Old Orchard Grve, Toronto',
 '562 Old Orchard Grve, Toronto',
 '369 Old Orchard Grve, Toronto',
 '455

In [29]:
# Collect the rows whose geocoding failed, i.e. where both Lat and Lng are missing
x = data_df.loc[data_df[['Lat', 'Lng']].isna().all(axis=1)]
x

Unnamed: 0,Address,Lat,Lng
557,"131 Beatrice St W, Toronto",,
873,"23 St Patricks Sq, Toronto",,
963,"253 St Helen's Ave, Toronto",,
968,"189 St Helen's Ave, Toronto",,
981,"224 St Helen's Ave, Toronto",,
...,...,...,...
31801,"24 Autumn Glen Circ, Toronto",,
31805,"53 Woolenscote Circ, Toronto",,
31812,"120 Clearbrooke Circ, Toronto",,
31833,"186 Cabernet Circ, Toronto",,


In [33]:
# Write the geocoding results (Address, Lat, Lng) to a CSV file
output_data_file_1 = 'Lat_Lng/Lat_Lng.csv'
data_df.to_csv(path_or_buf=output_data_file_1)
# TODO: in a future iteration, also try to resolve the few addresses that still lack lat/lng

In [36]:
# Attach the lat/lng records to the cleaned listings by aligning the two frames on their
# row index. Both frames have an 'Address' column, so the result carries
# 'Address_x' (from x_df) and 'Address_y' (from data_df).
merge_df = pd.merge(x_df, data_df, how='inner', left_index=True, right_index=True)
merge_df

Unnamed: 0,Address_x,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID,Address_y,Lat,Lng
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064,"793 Adelaide St W, Toronto",43.64380671428571,-79.4094258
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721,"75 Major St, Toronto",43.659648485714285,-79.40345567142857
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474,"81 Major St, Toronto",43.65977672857143,-79.40350535714286
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168,"784 Adelaide St W, Toronto",43.6441171,-79.409249425
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281,"16 Whitaker Ave, Toronto",43.643947983509534,-79.40645687386683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,"8 Wardlaw Cres, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840,"8 Wardlaw Cres, Toronto",43.739634607486565,-79.56562064679817
31858,"156 Thistle Down Blvd, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335,"156 Thistle Down Blvd, Toronto",43.74098727668199,-79.55054547910042
31859,"11 Woodlot Cres, Toronto",Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498,"11 Woodlot Cres, Toronto",43.73290208585937,-79.61406852542748
31860,"16 Forest Path Crt, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590,"16 Forest Path Crt, Toronto",43.7419279,-79.5556235


In [48]:
# Sanity check of the index-based merge: keep only the rows where the listing address and
# the geocoded address are identical, then count the non-null values per column
comparison_column = merge_df.loc[merge_df["Address_x"].eq(merge_df["Address_y"])]
comparison_column.count()

Address_x        31862
Municipality     31862
Community        31862
List_Price($)    31862
Sold_Price($)    31862
Type             31862
Style            31862
BR               31862
ER               31862
WR               31862
FR               31862
Kitchen          31862
Gar_Type         31862
AC               31862
Heat             31862
Contract_Date    31862
MLS_ID           31862
Address_y        31862
Lat              30718
Lng              30718
dtype: int64

In [49]:
# Final cleaning: keep a single address column by discarding 'Address_x'
# and renaming 'Address_y' back to 'Address'
merge_df = merge_df.drop(columns=['Address_x']).rename(columns={'Address_y': 'Address'})
merge_df

Unnamed: 0,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID,Address,Lat,Lng
0,Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064,"793 Adelaide St W, Toronto",43.64380671428571,-79.4094258
1,Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721,"75 Major St, Toronto",43.659648485714285,-79.40345567142857
2,Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474,"81 Major St, Toronto",43.65977672857143,-79.40350535714286
3,Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168,"784 Adelaide St W, Toronto",43.6441171,-79.409249425
4,Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281,"16 Whitaker Ave, Toronto",43.643947983509534,-79.40645687386683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840,"8 Wardlaw Cres, Toronto",43.739634607486565,-79.56562064679817
31858,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335,"156 Thistle Down Blvd, Toronto",43.74098727668199,-79.55054547910042
31859,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498,"11 Woodlot Cres, Toronto",43.73290208585937,-79.61406852542748
31860,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590,"16 Forest Path Crt, Toronto",43.7419279,-79.5556235


In [54]:
# Count the non-null entries per column; columns that still contain nulls show a lower count
merge_df.notna().sum()

Municipality     31862
Community        31862
List_Price($)    31862
Sold_Price($)    31862
Type             31862
Style            31862
BR               31862
ER               31862
WR               31862
FR               31862
Kitchen          31862
Gar_Type         31862
AC               31862
Heat             31862
Contract_Date    31862
MLS_ID           31862
Address          31862
Lat              30718
Lng              30718
dtype: int64

In [55]:
# Save the merged frame as a CSV in the Resources_Cleaned folder; this file is used for
# further analysis. The row index is written too, so it comes back as an unnamed
# first column when the file is read again.
output_data_file = "Resources_Cleaned/Inactive_Listings.csv"
merge_df.to_csv(path_or_buf=output_data_file)

In [2]:
# Reload the cleaned dataset (including lat/lng) for the last cleaning step:
# removing the rows whose lat/lng values are missing
exp_df = pd.read_csv(filepath_or_buffer='Resources_Cleaned/Inactive_Listings.csv')
exp_df

Unnamed: 0.1,Unnamed: 0,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID,Address,Lat,Lng
0,0,Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064,"793 Adelaide St W, Toronto",43.643807,-79.409426
1,1,Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721,"75 Major St, Toronto",43.659648,-79.403456
2,2,Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474,"81 Major St, Toronto",43.659777,-79.403505
3,3,Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168,"784 Adelaide St W, Toronto",43.644117,-79.409249
4,4,Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281,"16 Whitaker Ave, Toronto",43.643948,-79.406457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,31857,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840,"8 Wardlaw Cres, Toronto",43.739635,-79.565621
31858,31858,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335,"156 Thistle Down Blvd, Toronto",43.740987,-79.550545
31859,31859,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498,"11 Woodlot Cres, Toronto",43.732902,-79.614069
31860,31860,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590,"16 Forest Path Crt, Toronto",43.741928,-79.555623


In [3]:
# Count the missing values per column in the reloaded frame
exp_df.isna().sum()

Unnamed: 0          0
Municipality        0
Community           0
List_Price($)       0
Sold_Price($)       0
Type                0
Style               0
BR                  0
ER                  0
WR                  0
FR                  0
Kitchen             0
Gar_Type            0
AC                  0
Heat                0
Contract_Date       0
MLS_ID              0
Address             0
Lat              1144
Lng              1144
dtype: int64

In [4]:
# Remove the 'Unnamed: 0' column (the index that was saved into the CSV), then discard
# every row that still has a missing value
exp_df = exp_df.drop(columns=['Unnamed: 0']).dropna()
exp_df

Unnamed: 0,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID,Address,Lat,Lng
0,Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064,"793 Adelaide St W, Toronto",43.643807,-79.409426
1,Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721,"75 Major St, Toronto",43.659648,-79.403456
2,Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474,"81 Major St, Toronto",43.659777,-79.403505
3,Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168,"784 Adelaide St W, Toronto",43.644117,-79.409249
4,Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281,"16 Whitaker Ave, Toronto",43.643948,-79.406457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840,"8 Wardlaw Cres, Toronto",43.739635,-79.565621
31858,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335,"156 Thistle Down Blvd, Toronto",43.740987,-79.550545
31859,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498,"11 Woodlot Cres, Toronto",43.732902,-79.614069
31860,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590,"16 Forest Path Crt, Toronto",43.741928,-79.555623


In [5]:
# Verify that no missing values remain in any column
exp_df.isna().sum()

Municipality     0
Community        0
List_Price($)    0
Sold_Price($)    0
Type             0
Style            0
BR               0
ER               0
WR               0
FR               0
Kitchen          0
Gar_Type         0
AC               0
Heat             0
Contract_Date    0
MLS_ID           0
Address          0
Lat              0
Lng              0
dtype: int64

In [6]:
# Inspect the column dtypes of the final frame
exp_df.dtypes

Municipality      object
Community         object
List_Price($)     object
Sold_Price($)     object
Type              object
Style             object
BR               float64
ER               float64
WR                 int64
FR                object
Kitchen          float64
Gar_Type          object
AC                object
Heat              object
Contract_Date     object
MLS_ID            object
Address           object
Lat              float64
Lng              float64
dtype: object

In [7]:
# Save the final frame as a CSV in the Resources_Cleaned folder; this file is used for
# further analysis. The row index is written as the first column of the file.
output_data_file_new = "Resources_Cleaned/Inactive_Listings_Final.csv"
exp_df.to_csv(path_or_buf=output_data_file_new)