In [1]:
# Importing of dependencies required for analysis
import pandas as pd
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
import requests
import urllib.parse
import datetime
import time
from sklearn import metrics
import numpy as np

In [2]:
# Read in the uncleaned inactive listings dataset
df = pd.read_csv('Resources_Uncleaned/Inactive_Listings_Raw.csv')
df

Unnamed: 0,LSC,EC,St#,Street Name,Abbr,Dir,Municipality,Community,List Price,Sold Price,...,Fam,Kit,Gar Type,(A/C),Heat,Contract Date,Sold Date,List Brokerage,Co-Op Brokerage,MLS #
0,Sld,,793 Adelaide St W,,,,Toronto C00,Niagara,"$699,999","$780,000",...,N,1.0,,Central Air,Gas,1/13/2021,1/21/2021,ROYAL LEPAG...,CENTURY 21 HE...,C5083064
1,Sld,,75 Major St,,,,Toronto C01,University,"$799,000","$1,225,000",...,Y,2.0,,,Gas,5/29/2020,6/4/2020,ROYAL LEPAG...,RE/MAX REALTR...,C4773721
2,Sld,,81 Major St,,,,Toronto C01,University,"$799,000","$1,203,000",...,N,2.0,,,Gas,11/5/2020,11/6/2020,CENTURY 21 ...,"RIFE REALTY, ...",C4979474
3,Sld,N,784 Adelaide St W,,,,Toronto C01,Niagara,"$899,000","$959,000",...,Y,1.0,,Central Air,Gas,9/22/2020,10/1/2020,RE/MAX WEST...,HOMELIFE CULT...,C4923168
4,Sld,,16 Whitaker Ave,,,,Toronto C01,Niagara,"$899,000","$1,073,000",...,N,1.0,,Central Air,Gas,11/10/2020,11/17/2020,KELLER WILL...,"THE AGENCY, B...",C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31912,Sld,,8 Wardlaw Cres,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",...,N,1.0,Attached,Central Air,Gas,10/3/2021,12/28/2021,CENTURY 21 ...,RE/MAX PARAMO...,W5391840
31913,Sld,,156 Thistle Down Blvd,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",...,N,1.0,Attached,Central Air,Gas,11/12/2021,11/20/2021,SUTTON GROU...,INTERNATIONAL...,W5430335
31914,Sld,,11 Woodlot Cres,,,,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",...,Y,1.0,Attached,Central Air,Gas,12/17/2021,1/3/2022,HOMELIFE CU...,CENTURY 21 GR...,W5458498
31915,Sld,,16 Forest Path Crt,,,,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",...,Y,1.0,Attached,Central Air,Gas,12/2/2021,12/23/2021,WORLD CLASS...,WORLD CLASS R...,W5447590


In [3]:
# Check for null values within the dataframe
df.isnull().sum()

LSC                    0
EC                 24463
St#                    0
Street Name        31917
Abbr               31917
Dir                31917
Municipality           0
Community              0
List Price             0
Sold Price             0
Type                   0
Style                 44
BR                    52
(+)                13163
Wr                     0
Fam                   48
Kit                   50
Gar Type              39
(A/C)                 49
Heat                  50
Contract Date          0
Sold Date              0
List Brokerage         0
Co-Op Brokerage        0
MLS #                  0
dtype: int64

In [4]:
# Convert all NaNs in the Extra Room column / (+) to 0s
df['(+)'] = df['(+)'].fillna(0)

# drop LSC, EC, Street Names, Abbr, Dir, List/Coop Brokerage and Sold Date.
# We remove sold date as feature as sold date does not exist in active listings dataset(the data we want predictions on)
df.drop(['LSC', 'EC', 'Street Name', 'Abbr', 'Dir', 'List Brokerage', 'Co-Op Brokerage', 'Sold Date'], axis=1, inplace=True)

In [5]:
df.isnull().sum()

St#                0
Municipality       0
Community          0
List Price         0
Sold Price         0
Type               0
Style             44
BR                52
(+)                0
Wr                 0
Fam               48
Kit               50
Gar Type          39
(A/C)             49
Heat              50
Contract Date      0
MLS #              0
dtype: int64

In [6]:
# Since out of almost 32,000 data points only sum NaN values exist, we can go ahead and drop them
x_df = df.dropna()
x_df.isnull().sum()

St#               0
Municipality      0
Community         0
List Price        0
Sold Price        0
Type              0
Style             0
BR                0
(+)               0
Wr                0
Fam               0
Kit               0
Gar Type          0
(A/C)             0
Heat              0
Contract Date     0
MLS #             0
dtype: int64

In [7]:
# Now there are no NaNs and total rows are 31862.
x_df

Unnamed: 0,St#,Municipality,Community,List Price,Sold Price,Type,Style,BR,(+),Wr,Fam,Kit,Gar Type,(A/C),Heat,Contract Date,MLS #
0,793 Adelaide St W,Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,1/13/2021,C5083064
1,75 Major St,Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,5/29/2020,C4773721
2,81 Major St,Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,11/5/2020,C4979474
3,784 Adelaide St W,Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,9/22/2020,C4923168
4,16 Whitaker Ave,Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,11/10/2020,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31912,8 Wardlaw Cres,Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,10/3/2021,W5391840
31913,156 Thistle Down Blvd,Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,11/12/2021,W5430335
31914,11 Woodlot Cres,Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/17/2021,W5458498
31915,16 Forest Path Crt,Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/2/2021,W5447590


In [8]:
# rename some columns
x_df = x_df.rename(columns={'St#': 'Address', '(+)':'ER', 'Wr': 'WR', 'Fam': 'FR', 'Kit':'Kitchen', 'Contract Date ': 'Contract_Date', 'Gar Type': 'Gar_Type', '(A/C)':'AC', 'MLS #': 'MLS_ID'})

In [9]:
# Add Toronto to all the street addresses so when the code to find lat/lng runs it will only find addresses in GTA
x_df['Address'] = x_df['Address'].astype(str) + ', Toronto'

In [10]:
# resetting the DataFrame index
x_df = x_df.reset_index()
x_df.drop(['index'], axis=1, inplace=True)
x_df

Unnamed: 0,Address,Municipality,Community,List Price,Sold Price,Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,1/13/2021,C5083064
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,5/29/2020,C4773721
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,11/5/2020,C4979474
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,9/22/2020,C4923168
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,11/10/2020,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,"8 Wardlaw Cres, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,10/3/2021,W5391840
31858,"156 Thistle Down Blvd, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,11/12/2021,W5430335
31859,"11 Woodlot Cres, Toronto",Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/17/2021,W5458498
31860,"16 Forest Path Crt, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,12/2/2021,W5447590


In [11]:
# convert time to date time
x_df['Contract_Date'] = pd.to_datetime(x_df['Contract_Date'])
x_df = x_df.rename(columns={'List Price': 'List_Price($)', 'Sold Price':'Sold_Price($)'})
x_df.dtypes

Address                  object
Municipality             object
Community                object
List_Price($)            object
Sold_Price($)            object
Type                     object
Style                    object
BR                      float64
ER                      float64
WR                        int64
FR                       object
Kitchen                 float64
Gar_Type                 object
AC                       object
Heat                     object
Contract_Date    datetime64[ns]
MLS_ID                   object
dtype: object

In [12]:
x_df

Unnamed: 0,Address,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31857,"8 Wardlaw Cres, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,299,900","$1,300,000",Detached,1 1/2 Storey,4.0,2.0,3,N,1.0,Attached,Central Air,Gas,2021-10-03,W5391840
31858,"156 Thistle Down Blvd, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,349,000","$1,365,000",Detached,Backsplit,4.0,1.0,2,N,1.0,Attached,Central Air,Gas,2021-11-12,W5430335
31859,"11 Woodlot Cres, Toronto",Toronto W10,West Humber-Clairville,"$1,599,900","$1,600,000",Detached,2-Storey,4.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-17,W5458498
31860,"16 Forest Path Crt, Toronto",Toronto W10,Thistletown-Beaumonde Heights,"$1,750,000","$1,850,000",Detached,Backsplit,5.0,0.0,4,Y,1.0,Attached,Central Air,Gas,2021-12-02,W5447590


In [13]:
# MOCKUP MODEL SMALL DATASET
new_df=x_df.head(200)
new_df

Unnamed: 0,Address,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,WR,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID
0,"793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,1.0,,Central Air,Gas,2021-01-13,C5083064
1,"75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,2,Y,2.0,,,Gas,2020-05-29,C4773721
2,"81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,3,N,2.0,,,Gas,2020-11-05,C4979474
3,"784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,1.0,,Central Air,Gas,2020-09-22,C4923168
4,"16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,1.0,,Central Air,Gas,2020-11-10,C4985281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"93 Argyle St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,089,000","$1,125,000",Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,2.0,,Central Air,Gas,2020-09-24,C4926077
196,"159 Wolseley St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,095,000","$1,125,000",Att/Row/Townhouse,2-Storey,4.0,0.0,1,N,1.0,,,Gas,2020-04-02,C4736141
197,"28 Grace St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,099,000","$1,200,000",Semi-Detached,2-Storey,3.0,0.0,2,N,1.0,Detached,Central Air,Gas,2020-03-25,C4731058
198,"1A Bellwoods Ave, Toronto",Toronto C01,Trinity-Bellwoods,"$1,099,000","$1,405,000",Semi-Detached,3-Storey,3.0,0.0,2,N,1.0,,Central Air,Gas,2020-08-11,C4865471


In [14]:
# Using street addresses to find Lat/Lng and adding to dataframe
data=[]
for i in new_df['Address']:
    try: 
        url = "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + i + "+" +"&format=json&limit=1"
        response = requests.get(url).json()
        data.append({"Address":i, "Lat": response[0]["lat"], "Lng": response[0]["lon"]})
    except:
        data.append({"Address":i, "Lat": np.NaN, "Lng": np.NaN})

In [15]:
data_df = pd.DataFrame(data)
data_df

Unnamed: 0,Address,Lat,Lng
0,"793 Adelaide St W, Toronto",43.64380671428571,-79.4094258
1,"75 Major St, Toronto",43.659648485714285,-79.40345567142857
2,"81 Major St, Toronto",43.65977672857143,-79.40350535714286
3,"784 Adelaide St W, Toronto",43.6441171,-79.409249425
4,"16 Whitaker Ave, Toronto",43.643947983509534,-79.40645687386683
...,...,...,...
195,"93 Argyle St, Toronto",43.6465156,-79.4210775
196,"159 Wolseley St, Toronto",43.647409076742576,-79.40655610176812
197,"28 Grace St, Toronto",43.651463750000005,-79.41439255
198,"1A Bellwoods Ave, Toronto",43.6462914,-79.4107627


In [16]:
new_df = new_df.merge(data_df, how = 'inner', on = data_df['Address'])

In [17]:
new_df

Unnamed: 0,key_0,Address_x,Municipality,Community,List_Price($),Sold_Price($),Type,Style,BR,ER,...,FR,Kitchen,Gar_Type,AC,Heat,Contract_Date,MLS_ID,Address_y,Lat,Lng
0,"793 Adelaide St W, Toronto","793 Adelaide St W, Toronto",Toronto C00,Niagara,"$699,999","$780,000",Att/Row/Townhouse,2-Storey,3.0,0.0,...,N,1.0,,Central Air,Gas,2021-01-13,C5083064,"793 Adelaide St W, Toronto",43.64380671428571,-79.4094258
1,"75 Major St, Toronto","75 Major St, Toronto",Toronto C01,University,"$799,000","$1,225,000",Semi-Detached,2-Storey,3.0,0.0,...,Y,2.0,,,Gas,2020-05-29,C4773721,"75 Major St, Toronto",43.659648485714285,-79.40345567142857
2,"81 Major St, Toronto","81 Major St, Toronto",Toronto C01,University,"$799,000","$1,203,000",Semi-Detached,2-Storey,3.0,1.0,...,N,2.0,,,Gas,2020-11-05,C4979474,"81 Major St, Toronto",43.65977672857143,-79.40350535714286
3,"784 Adelaide St W, Toronto","784 Adelaide St W, Toronto",Toronto C01,Niagara,"$899,000","$959,000",Att/Row/Townhouse,2-Storey,3.0,0.0,...,Y,1.0,,Central Air,Gas,2020-09-22,C4923168,"784 Adelaide St W, Toronto",43.6441171,-79.409249425
4,"16 Whitaker Ave, Toronto","16 Whitaker Ave, Toronto",Toronto C01,Niagara,"$899,000","$1,073,000",Att/Row/Townhouse,2-Storey,2.0,0.0,...,N,1.0,,Central Air,Gas,2020-11-10,C4985281,"16 Whitaker Ave, Toronto",43.643947983509534,-79.40645687386683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,"93 Argyle St, Toronto","93 Argyle St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,089,000","$1,125,000",Att/Row/Townhouse,2-Storey,3.0,0.0,...,Y,2.0,,Central Air,Gas,2020-09-24,C4926077,"93 Argyle St, Toronto",43.6465156,-79.4210775
200,"159 Wolseley St, Toronto","159 Wolseley St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,095,000","$1,125,000",Att/Row/Townhouse,2-Storey,4.0,0.0,...,N,1.0,,,Gas,2020-04-02,C4736141,"159 Wolseley St, Toronto",43.647409076742576,-79.40655610176812
201,"28 Grace St, Toronto","28 Grace St, Toronto",Toronto C01,Trinity-Bellwoods,"$1,099,000","$1,200,000",Semi-Detached,2-Storey,3.0,0.0,...,N,1.0,Detached,Central Air,Gas,2020-03-25,C4731058,"28 Grace St, Toronto",43.651463750000005,-79.41439255
202,"1A Bellwoods Ave, Toronto","1A Bellwoods Ave, Toronto",Toronto C01,Trinity-Bellwoods,"$1,099,000","$1,405,000",Semi-Detached,3-Storey,3.0,0.0,...,N,1.0,,Central Air,Gas,2020-08-11,C4865471,"1A Bellwoods Ave, Toronto",43.6462914,-79.4107627


In [18]:
new_df.isnull().sum()

key_0            0
Address_x        0
Municipality     0
Community        0
List_Price($)    0
Sold_Price($)    0
Type             0
Style            0
BR               0
ER               0
WR               0
FR               0
Kitchen          0
Gar_Type         0
AC               0
Heat             0
Contract_Date    0
MLS_ID           0
Address_y        0
Lat              0
Lng              0
dtype: int64

In [19]:
# Model cleaning including removing commas and dollar signs from sold price and list price and adding $ to the columns names
new_df['List_Price($)'] = new_df['List_Price($)'].replace('[\$,]', '', regex=True).astype(float)
new_df['Sold_Price($)'] = new_df['Sold_Price($)'].replace('[\$,]', '', regex=True).astype(float)


In [20]:
new_df["Lng"] = pd.to_numeric(new_df["Lng"])
new_df["Lat"] = pd.to_numeric(new_df["Lat"])
import datetime
import time
z=[]
for i in new_df['Contract_Date']:
    y = time.mktime(i.timetuple())
    z.append(y)
    
new_df['Contract_Date']=z

In [22]:
new_df.drop(['MLS_ID', 'Address_x', 'Address_y'], axis=1, inplace=True)
new_df.dtypes

key_0             object
Municipality      object
Community         object
List_Price($)    float64
Sold_Price($)    float64
Type              object
Style             object
BR               float64
ER               float64
WR                 int64
FR                object
Kitchen          float64
Gar_Type          object
AC                object
Heat              object
Contract_Date    float64
Lat              float64
Lng              float64
dtype: object

In [23]:
new_df.drop(['key_0'], axis=1, inplace=True)

In [24]:
cat = new_df.dtypes[new_df.dtypes == "object"].index.tolist()
cat

['Municipality', 'Community', 'Type', 'Style', 'FR', 'Gar_Type', 'AC', 'Heat']

In [25]:
# define ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
# transform data
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(new_df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = new_df[cat].columns
encode_df.head()

Unnamed: 0,Municipality,Community,Type,Style,FR,Gar_Type,AC,Heat
0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0
1,1.0,2.0,4.0,2.0,1.0,4.0,1.0,1.0
2,1.0,2.0,4.0,2.0,0.0,4.0,1.0,1.0
3,1.0,0.0,0.0,2.0,1.0,4.0,0.0,1.0
4,1.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0


In [26]:
final_df = new_df.merge(encode_df,left_index=True, right_index=True)
final_df

Unnamed: 0,Municipality_x,Community_x,List_Price($),Sold_Price($),Type_x,Style_x,BR,ER,WR,FR_x,...,Lat,Lng,Municipality_y,Community_y,Type_y,Style_y,FR_y,Gar_Type_y,AC_y,Heat_y
0,Toronto C00,Niagara,699999.0,780000.0,Att/Row/Townhouse,2-Storey,3.0,0.0,1,N,...,43.643807,-79.409426,0.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0
1,Toronto C01,University,799000.0,1225000.0,Semi-Detached,2-Storey,3.0,0.0,2,Y,...,43.659648,-79.403456,1.0,2.0,4.0,2.0,1.0,4.0,1.0,1.0
2,Toronto C01,University,799000.0,1203000.0,Semi-Detached,2-Storey,3.0,1.0,3,N,...,43.659777,-79.403505,1.0,2.0,4.0,2.0,0.0,4.0,1.0,1.0
3,Toronto C01,Niagara,899000.0,959000.0,Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,...,43.644117,-79.409249,1.0,0.0,0.0,2.0,1.0,4.0,0.0,1.0
4,Toronto C01,Niagara,899000.0,1073000.0,Att/Row/Townhouse,2-Storey,2.0,0.0,2,N,...,43.643948,-79.406457,1.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,Toronto C01,Trinity-Bellwoods,1089000.0,1125000.0,Att/Row/Townhouse,2-Storey,3.0,0.0,2,Y,...,43.646516,-79.421077,1.0,1.0,0.0,2.0,1.0,4.0,0.0,1.0
200,Toronto C01,Trinity-Bellwoods,1095000.0,1125000.0,Att/Row/Townhouse,2-Storey,4.0,0.0,1,N,...,43.647409,-79.406556,1.0,1.0,0.0,2.0,0.0,4.0,1.0,1.0
201,Toronto C01,Trinity-Bellwoods,1099000.0,1200000.0,Semi-Detached,2-Storey,3.0,0.0,2,N,...,43.651464,-79.414393,1.0,1.0,4.0,2.0,0.0,3.0,0.0,1.0
202,Toronto C01,Trinity-Bellwoods,1099000.0,1405000.0,Semi-Detached,3-Storey,3.0,0.0,2,N,...,43.646291,-79.410763,1.0,1.0,4.0,3.0,0.0,4.0,0.0,1.0


In [27]:
final_df.columns

Index(['Municipality_x', 'Community_x', 'List_Price($)', 'Sold_Price($)',
       'Type_x', 'Style_x', 'BR', 'ER', 'WR', 'FR_x', 'Kitchen', 'Gar_Type_x',
       'AC_x', 'Heat_x', 'Contract_Date', 'Lat', 'Lng', 'Municipality_y',
       'Community_y', 'Type_y', 'Style_y', 'FR_y', 'Gar_Type_y', 'AC_y',
       'Heat_y'],
      dtype='object')

In [28]:
final_df = final_df.drop(['Municipality_x', 'Community_x', 'Type_x', 'Style_x', 'FR_x', 'Gar_Type_x', 'AC_x', 'Heat_x'] ,1)

  """Entry point for launching an IPython kernel.


In [29]:
# Split our preprocessed data into our features and target arrays
y = final_df["Sold_Price($)"].values
X = final_df.drop(["Sold_Price($)"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [31]:
#Liner Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()  
regressor.fit(X_train_scaled, y_train)

#evaluate the model (intercept and slope)
regressor.intercept_
regressor.coef_

array([ 5.83079364e+05,  4.64955049e+04,  3.78368884e+03,  6.20205717e+04,
        8.79529997e+03,  6.46072690e+03,  2.50824150e+06, -2.49464931e+06,
        8.02949231e+03, -1.32546605e+04,  8.12198024e+03,  1.71381289e+04,
       -1.54586952e+04,  1.02921557e+04,  2.21249284e+03,  4.21707835e+04])

In [32]:
y_predd = regressor.predict(X_test_scaled)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predd})
df1 = df.head(10)
df1

Unnamed: 0,Actual,Predicted
0,3650000.0,3581769.0
1,1615000.0,1635842.0
2,1650000.0,1609843.0
3,985000.0,886528.6
4,1669000.0,1706393.0
5,1203000.0,1108868.0
6,2750000.0,2580279.0
7,1550000.0,1415569.0
8,1250000.0,1497396.0
9,1180000.0,1379295.0


In [33]:
from sklearn import metrics
import numpy as np

print('Mean Absolute Error: {:.2f}'.format(metrics.mean_absolute_error(y_test, y_predd))) 
print('Mean Squared Error:{:.2f}'.format(metrics.mean_squared_error(y_test, y_predd)))  
print('Root Mean Squared Error:{:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_predd))))
print('Variance score is: {:.2f}'.format(metrics.explained_variance_score(y_test,y_predd)))

Mean Absolute Error: 117737.32
Mean Squared Error:20377477645.71
Root Mean Squared Error:142749.70
Variance score is: 0.93


In [34]:
print('Linear Regression Model:')
print("Train Score {:.2f}".format(regressor.score(X_train_scaled,y_train)))
print("Test Score {:.2f}".format(regressor.score(X_test_scaled, y_test)))

Linear Regression Model:
Train Score 0.95
Test Score 0.93
