## Load Data

In [1]:
# List the files in the notebook's working directory (shell escape).
!ls

LICENSE
Mod4_Project_Tino.ipynb
README.md
column_names.md
kc_housing_data_for_feat_engineering_lab.csv


In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise the displayed-column limit to 300 so wide frames are shown in full.
pd.set_option('display.max_columns', 300)

In [32]:
# Raw housing table read from the CSV in the working directory.
# Later cells work on copies of this frame, not on the frame itself.
df_orig = pd.read_csv(
    filepath_or_buffer="kc_housing_data_for_feat_engineering_lab.csv",
)

In [33]:
# Preview the first five rows of the raw table.
df_orig.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_old,year_sold,since_sold,price_log
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,62,2014,3,12.309982
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,66,2014,3,13.195614
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,84,2015,2,12.100712
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,52,2014,3,13.311329
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,30,2015,2,13.142166


## Feature Engineering

In [49]:
# Work on a deep copy so the raw frame stays unmodified.
df = df_orig.copy(deep=True)

### Distance to biggest Employers
We assume housing prices are affected by the distance to the biggest employers in the area. The three biggest employers are:
* Boeing
* Microsoft
* Amazon <br>
Therefore, three columns are created, each estimating the distance to one employer from the latitude/longitude data.

In [36]:
# Latitude/longitude coordinates used as the location of each employer.
# The keys are also used as the names of the distance columns added to df.
employers_dict = dict(
    boing=dict(lat=47.6213723, long=-122.2890233),
    microsoft=dict(lat=47.6423318, long=-122.1456849),
    amazon=dict(lat=47.6222917, long=-122.3386826),
)
                     

In [37]:
def get_distance(lat, long, t_lat, t_long):
    """Return the straight-line (Euclidean) distance between two coordinate pairs.

    The result is expressed in the same units as the inputs; no unit
    conversion is applied.

    Parameters
    ----------
    lat, long
        Coordinates of the point of interest.
    t_lat, t_long
        Coordinates of the target.

    Returns
    -------
    The square root of the summed squared coordinate differences.
    """
    lat_offset = lat - t_lat
    long_offset = long - t_long
    squared_sum = lat_offset ** 2 + long_offset ** 2
    return squared_sum ** 0.5


def get_emp_distance(lat, long, employer):
    """Return the distance from (lat, long) to the named employer.

    ``employer`` must be a key of ``employers_dict``; the "lat" and "long"
    stored under that key are passed to ``get_distance`` as the target.
    """
    target = employers_dict[employer]
    return get_distance(lat, long, target["lat"], target["long"])

In [39]:
# Create one distance column per employer (column name = employers_dict key).
# The distance helpers use only elementwise arithmetic, so the whole "lat" and
# "long" columns are passed at once instead of fetching every row via df.iloc.
for emp in employers_dict:
    print(emp)
    df[emp] = get_emp_distance(df["lat"], df["long"], emp)

boing
microsoft
amazon


In [46]:
# Store zip codes as strings instead of numbers.
df["zipcode"] = df["zipcode"].astype("str")

In [50]:
# Snapshot of the frame as it stands after the feature-engineering cells.
df_with_features = df.copy(deep=True)

## Normalization

In [None]:
# Uncomment to restart this section from the engineered snapshot:
# df = df_with_features.copy()

In [57]:
# Train-Test-Split
from sklearn.model_selection import train_test_split

# Predictors: every column except the raw/log price targets and the id/date columns.
X = df.drop(columns=["price", "price_log", "id", "date"])
# Target: the log-transformed price column.
y = df.price_log
# train_test_split returns the splits array by array, i.e.
# (X_train, X_test, y_train, y_test); the names must be unpacked in that order,
# otherwise y_train would hold test features and X_test training targets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=34)

In [58]:
# Preview the training predictors (first rows only, not the whole frame).
X_train.head()


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_old,year_sold,since_sold
20784,2,1.50,1310,1264,2.0,0,0,3,8,1120,190,2006,0,98106,47.5772,-122.409,1330,1265,11,2014,3
8093,4,1.75,1760,7268,1.0,0,0,4,7,1080,680,1979,0,98058,47.4267,-122.148,1830,8786,38,2015,2
2521,2,1.00,1020,5130,1.0,0,0,4,6,1020,0,1948,0,98002,47.3010,-122.226,1200,6497,69,2014,3
18569,5,2.50,2510,10240,1.0,0,0,4,8,1410,1100,1984,0,98059,47.4732,-122.141,2170,10500,33,2015,2
11177,3,2.50,2170,8169,2.0,0,0,3,8,2170,0,2003,0,98059,47.4833,-122.139,2240,6733,14,2015,2
16432,4,1.00,1200,7200,1.5,0,0,3,6,1200,0,1944,0,98178,47.4951,-122.248,1070,6050,73,2014,3
17170,5,2.50,3220,4759,2.0,0,0,3,8,3220,0,2003,0,98075,47.5957,-122.032,2550,4759,14,2014,3
17916,3,3.25,1510,1245,3.0,0,0,3,7,1510,0,2007,0,98133,47.7293,-122.343,1510,1245,10,2014,3
471,3,4.25,3840,6161,2.0,0,0,3,10,3840,0,2000,0,98074,47.6336,-122.064,3230,7709,17,2015,2
15226,4,2.50,1990,5577,2.0,0,0,3,7,1990,0,1999,0,98092,47.3191,-122.191,2020,6400,18,2015,2
