In [128]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import *
from sklearn.model_selection import cross_validate, ShuffleSplit


In [94]:
dataset = pd.read_csv('kc_house_data.csv')

In [95]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [96]:
dataset['waterfront'] = dataset['waterfront'].fillna(0)

In [97]:
dataset['date'] = pd.to_datetime(dataset['date'])

In [98]:
dataset.dropna(subset = ['view'], inplace = True)

In [99]:
dataset.yr_renovated.fillna(dataset.yr_built, inplace=True)

In [100]:
dataset['zipcode'] = dataset['zipcode'].apply(str)

In [101]:
dataset['sqft_basement'] = dataset['sqft_basement'].replace('?', np.nan) ##move to impute .tonumeric()

In [102]:
dataset['sqft_basement'] = dataset['sqft_basement'].astype(float) #

In [103]:
dataset['sqft_basement'] = dataset['sqft_basement'].fillna(0)

In [104]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21534 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21534 non-null  int64         
 1   date           21534 non-null  datetime64[ns]
 2   price          21534 non-null  float64       
 3   bedrooms       21534 non-null  int64         
 4   bathrooms      21534 non-null  float64       
 5   sqft_living    21534 non-null  int64         
 6   sqft_lot       21534 non-null  int64         
 7   floors         21534 non-null  float64       
 8   waterfront     21534 non-null  float64       
 9   view           21534 non-null  float64       
 10  condition      21534 non-null  int64         
 11  grade          21534 non-null  int64         
 12  sqft_above     21534 non-null  int64         
 13  sqft_basement  21534 non-null  float64       
 14  yr_built       21534 non-null  int64         
 15  yr_renovated   2153

In [117]:
X = dataset.drop('price', axis = 1)
y = dataset['price']

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [119]:
len(X_train), len(X_test), len(y_train), len(y_test)

(17227, 4307, 17227, 4307)

In [120]:
living = X_train['sqft_living'].to_frame()

In [121]:
grade_dummies = pd.get_dummies(X_train['grade'], prefix = 'gr', drop_first = True)
bath_dummies = pd.get_dummies(X_train['bathrooms'], prefix = 'bath', drop_first = True)

In [122]:
len(living), len(grade_dummies), len(bath_dummies)

(17227, 17227, 17227)

In [126]:
concat_df = pd.concat([living, grade_dummies, bath_dummies], axis = 1)
concat_df

Unnamed: 0,sqft_living,gr_5,gr_6,gr_7,gr_8,gr_9,gr_10,gr_11,gr_12,gr_13,...,bath_4.75,bath_5.0,bath_5.25,bath_5.5,bath_5.75,bath_6.0,bath_6.25,bath_6.5,bath_6.75,bath_8.0
15714,1350,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10920,1830,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2782,3527,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,1920,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19542,1980,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11323,3400,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12006,2430,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5409,1620,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
866,3460,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

living_bath_grade_model = LinearRegression()

living_bath_grade_model_scores = cross_validate(
    estimator= living_bath_grade_model,
    X= concat_df,
    y=y_train,
    return_train_score=True,
    cv=None
)

print("Current Model")
print("Train score:     ", living_bath_grade_model_scores["train_score"].mean())
print("Validation score:", living_bath_grade_model_scores["test_score"].mean())


Current Model
Train score:      0.5931243520578917
Validation score: 0.5739193117278385
