In [233]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble

In [226]:
df = pd.read_csv('data/train_data.csv')

In [227]:
def print_score(model, x, y):
    print('Score:        {:.4f}%'.format(model.score(x, y) * 100))
    print('Kaggle Score: {:.0f}'.format(np.sqrt(np.mean((model.predict(x) - y)**2))))
    
    

# Gradient Boosting

In [244]:
dfGB = df[[
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'sqft_living15',
    'sqft_lot15',
    'zipcode',
    'lat',
    'long',
]]

x_train, x_test, y_train, y_test = train_test_split(dfGB, df['price'], test_size=0.4, random_state=42)

gb = ensemble.GradientBoostingRegressor(n_estimators=200, max_depth=5, learning_rate=0.1, criterion='mse')
gb.fit(x_train, y_train)

print_score(gb, x_test, y_test)

Score:        89.0084%
Kaggle Score: 118223


# Random Forest

In [228]:
dfRF = df[[
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'sqft_living15',
    'sqft_lot15',
    'zipcode',
    'lat',
    'long',
]]

x_train, x_test, y_train, y_test = train_test_split(dfRF, df['price'], test_size=0.4, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

print_score(rf, x_test, y_test)

Score:        87.1473%
Kaggle Score: 127841


# Linear Regression

In [229]:
dfLR = df[[
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'sqft_living15',
    'sqft_lot15',
    'zipcode',
    'lat',
    'long',
]]

x_train, x_test, y_train, y_test = train_test_split(dfLR, df['price'], test_size=0.4, random_state=42)

lr = linear_model.LinearRegression()
lr.fit(x_train, y_train)

print_score(lr, x_test, y_test)

Score:        70.2038%
Kaggle Score: 194649


# Decision Tree

In [230]:
dfDT = df[[
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'sqft_living15',
    'sqft_lot15',
    'zipcode',
    'lat',
    'long',
]]

x_train, x_test, y_train, y_test = train_test_split(dfDT, df['price'], test_size=0.4, random_state=42)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

print_score(dt, x_test, y_test)

Score:        0.9398%
Kaggle Score: 210412


# Submission File Generation

In [231]:
dfTest = pd.read_csv('data/val_data.csv').rename({'Unnamed: 0': 'Unique_idx'}, axis='columns')

model = None
features = None

dfTest['price'] = model.predict(features)

AttributeError: 'NoneType' object has no attribute 'predict'

In [None]:
dfTest[['Unique_idx', 'price']].to_csv('submission.csv', index=None, header=True)