In [0]:
!pip install tpot
import pandas as pd
import seaborn as sns
import timeit
import os
from sklearn.preprocessing import LabelEncoder, Imputer, \
                                OneHotEncoder, StandardScaler, MinMaxScaler,  PolynomialFeatures
import numpy as np
import ast
import matplotlib.pyplot as plt
import tpot
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV,  cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn import neighbors
from sklearn.metrics.scorer import make_scorer
from tpot import TPOTRegressor
import io
from tpot.builtins import StackingEstimator

In [0]:
from google.colab import drive, files
drive.mount('/content/drive')

In [0]:
#uploaded = files.upload()
test = pd.read_csv('drive/My Drive/Colab Notebooks/test.csv')
train = pd.read_csv('drive/My Drive/Colab Notebooks/train.csv')

In [0]:
date_ix = 0
cat_attribs = ['date', 'zipcode']
num_attribs = ['yr_renovated','yr_built', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement',  
       'lat', 'long', 'sqft_living15', 'sqft_lot15']

In [0]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, cats = True):
        self.cats = cats
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        month = [i[4:6] for i in X[:, date_ix]]
        year = [i[0:4] for i in X[:, date_ix]]
        return np.c_[X[:, 1:], year, month]

In [0]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, num = True):
        self.attribute_names = attribute_names
        self.num = num
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if (self.num):
            X['yr_renovated'] = X.apply(lambda x: x['yr_built'] \
                                        if x['yr_renovated']==0 else x['yr_renovated'], axis = 1)
            X['bed2bath'] = X.apply(lambda x: x['bedrooms']/x['bathrooms'] if x['bathrooms']>0 else 0, axis = 1)
            X['total_qual'] = X['condition']*X['grade']
            X['sqft2room'] = X.apply(lambda x: x['sqft_living']/x['bedrooms'] if x['bedrooms']>0 else 0, axis = 1)
            X['basement'] = X.apply(lambda x: 1 if x['sqft_basement']>0 else 0, axis = 1)
            X['extra_lot'] = X['sqft_lot']-X['sqft_living']/(X['basement'] + X['floors'])
            X['age'] = 2015 - X['yr_built']
            X['ren_age'] = 2015 - X['yr_renovated']
            X['water_view'] = X['waterfront']*X['view']
            return np.c_[X[list(set(self.attribute_names) - set(['yr_built', 'yt_renovated']))].values, 
                         X['bed2bath'].values, 
                         X['total_qual'].values,
                        X['sqft2room'].values,
                        X['basement'].values,
                        X['extra_lot'].values,
                         X['age'].values,
                         X['ren_age'].values,
                        X['water_view'].values]
        else:
            return X[self.attribute_names].values

In [0]:
encoder = OneHotEncoder(sparse = False)

In [0]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy = "median"))
])

In [0]:
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs, num = False)),
    ('attrib_adder', AttributesAdder()),
    ('hot_encode', encoder)
])

In [0]:
full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [0]:
housing_test = full_pipeline.fit_transform(test)
housing_train = full_pipeline.fit_transform(train)
price_train = np.array(train['price'])

In [0]:
exported_pipeline = make_pipeline(
    exported_pipeline = GradientBoostingRegressor(alpha=0.8, learning_rate=0.1, loss="huber", 
                                                  max_depth=8, max_features=0.7000000000000001, 
                                                  min_samples_leaf=4, min_samples_split=3, 
                                                  n_estimators=100, subsample=0.9500000000000001))

exported_pipeline.fit(housing_train, price_train)

In [0]:
test_results = exported_pipeline.predict(housing_test)

In [0]:
test_results


array([458320.0667476 , 401275.68716715, 527927.98778957, ...,
       690217.14015902, 390124.51945932, 398093.1530373 ])

In [0]:

results_xls = pd.DataFrame(test['id'])

results_xls['price'] = test_results

results_xls.to_csv('drive/My Drive/Colab Notebooks/results.csv')
