In [0]:
!pip install tpot
import pandas as pd
import seaborn as sns
import timeit
import os
from sklearn.preprocessing import LabelEncoder, Imputer, \
                                OneHotEncoder, StandardScaler, MinMaxScaler,  PolynomialFeatures
import numpy as np
import ast
import matplotlib.pyplot as plt
import tpot
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV,  cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn import neighbors
from sklearn.metrics.scorer import make_scorer
from tpot import TPOTRegressor
import io

In [0]:
# Colab-only helpers: `drive` mounts Google Drive, `files` handles browser uploads.
from google.colab import drive
from google.colab import files

# Mount Google Drive so the relative "drive/My Drive/..." checkpoint folder
# passed to TPOT further down resolves (assuming the working directory is /content).
drive.mount('/content/drive')

In [0]:
# Ask for a file upload through the browser; the uploaded file must be named 'train.csv'.
uploaded = files.upload()

# `uploaded` maps file names to raw bytes: decode as UTF-8 and parse the CSV text.
train_csv_text = uploaded['train.csv'].decode('utf-8')
xls = pd.read_csv(io.StringIO(train_csv_text))

In [0]:
# Display the column labels of the loaded training frame.
xls.columns

In [0]:
#train_set, test_set = train_test_split(xls, test_size = 0.2, random_state=42)

In [0]:
# Columns routed through the categorical branch of the pipeline.
cat_attribs = ['date', 'zipcode']

# Position of the 'date' column inside the categorical selection (evaluates to 0).
date_ix = cat_attribs.index('date')

# Columns routed through the numeric branch of the pipeline.
num_attribs = [
    'yr_renovated',
    'yr_built',
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [0]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """Replace the date column of a 2-D array with separate year and month columns.

    The date column is located through the module-level ``date_ix``. Each date
    value is sliced as text: characters 0-3 become the year and characters 4-5
    the month (assumes 'YYYYMM...'-style strings -- TODO confirm against the data).

    Parameters
    ----------
    cats : bool, default True
        Stored on the instance but not read by ``fit`` or ``transform``.
    """

    def __init__(self, cats = True):
        self.cats = cats

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the date column, followed by year and month columns."""
        dates = X[:, date_ix]
        year = [d[0:4] for d in dates]
        month = [d[4:6] for d in dates]
        # Drop exactly the column the date was read from, rather than assuming
        # it is column 0 (identical to X[:, 1:] when date_ix == 0).
        remaining = np.delete(X, date_ix, axis=1)
        return np.c_[remaining, year, month]

In [0]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select DataFrame columns and return them as a NumPy array.

    With ``num=False`` the columns in ``attribute_names`` are returned as-is.

    With ``num=True`` the output is the columns in ``attribute_names`` (in the
    given order, minus ``yr_built`` and ``yr_renovated``) followed by eight
    engineered columns, in this order:

    ``bed2bath``    bedrooms / bathrooms, 0 where bathrooms is not > 0
    ``total_qual``  condition * grade
    ``sqft2room``   sqft_living / bedrooms, 0 where bedrooms is not > 0
    ``basement``    1 where sqft_basement > 0, else 0
    ``extra_lot``   sqft_lot - sqft_living / (basement + floors)
    ``age``         2015 - yr_built
    ``ren_age``     2015 - yr_renovated (yr_built is used where yr_renovated == 0)
    ``water_view``  waterfront * view

    The input DataFrame is never modified.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to select.
    num : bool, default True
        Whether to treat the selection as numeric and append the engineered
        features described above.
    """

    # Raw year columns that are superseded by the derived `age` / `ren_age`.
    _SUPERSEDED_COLUMNS = ('yr_built', 'yr_renovated')
    # Year the age features are measured from.
    _REFERENCE_YEAR = 2015

    def __init__(self, attribute_names, num = True):
        self.attribute_names = attribute_names
        self.num = num

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    @staticmethod
    def _ratio_or_zero(numerator, denominator):
        """Element-wise numerator / denominator, with 0 where denominator is not > 0."""
        return (numerator / denominator).where(denominator > 0, 0)

    def transform(self, X):
        """Return the selected (and, for ``num=True``, engineered) columns as an array."""
        if not self.num:
            return X[self.attribute_names].values

        # All derived features are built as local Series so the caller's frame
        # is left untouched (no overwritten 'yr_renovated', no extra columns).
        yr_built = X['yr_built']
        # A renovation year of 0 is replaced by the construction year.
        yr_renovated = X['yr_renovated'].where(X['yr_renovated'] != 0, yr_built)

        bed2bath = self._ratio_or_zero(X['bedrooms'], X['bathrooms'])
        total_qual = X['condition'] * X['grade']
        sqft2room = self._ratio_or_zero(X['sqft_living'], X['bedrooms'])
        basement = (X['sqft_basement'] > 0).astype(int)
        extra_lot = X['sqft_lot'] - X['sqft_living'] / (basement + X['floors'])
        age = self._REFERENCE_YEAR - yr_built
        ren_age = self._REFERENCE_YEAR - yr_renovated
        water_view = X['waterfront'] * X['view']

        # Keep the caller's column order so the feature matrix layout is
        # reproducible from run to run (a set difference is not ordered).
        kept = [name for name in self.attribute_names
                if name not in self._SUPERSEDED_COLUMNS]

        return np.c_[X[kept].values,
                     bed2bath.values,
                     total_qual.values,
                     sqft2room.values,
                     basement.values,
                     extra_lot.values,
                     age.values,
                     ren_age.values,
                     water_view.values]

In [0]:
# One-hot encoder used as the last step of the categorical pipeline;
# sparse=False asks for a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse = False)

In [0]:
# Numeric branch: select/engineer the numeric features, then fill missing
# values with the per-column median.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (successor: sklearn.impute.SimpleImputer) -- confirm the
# installed version before running.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy = "median")),
])

In [0]:
# Categorical branch: pull out the raw categorical columns, split the date
# into year/month columns, then one-hot encode every resulting column.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs, num = False)),
    ('attrib_adder', AttributesAdder()),
    ('hot_encode', encoder),
])

In [0]:
# Run both branches on the same input and concatenate their outputs
# column-wise: numeric features first, one-hot columns after.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [0]:
# Target vector: the 'price' column of the loaded frame.
price_train = np.array(xls['price'])
#price_test = np.array(test_set['price'])

# Feature matrix: fit the preprocessing pipeline on the whole frame and transform it.
# NOTE(review): if the hold-out lines are re-enabled, calling `fit_transform`
# on the test split would refit the pipeline on it; `transform` is the usual call.
housing_train = full_pipeline.fit_transform(xls)
#housing_test = full_pipeline.fit_transform(test_set)

In [0]:
def custom_rmsle(real, predicted):
    """Root mean squared logarithmic error between ``real`` and ``predicted``.

    Computes ``sqrt(sum((log(p + 1) - log(r + 1))**2) / n)`` where ``n`` is the
    total number of predictions.

    Pairs in which either value is negative are skipped in the sum but still
    counted in ``n``, so they contribute zero error.
    NOTE(review): this means negative predictions are not penalised -- confirm
    that is intended.

    Parameters
    ----------
    real : array-like
        Ground-truth values. Elements are matched to ``predicted`` by position,
        so a pandas Series with any index is accepted.
    predicted : array-like
        Predicted values, same length as ``real``.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Convert to flat float arrays so indexing is positional, never label-based.
    real = np.asarray(real, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    if real.shape != predicted.shape:
        raise ValueError(
            "real and predicted must have the same length, got %d and %d"
            % (real.size, predicted.size))

    # Written as "skip if negative" (not "keep if >= 0") so NaN pairs are kept
    # and propagate into the result.
    skipped = (predicted < 0) | (real < 0)
    log_diff = np.log1p(predicted[~skipped]) - np.log1p(real[~skipped])
    squared_error = float(np.sum(log_diff ** 2))
    return (squared_error / len(predicted)) ** 0.5

In [0]:
# Wrap custom_rmsle as a scikit-learn scorer; greater_is_better=False marks it
# as a loss (lower is better), so the scorer reports its negated value.
rmsle = make_scorer(custom_rmsle, greater_is_better=False)

In [0]:
# Settings for the TPOT pipeline search, optimised against the RMSLE scorer.
tpot_settings = dict(
    generations=20,
    population_size=80,
    early_stop=8,
    scoring=rmsle,
    random_state=55,
    n_jobs=-1,
    warm_start = True,
    verbosity=3,
    # create a folder for intermediate results
    periodic_checkpoint_folder="drive/My Drive/Colab Notebooks/intermediate_results",
)

# NOTE: this rebinds the name `tpot`, which the imports cell bound to the
# tpot module; from here on `tpot` is the regressor instance.
tpot = TPOTRegressor(**tpot_settings)


In [0]:
# Run the TPOT search on the full training matrix, then export the resulting
# pipeline as a Python script in the current working directory.
tpot.fit(housing_train, price_train)
tpot.export('tpot_fi_kaggle.py')
