# Model Selection

In [16]:
import numpy as np
import pandas as pd

In [17]:
# Lift pandas' display limits so wide frames are shown in full.
# None means "no limit" for each of these options. The older -1 sentinel for
# max_colwidth was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
pd.options.display.width = None
pd.options.display.max_columns = None

In [18]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    Signed and unsigned integer columns are converted to the narrowest integer
    dtype of the same signedness whose range contains the column's min and max
    (bounds inclusive). Float columns wider than 32 bits are converted to
    float32 when their min and max fit in the float32 range; note that this
    trades precision for memory. Every other column (object, bool, datetime,
    categorical, pandas extension dtypes, float16/float32) is left untouched,
    as are columns whose min/max are NaN or infinite.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting. A one-line summary of the
        memory usage before and after is printed as a side effect.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes per NumPy integer kind, narrowest first.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are candidates. Checking the
        # dtype kind (instead of the dtype's string prefix) keeps bool,
        # datetime, categorical and extension columns out of the numeric paths.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif (col_type.itemsize > 4
              and c_min >= float32_limits.min
              and c_max <= float32_limits.max):
            # Never widen: float16/float32 columns are skipped by the itemsize
            # check. NaN or infinite extremes fail the comparisons and leave
            # the column at its original width.
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting footprint.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease_pct))
    return df

In [19]:
# Read the prepared housing table from disk.
# NOTE(review): decoded as iso-8859-1 the floorType_* column headers show up as
# mojibake in the preview below; the underlying bytes look like GBK text.
# Confirm before changing the encoding, since later cells may rely on the names.
df = pd.read_csv(
    'files/Kaggle Competition/House Prediction/final_data_cleaned_dummies.csv',
    encoding='iso-8859-1',
)

In [20]:
# Downcast the numeric columns to narrower dtypes; the helper prints the
# memory footprint before and after.
df = reduce_mem_usage(df)

Memory usage of dataframe is 80.28 MB --> 19.46 MB (Decreased by 75.8%)


In [21]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,followers,square,livingRoom,drawingRoom,bathRoom,renovationCondition,buildingStructure,elevator,subway,district,communityAverage,floorNumber,UsedTime,tradeTime_2002,tradeTime_2003,tradeTime_2008,tradeTime_2009,tradeTime_2010,tradeTime_2011,tradeTime_2012,tradeTime_2013,tradeTime_2014,tradeTime_2015,tradeTime_2016,tradeTime_2017,tradeTime_2018,floorType_µÍ,floorType_µ×,floorType_¶¥,floorType_¸ß,floorType_Î´Öª,floorType_ÖÐ,totalPrice
0,106,131.0,2.0,1.0,1.0,3,6,1.0,1.0,7,56021.0,26.0,11.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,415.0
1,126,132.380005,2.0,2.0,2.0,4,6,1.0,0.0,7,71539.0,22.0,12.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,575.0
2,48,198.0,3.0,2.0,3.0,3,6,1.0,0.0,7,48160.0,4.0,11.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1030.0
3,138,134.0,3.0,1.0,1.0,1,6,1.0,0.0,6,51238.0,21.0,8.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,297.5
4,286,81.0,2.0,1.0,1.0,2,2,0.0,1.0,1,62588.0,6.0,56.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,392.0


In [22]:
# (rows, columns) of the loaded frame.
df.shape

(318851, 33)

In [23]:
# The last column (totalPrice) is the regression target; every column before
# it is used as a feature. Both are pulled out as plain NumPy arrays.
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [24]:
# Inspect the feature matrix (NumPy prints a truncated preview).
x

array([[106.  , 131.  ,   2.  , ...,   1.  ,   0.  ,   0.  ],
       [126.  , 132.38,   2.  , ...,   1.  ,   0.  ,   0.  ],
       [ 48.  , 198.  ,   3.  , ...,   0.  ,   0.  ,   1.  ],
       ...,
       [  2.  , 102.17,   2.  , ...,   0.  ,   0.  ,   1.  ],
       [  4.  , 178.34,   4.  , ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,  92.45,   2.  , ...,   0.  ,   0.  ,   1.  ]],
      dtype=float32)

In [25]:
# Inspect the target vector (NumPy prints a truncated preview).
y

array([ 415.,  575., 1030., ...,  359.,  720.,  450.], dtype=float32)

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical across reruns.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [31]:
# Candidate regressors, each wrapped in a Pipeline so they can all be fitted
# and scored through the same interface.
# Only the SVR is preceded by a StandardScaler; the tree-based models are
# fitted on the features as-is.
pipeline_svr = Pipeline([('scale', StandardScaler()), ('svr', SVR(kernel='linear'))])
# The split criterion is left at its default, which is mean squared error on
# every scikit-learn version. The explicit 'mse' spelling was deprecated in
# scikit-learn 1.0 and removed in 1.2 (renamed 'squared_error').
# random_state pins the randomised parts of tree building so that reruns
# reproduce the same models and scores.
pipeline_dec = Pipeline([('decision', DecisionTreeRegressor(random_state=0))])
pipeline_ran = Pipeline([('random', RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1))])
pipeline_xg = Pipeline([('xgboost', XGBRegressor(n_estimators=100, n_jobs=-1))])

In [32]:
# Order matters: the position of each pipeline in this list is the key used
# to look up its display name when the models are scored.
pipelines = [
    pipeline_svr,
    pipeline_dec,
    pipeline_ran,
    pipeline_xg,
]

In [33]:
# Placeholders for the winning model: its score, its display name and the
# fitted pipeline object. They are filled in when the models are scored.
best_score, best_regressor, best_pipeline = 0.0, 0, ""

In [34]:
%%time
# Display names for the pipelines, keyed by their position in `pipelines`.
pipe_dict = {
    0: 'Support Vector Regression',
    1: 'Decision Tree Regression',
    2: 'Random Forest Regression',
    3: 'XgBoost',
}

# Train every candidate on the training split. This is the expensive cell of
# the notebook (the recorded run took roughly an hour and a half).
for candidate in pipelines:
    candidate.fit(x_train, y_train)

Wall time: 1h 28min 52s


In [35]:
%%time
# Score every fitted pipeline on the held-out split and remember the winner.
# For regressors, score() returns the coefficient of determination (R^2), not
# a classification accuracy, and R^2 can be negative. Starting the running
# best at -inf therefore guarantees that a winner is always recorded.
# All three trackers are reset here so rerunning this cell never reports a
# stale winner from an earlier run.
best_score = float('-inf')
best_pipeline = None
best_regressor = None
for i, regressor in enumerate(pipelines):
    score = regressor.score(x_test, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[i], score))
    if score > best_score:
        best_score = score
        best_pipeline = regressor
        best_regressor = pipe_dict[i]
print('\n--------------------------------------\n')
print('Best Regressor - ', best_regressor)
print('Best Score - ', best_score)

Support Vector Regression Test Accuracy: 0.7750162793203426
Decision Tree Regression Test Accuracy: 0.8512653644276963
Random Forest Regression Test Accuracy: 0.926644218776173
XgBoost Test Accuracy: 0.9042237902908016

--------------------------------------

Best Regressor -  Random Forest Regression
Best Score -  0.926644218776173
Wall time: 3min 52s
