In [26]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.evaluate import GroupTimeSeriesSplit

from sklearn.preprocessing import OneHotEncoder,PowerTransformer,OrdinalEncoder,StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer,TransformedTargetRegressor
from sklearn.impute import SimpleImputer  

from category_encoders.target_encoder import TargetEncoder
from category_encoders.count import CountEncoder

from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.ensemble import StackingRegressor,VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score,TimeSeriesSplit,train_test_split,cross_validate
from sklearn.metrics import mean_squared_error

from time import time
from copy import deepcopy
from sklearn.preprocessing import FunctionTransformer

# Summary of my Solutions

<ul>
    <li> For time related feature (CPI,rent index,etc) im using lagged value in 3 periods, for example if i want predict price in month 12, i use the time related feature in month 9. i used 3 month since the problem is to predict 3 months ahead. </li>
    <li> i used feature engineering in general, like mean of column, diff, and etc. u can see below in feature engineering section. </li>
    <li> i used tranformation for my numeric variables, for categorical variables i used target encoding or mean encoding
    </li>
    <li> feature selection using rfecv (recursive feature elimination with cross validation) </li>
    <li> for CV method i use time series grouped Cross validation i used statsmodels library for this method, since there are multiple house in each days. </li>
    <li> im also done with hyperparameter tune with each models and different subset of features </li>
    <li> my final model is stacking from 9 different models, 3 LightGBM, 3 Catboost, 3 XGBoost, each models has different features and different parameters. and i used Standard LightGBM with no tune for final estimators </li>
</ul> 

# Import Data

In [4]:
cpi = pd.read_csv("../datasets/raw/cpi.csv",parse_dates=['Data Series'])
geo_attr = pd.read_csv("../datasets/raw/geo_attributes.csv")
interest = pd.read_csv("../datasets/raw/interest.csv",parse_dates=['Data Series'])
properties = pd.read_csv("../datasets/raw/properties.csv")
rentindex = pd.read_csv("../datasets/raw/rentindex.csv",parse_dates=['Data Series'])
test = pd.read_csv("../datasets/raw/test.csv",parse_dates=['contractDate'])
train = pd.read_csv("../datasets/raw/train.csv",parse_dates=['contractDate'])
vacant = pd.read_csv("../datasets/processed/iteration_1/vacant_edit.csv",parse_dates=['Data Series'])   

since the date format in rent index is different with others, so i try to change it to the same format

In [10]:
ri_edit = rentindex.copy()
ri_edit['Data Series'] = ri_edit['Data Series'].apply(lambda x: x.replace(' 1Q','-01-01'))
ri_edit['Data Series'] = ri_edit['Data Series'].apply(lambda x: x.replace(' 2Q','-04-01'))
ri_edit['Data Series'] = ri_edit['Data Series'].apply(lambda x: x.replace(' 3Q','-07-01'))
ri_edit['Data Series'] = ri_edit['Data Series'].apply(lambda x: x.replace(' 4Q','-10-01')) 
ri_edit['Data Series'] = pd.to_datetime(ri_edit['Data Series'])
#resampling 3 times each month fill with value in 3 period before
ri_edit = ri_edit.set_index('Data Series').resample('M').ffill().reset_index()
#add 2022-11-10, fill with 148.1 
ri_edit = pd.concat([ri_edit,pd.DataFrame({'Data Series':pd.to_datetime('2022-11-10'),'RentIndex':148.1},index=[0])]).sort_index().reset_index(drop=True)
#add 2022-12-10, fill with 148.1
ri_edit = pd.concat([ri_edit,pd.DataFrame({'Data Series':pd.to_datetime('2022-12-10'),'RentIndex':148.1},index=[0])]).sort_index().reset_index(drop=True)

#replcae day with 01 
ri_edit['Data Series'] = ri_edit['Data Series'].apply(lambda x: x.replace(day=1))

make the merge datasets on train and submission datasets with lagged of time related feature (rentindex, cpi, interest, vacant)

In [22]:
def function_merge(df_tmp,df1,df2,df3,df4,df5): 
    data = pd.merge(df_tmp,df1,how='left',on='property_key')
    data = pd.merge(data,geo_attr,how='left',on=['street','project','district'] )
    data.rename(columns={'contractDate':'Data Series'},inplace=True)
    
    df_date = pd.merge(df2,df3,how='left',on='Data Series')
    df_date = pd.merge(df_date,df4,how='left',on='Data Series')
    df_date = pd.merge(df_date,df5,how='left',on='Data Series')
    df_date['Data Series'] =  df_date['Data Series'].apply(lambda x: x + pd.DateOffset(months=3))

    data = pd.merge(data,df_date,how='left',on='Data Series')
    data.columns = data.columns.str.replace(' ','') 
    data.columns = data.columns.str.lower()
    data = data[data["dataseries"] > '2018-03-01']
    return data 

df = function_merge(train,properties,cpi,vacant,interest,ri_edit)
df_test = function_merge(test,properties,cpi,vacant,interest,ri_edit)

In [23]:
df.head()

Unnamed: 0,property_key,dataseries,price,area,floorrange,propertytype,district,typeofarea,tenure,street,...,lat,lng,num_schools_1km,num_supermarkets_500m,num_mrt_stations_500m,cpi,available,vacant,interestrate,rentindex
242,p-89d6ffc3d,2018-04-01,1400000.0,122.0,01-05,Condominium,5,Strata,99 yrs lease commencing from 2004,WEST COAST ROAD,...,1.294645,103.767168,0.0,0.0,0.0,99.035,30153.0,3950.0,0.5,102.8
243,p-7529081f2,2018-04-01,738000.0,62.0,01-05,Condominium,16,Strata,99 yrs lease commencing from 2011,BEDOK RESERVOIR ROAD,...,1.337536,103.920652,1.0,2.0,1.0,99.035,30153.0,3950.0,0.5,102.8
244,p-b21ddfd36,2018-04-01,3132500.0,154.0,21-25,Condominium,3,Strata,99 yrs lease commencing from 2012,PRINCE CHARLES CRESCENT,...,1.292695,103.820411,2.0,0.0,1.0,99.035,30153.0,3950.0,0.5,102.8
245,p-b1ace2a03,2018-04-01,4500000.0,307.3,-,Semi-detached,20,Land,Freehold,WESTLAKE AVENUE,...,1.345626,103.836933,12.0,0.0,3.0,99.035,30153.0,3950.0,0.5,102.8
246,p-0c92e94b9,2018-04-01,1456000.0,123.0,06-10,Condominium,19,Strata,99 yrs lease commencing from 2007,HOUGANG STREET 11,...,1.35139,103.881242,6.0,1.0,3.0,99.035,30153.0,3950.0,0.5,102.8


# Data Preparation
for data preparation i used train test split for time series, which mean i just use the test set from 2012-10-01 until 2022-12-01 so is 3 month periods in test set. The rest of data is used for training the model

In [25]:
X = df.sort_values(by = 'dataseries' ).drop(columns = 'price')
y = df.sort_values(by = 'dataseries' ).price

X_train = X[X.dataseries < '2022-10-01']
X_test = X[X.dataseries >= '2022-10-01']

y_train = y[X.dataseries < '2022-10-01']
y_test = y[X.dataseries >= '2022-10-01']
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(59761, 21) (2946, 21) (59761,) (2946,)


# Data Preprocessing & Feature Engineering

for feature engineering i just applied general feature engineering, which mean it can also applied on the other datasets too, some of the feature engineering also referrence from other discussion. 

In [27]:
#make custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

class MakeDate(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['dataseries'] = pd.to_datetime(X_['dataseries'])
        X_['year'] = X_['dataseries'].dt.year
        X_['month'] = X_['dataseries'].dt.month
        X_['quarter'] = X_['dataseries'].dt.quarter
        X_['longday'] = (X_['dataseries'] - pd.to_datetime('2018-01-01')).dt.days
        return X_
    
class MakeFloor(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['floor'] = X_['floorrange'].apply(lambda x: x.split('-')[1])
        X_['floor'] = X_['floor'].replace({'':"1",'B5':"-5"} )
        X_['floor'] = X_['floor'].astype(int)
        X_["floor_area"] = X_['floor'] * X_['area']
        return X_
    
class MakeTenure(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['tenure2'] = X_['tenure'].str.split(" ").str[-1]
        X_['tenure2'] = X_['tenure2'].replace({'Freehold':-999})
        X_['tenure2'] = X_['tenure2'].astype(int)

        X_['tenure3'] = X_['tenure2'] - X_['year']
        X_['tenure3'] = np.where(X_['tenure3'] <= -999,-999,X_['tenure3'])
        
        X_['tenure'] = X_['tenure'].str.split(" ").str[0]
        X_['tenure'] = np.where(X_['tenure'] == "Freehold","Freehold","Not Freehold")
        return X_ 
class SumBuilding(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['sum_building'] = X_[['num_schools_1km','num_supermarkets_500m','num_mrt_stations_500m']].sum(axis=1)
        X_['mean_building'] = X_[['num_schools_1km','num_supermarkets_500m','num_mrt_stations_500m']].mean(axis=1)
        X_['std_building'] = X_[['num_schools_1km','num_supermarkets_500m','num_mrt_stations_500m']].std(axis=1)
        return X_
 
class TypeMarket(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['type_market'] = X_['typeofarea'] + "_" + X_['marketsegment']
        X_['tenure_type'] = X_['tenure'] + "_" + ['typeofarea']
        X_['tenure_market'] = X_['tenure'] + "_" + ['marketsegment']
        return X_
class MakeStreet(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.street_data = {
            'street_type': [
                'ROAD', 'AVENUE', 'DRIVE', 'STREET', 'LORONG', 'WALK', 'CRESCENT','LINK', 'RISE', 'LANE', 'TERRACE', 'BOULEVARD', 'CIRCLE', 'LOOP','COMMONWEALTH', 'QUAY'
            ],
            'geo_type': [
                'PASIR', 'BAY', 'KEPPLE', 'TANAH', 'COAST', 'BEACH', 'SELETAR','SUNRISE', 'TAI', 'COVE'
            ],
            'wind_dir': ['EAST', 'WEST', 'NORTH', 'SOUTH']
        }
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_ = X.copy()
        for col in self.street_data:
            X_[col.lower()] = X_['street'].apply(lambda x: next((i for i in self.street_data[col] if i in x.upper()), 'OTHER'))
        return X_
class DistrictGroup(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.district_month = None
    def fit(self,X,y):
        X_ = X.copy() 
        X_['price'] = y
        self.district_month_dict = X_.groupby(['district', 'month'])['price'].mean().to_dict()
        return self
    def transform(self,X,y=None):
        X_ = X.copy()
        X_['district_month'] = X_.apply(lambda x: self.district_month_dict[(x['district'], x['month'])], axis=1)
        X_['district_area'] = X_.groupby('district')['area'].transform('mean')
        return X_

class DistrictGroup(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y):
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        X_['district_area'] = X_.groupby('district')['area'].transform('mean')
        return X_


pipeline_feat = Pipeline([
    ('make_date',MakeDate()),
    ('make_floor',MakeFloor()),
    ('make_tenure',MakeTenure()),
    ('sum_building',SumBuilding()),
    ('type_market',TypeMarket()),
    ('make_street',MakeStreet()),
    ('DistrictGroup',DistrictGroup())
])

data preprocessing that i used for numerical data is standard scaler and yeo-johnson transformation and for categorical data i used target encoder and count encoder

In [28]:
num_col =  ['floor','area','lat','lng','num_schools_1km','num_supermarkets_500m','num_mrt_stations_500m',
            'cpi','vacant','interestrate','rentindex','available','sum_building','mean_building','std_building',
            'floor_area','district_area']
cat_col = ['propertytype','district','typeofarea','marketsegment',"tenure","type_market","street_type","geo_type",
           "wind_dir","tenure_type","tenure_market"]
freq_col = ['street','project']
notrans_col = ['year','month','quarter','longday','tenure2','tenure3']

num_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('sscale',StandardScaler()),
    ('power',PowerTransformer())
])

notrans_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='median'))
])

te_pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',TargetEncoder(cat_col))
]) 


preprocessor = ColumnTransformer([
    ('num_pipe',num_pipe,num_col),
    ('cat_pipe',te_pipe,cat_col),
    ('notrans_pipe',notrans_pipe,notrans_col)
],verbose_feature_names_out= False).set_output(transform = "pandas")

# Modelling

### Function for model training and feature selection

In [34]:
# function for training
def train_model_ts(list_model,X_train,y_train,X_test,y_test,metric,cv,scorer,pipeline,groups):
    df_model = pd.DataFrame(columns = ["model_name","set_data","score","model"])
    set_data = ["test","cv","train"]

    for m in list_model: 
        pipeline_copy = deepcopy(pipeline)
        pipeline_copy.set_params(model = list_model[m])
        spot_check = cross_val_score(pipeline_copy,X_train,y_train,cv = cv,scoring = scorer,n_jobs= -1,groups = groups )
        spot_check = spot_check.mean()
        model = pipeline_copy.fit(X_train,y_train)
        score = metric(y_test,model.predict(X_test),squared = False)
        score_train = metric(y_train,model.predict(X_train),squared = False)
        model_list = [m] * 3
        tes = pd.DataFrame(list(zip(model_list,set_data,[score,spot_check,score_train],[model,model,model])),columns = ["model_name","set_data","score","model"])
        print(f"model {m} selesai di training")
        print(f"score test {score}")
        print(f"score cv {spot_check}")
        print(f"score train {score_train}")
        print("=====================================")
        df_model = pd.concat([df_model,tes],ignore_index = True)
        
    return df_model


#function for feature selection 
#since im using pipeline for my workflow, so ineed to modify the rfecv function from scikit-learn, 
# so it can work with pipeline. other than that is same as scikit-learn rfecv function
def rfecv(X, y, pipeline,min_features_to_select=3, cv = 3,step=3,scoring_metric="f1",scoring_decimals=3,random_state=42,groups = None):
    # Initialize survivors and ranked list
    estimator = deepcopy(pipeline)
    estimator.steps.pop(-1)
    survivors = estimator.fit_transform(X_train,y_train).columns.tolist()
    ranks = []
    scores = []
    # While the survivor list is longer than min_features_to_select
    while len(survivors) >= min_features_to_select:
        print(ranks)
        remove_column_transformer = FunctionTransformer(lambda x: x.drop(ranks, axis=1))
        estimator = deepcopy(pipeline)
        estimator.steps.insert(-1, ('remove_column_transformer', remove_column_transformer))
        # Get only the surviving features
        
        # Train and get the scores, cross_validate clones 
        # the model internally, so this does not modify
        # the estimator passed to this function
        print("[%.2f] evaluating %i features ..." % (time(), len(survivors)))
        cv_result = cross_validate(estimator, X, y,
                                cv=cv,
                                groups = groups,
                                scoring=scoring_metric,
                                return_estimator=True)
        # Append the mean performance to 
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is None:
            scores.append(score)
        else:
            scores.append(round(score, scoring_decimals))            
        print("[%.2f] ... score %f." % (time(), scores[-1]))
        
        # Get feature weights from the model fitted 
        # on the best fold and square the weights as described 
        # in the paper. If the estimator is a Pipeline,
        # we get the weights from the last element.
        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        weights = list(np.power(weights, 2))
                
        # Remove step features (but respect min_features_to_select)
        for _ in range(max(min(step, len(survivors) - min_features_to_select), 1)):
            
            # Find the feature with the smallest ranking criterion
            # and update the ranks and survivors
            idx = np.argmin(weights)
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)
            
    # Calculate the best set of surviving features
    ranks_reverse = list(reversed(ranks))
    last_max_idx = len(scores) - np.argmax(list(reversed(scores))) - 1
    removed_features = set(ranks_reverse[0:last_max_idx * step])
    best_features = [f for f in X.columns if f not in removed_features]
    
    # Return ranks and scores
    return best_features, max(scores), ranks, scores

### Baseline model

In [35]:
pipeline_base = Pipeline([
    ('pipe_feat',pipeline_feat),
    ('preprocessor',preprocessor),
    ('model',None)
])

list_model = {
   'CatB' : CatBoostRegressor(iterations = 300,silent = True),
    'LGBMRegressor':LGBMRegressor(max_depth = -1),
    'XGBRegressor':XGBRegressor(tree_method = 'hist'),
}


cv = GroupTimeSeriesSplit(n_splits = 5,test_size=5,gap_size = 3)
groups = pd.factorize(X_train.dataseries)[0]

df_model = train_model_ts(list_model,X_train,y_train,X_test,y_test,mean_squared_error,cv,
                        "neg_root_mean_squared_error",pipeline_base,groups)


model CatB selesai di training
score test 437245.36890725215
score cv -443580.90784629434
score train 262710.9501523442
model LGBMRegressor selesai di training
score test 428930.59875250614
score cv -460846.58360564813
score train 312696.87586684607
model XGBRegressor selesai di training
score test 421815.81806846306
score cv -474717.2412766598
score train 222358.22258086115


### Feature selection

In [31]:
# for lightgbm
pipeline_base = Pipeline([
    ('pipe_feat',pipeline_feat),
    ('preprocessor',preprocessor),
    ('model',LGBMRegressor())
])

#run to get the best features
#rfecv(X_train,y_train,pipeline_base,min_features_to_select=10,step = 1, cv = cv,scoring_metric="neg_root_mean_squared_error",groups = groups)

In [None]:
# for xgboost 
pipeline_base = Pipeline([
    ('pipe_feat',pipeline_feat),
    ('preprocessor',preprocessor),
    ('model',XGBRegressor(random_state = 42,n_jobs=-1,tree_method = "hist"))
])

rfecv(X_train,y_train,pipeline_base,min_features_to_select=15,step = 1, cv = cv,scoring_metric="neg_root_mean_squared_error",groups = groups)

In [None]:
# for catboost
pipeline_base = Pipeline([
    ('pipe_feat',pipeline_feat),
    ('preprocessor',preprocessor),
    ('model',CatBoostRegressor(iterations = 300,silent = True))
])

rfecv(X_train,y_train,pipeline_base,min_features_to_select=15,step = 1, cv = cv,scoring_metric="neg_root_mean_squared_error",groups = groups)

using the syntax from above for another model, i can get the best feature to use for my model. Im also explicitly stating the feature should i drop, so in the end i get the 2 part of the best feature for each model

In [32]:
drop_cols_lgb = ['cpi', 'district_area', 'num_schools_1km', 'floor', 'rentindex', 'marketsegment', 'typeofarea', 'geo_type', 'month', 'vacant', 'tenure', 'interestrate', 'available', 'wind_dir', 'quarter', 'year', 'tenure_market', 'tenure_type', 'mean_building']
drop_cols_xgb = ['interestrate', 'cpi', 'wind_dir', 'available', 'vacant', 'quarter', 'year', 'tenure_market', 'tenure_type', 'mean_building']
drop_cols_catb = ['cpi', 'month', 'floor', 'year', 'geo_type', 'rentindex', 'vacant', 'quarter', 'wind_dir', 'interestrate', 'available', 'tenure_market', 'tenure_type', 'tenure']
drop_cols_catb2= ['mean_building', 'geo_type', 'rentindex', 'available', 'month', 'vacant', 'interestrate', 'year', 'wind_dir', 'tenure_market',
                  'quarter', 'tenure', 'tenure_type','district_area','street','project']
drop_cols_xgb2 = ['month', 'geo_type', 'rentindex', 'interestrate', 'available', 'cpi', 'wind_dir', 'mean_building', 'vacant', 'quarter', 'year', 
                  'tenure_market', 'tenure_type','district_area','street','project']
drop_cols_lgb2 = ['floor', 'rentindex', 'geo_type', 'month', 'typeofarea', 'tenure', 'interestrate', 'vacant', 'available', 'wind_dir', 'mean_building', 
                  'quarter', 'year', 'tenure_market', 'tenure_type','district_area','street','project']

# Stacking
for final model i used stacking method with LightGBM for my final estimators, i have 9 total of models with consist of 3 baseline models, and 6 models that have been hyperparameter tuned. 

In [36]:
pipeline_catb = Pipeline([
    ('drop_cols',FunctionTransformer(lambda x: x.drop(drop_cols_catb, axis=1))),
    ("algo",CatBoostRegressor(iterations = 300,silent = True,colsample_bylevel = 0.9599,max_depth = 8 ))
])

pipeline_lgb = Pipeline([
    ('drop_cols',FunctionTransformer(lambda x: x.drop(drop_cols_lgb, axis=1))),
    ("algo",LGBMRegressor(n_jobs = -1,colsample_bytree = 0.8,learning_rate = 0.14166,max_depth = 20,num_leaves = 144,
                           reg_alpha = 10,reg_lambda = 2.8426,subsample = 0.229))
])

pipeline_xgb = Pipeline([
    ('drop_cols',FunctionTransformer(lambda x: x.drop(drop_cols_xgb, axis=1))),
    ("algo",XGBRegressor(colsample_bytree = 0.5655,gamma = 2,learning_rate = 0.0434,max_depth = 10,n_estimators = 196,
                         reg_alpha = 3.795386101923234,reg_lambda = 0.001,subsample = 0.4933248871865059,random_state = 42,n_jobs = -1,tree_method = "hist"))
]) 

estimators = [
    ("CatB", CatBoostRegressor(iterations = 300,silent = True,colsample_bylevel = 0.81959, max_depth = 8)),
    ("LGBM", LGBMRegressor(random_state = 42,n_jobs = -1,colsample_bytree = 0.5526,learning_rate = 0.08664,max_depth = 10,num_leaves = 155,
                           reg_alpha = 0.1458,reg_lambda = 1.38465,subsample = 0.72301)),
    ("XGB", XGBRegressor(colsample_bytree = 0.64846,gamma = 9,learning_rate = 0.0425,max_depth = 10,n_estimators = 167,reg_alpha = 0.0165,
                         reg_lambda = 0.07,subsample = 0.44,random_state = 42,n_jobs = -1,tree_method = "hist")),
    ("CatB2", pipeline_catb),
    ("LGBM2", pipeline_lgb),
    ("XGB2", pipeline_xgb)
]

reg = StackingRegressor(
    estimators=estimators,
    final_estimator= LGBMRegressor(random_state = 42)
)

reg_xgb = StackingRegressor(
    estimators=estimators,
    final_estimator= XGBRegressor(random_state = 42,tree_method = "hist")
)

voting = VotingRegressor(
    estimators = estimators
)

list_model = {
    'stack1' : reg,
    "stack2" : reg_xgb,
    "voting" : voting
}

df_model_stack = train_model_ts(list_model,X_train,y_train,X_test,y_test,mean_squared_error,cv,
                        "neg_root_mean_squared_error",pipeline_base,groups)

model stack1 selesai di training
score test 396835.5255616923
score cv -453974.0373185974
score train 337876.6682145801
model stack2 selesai di training
score test 411736.64360886684
score cv -469501.3772593538
score train 337579.7132624123
model voting selesai di training
score test 377446.47130130866
score cv -429777.7355599934
score train 186252.99858357332


# Submission

In [None]:
df_test['prediction'] = df_model_stack.model[0].predict(df_test)
df_sub = df_test[['property_key','dataseries','prediction']]
df_sub.columns = ['property_key','contractDate','prediction']
df_sub.to_csv("voting3.csv",index=False)  