In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the two CSV files from ./data, then preview the first training rows.
train_csv = './data/diamonds_train.csv'
predict_csv = './data/diamonds_predict.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(predict_csv)
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [6]:
# Column roles: the value to predict and the model inputs, grouped by kind.
target = 'price'
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
cat_features = ['cut', 'color', 'clarity']
features = cat_features + num_features

# Store every categorical column with pandas' 'category' dtype in both frames.
for frame in (train, test):
    for column in cat_features:
        frame[column] = frame[column].astype('category')

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out part of the labelled data to evaluate the model.
# The split frames get their own names instead of rebinding `train`/`test`:
# that keeps the frames loaded from disk intact and lets this cell be re-run
# without splitting an already-split `train` again. The fixed random_state
# makes the split, and therefore the reported scores, reproducible.
train_split, valid_split = train_test_split(train, random_state=42)
X_train = train_split[features]
y_train = train_split[target]
X_test = valid_split[features]
y_test = valid_split[target]

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn import ensemble
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [10]:
# Numeric columns: fill missing values with the median, then standardise
# (StandardScaler rescales using each column's mean and variance).
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode into 0/1 indicator columns. handle_unknown='ignore' asks the
# encoder not to raise on categories it did not see while fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [11]:
# Send each group of columns through its matching preprocessing pipeline.
column_routes = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [14]:
# Seed shared by every randomised base learner so that repeated runs of the
# notebook produce the same stacked model.
RANDOM_STATE = 42

# Level-0 (base) learners whose predictions feed the final estimator.
# KNeighborsRegressor and SVR take no seed here; the tree-based learners do.
level0 = [
    ('extra', ensemble.ExtraTreesRegressor(n_estimators=100, min_samples_leaf=2,
                                           random_state=RANDOM_STATE)),
    ('cart', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('knn', KNeighborsRegressor()),
    ('svm', SVR()),
    ('forest', ensemble.RandomForestRegressor(n_estimators=110, min_samples_leaf=2,
                                              random_state=RANDOM_STATE)),
    ('boost', ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE)),
]

# Level-1 (final) learner that combines the base learners' predictions.
level1 = linear_model.LinearRegression()

In [15]:
# Stack the base learners under the final estimator, using 5-fold
# cross-validation (cv=5) inside the stacking step.
model = StackingRegressor(
    estimators=level0,
    final_estimator=level1,
    cv=5,
)

In [16]:
# Chain preprocessing and the stacked regressor so both are fitted together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
mod = Pipeline(steps=pipeline_steps)

# fit() returns the fitted pipeline, which is kept under its own name.
mod_fit = mod.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out rows (displayed as cell output).
mod_fit.score(X_test, y_test)



0.9802740083280991

In [17]:
# Predict the target for the held-out rows with the fitted pipeline.
y_predict = mod_fit.predict(X=X_test)

In [18]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the held-out rows.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and later removed, so this
# form works on both old and new releases.
np.sqrt(mean_squared_error(y_test, y_predict))

569.8041786481026

In [None]:
# The prediction with a stacked model does not seem to bring an advantage compared to the random forest model.