In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor,\
ExtraTreesRegressor, StackingRegressor


from xgboost import XGBRegressor

# Ask scikit-learn to render estimators and pipelines as diagrams when a cell
# displays them.
from sklearn import set_config
set_config(display='diagram')
# Silence every Python warning from this point on.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals will only surface as hard errors after an upgrade.
import warnings
warnings.simplefilter('ignore')

In [2]:
def get_metrics(y_tr, y_pr):
    """
    Print three regression error metrics, one per line.

    Parameters
    ----------
    y_tr:
        ground-truth target values (passed first to each metric).
    y_pr:
        predicted target values (passed second to each metric).

    Prints MAE, MSE and MAPE with three decimals. Returns None.
    """
    report = (
        ("MAE:   %.3f", mean_absolute_error),
        ("MSE:   %.3f", mean_squared_error),
        ("MAPE:  %.3f", mean_absolute_percentage_error),
    )
    for template, metric in report:
        print(template % metric(y_tr, y_pr))

### Custom classes for pipeline construction

In [3]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only a chosen subset of columns.

    Parameters
    ----------
    feature_names:
        column label(s) used to index X in ``transform``.
    """

    def __init__(self, feature_names):
        """
        Remember which columns should be picked later.
        """
        super().__init__()
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """
        Nothing is learned from X; the method exists so the class
        can be used as a pipeline step.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Index X with the configured column labels.
        Return the resulting column-subset of X.
        """
        subset = X[self.feature_names]
        return subset

In [4]:
class PlatemarkTransformer(BaseEstimator, TransformerMixin):
    """
    Merge the 'matter' and 'platemark' columns into a single combined
    feature and one-hot encode it.

    The combined value of a row is ``matter + platemark`` (element-wise
    ``Series.add``). The set of combined values is learned in ``fit`` so
    that ``transform`` always emits the same dummy columns, in the same
    order, regardless of which values appear in the frame being transformed.

    Parameters
    ----------
    drop: bool,
        if True, removes the original 'platemark' column from the output.

    Attributes
    ----------
    categories_: pandas.Index
        combined matter+platemark values observed during ``fit``.
    """

    def __init__(self, drop=False):
        """
        Initializes class instance by setting convert options.

        Parameters
        ----------
        drop: bool,
            if True, removes the original 'platemark' column from the output.
        """
        self.drop = drop

    @staticmethod
    def _combine(X):
        """Return the element-wise sum of the 'matter' and 'platemark' columns."""
        return X['matter'].add(X['platemark'])

    def fit(self, X, y=None):
        """
        Learn the combined matter+platemark categories present in X.
        Return self.
        """
        # pd.Categorical infers categories the same way get_dummies does for a
        # plain column, so column names and order match the unfitted behaviour.
        self.categories_ = pd.Categorical(self._combine(X)).categories
        return self

    def transform(self, X, y=None):
        """
        Replace 'matter' with the combined feature and one-hot encode it.

        The input frame is not modified. Combined values that were not seen
        during ``fit`` become missing and therefore get all-zero dummy
        columns. If ``fit`` was never called, categories are inferred from X
        itself.

        Return the transformed dataframe.
        """
        # Fall back to per-call inference when the transformer is unfitted.
        categories = getattr(self, 'categories_', None)
        combined = pd.Categorical(self._combine(X), categories=categories)

        # assign() builds a new frame, leaving the caller's X untouched.
        encoded = pd.get_dummies(X.assign(matter=combined), columns=['matter'])
        if self.drop:
            encoded = encoded.drop('platemark', axis=1)
        return encoded

In [5]:
# Branch 1: combined matter+platemark feature, one-hot encoded.
plate_pipeline = Pipeline(
    steps=[
        ('plate_selector', FeatureSelector(['matter', 'platemark'])),
        ('plate_transformer', PlatemarkTransformer(drop=True))
    ]
)

# Branch 2: plain one-hot encoding of the categorical columns.
# handle_unknown='ignore' encodes categories that were not present at fit time
# as all zeros instead of raising, so predict() does not fail on unseen values.
# NOTE(review): `sparse` was renamed to `sparse_output` in newer scikit-learn
# releases; kept as-is to match the version this notebook was written for.
cat_ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(['matter', 'type'])),
        ('cat_encoder', cat_ohe)
    ]
)

# Branch 3: numeric columns rescaled with MinMaxScaler.
# The step is named 'standard_scaler' although it holds a MinMaxScaler; the
# name is left unchanged because it is part of the pipeline's parameter path.
num_scale_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(['weight', 'size'])),
        ('standard_scaler', MinMaxScaler())
    ]
)

# Branch 4: 'is_defect' is passed through without any transformation.
num_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(['is_defect']))
    ]
)

In [6]:
# Apply all four branches to the same input and concatenate their outputs
# in the order listed here.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('plate', plate_pipeline),
        ('cat', cat_pipeline),
        ('num_scale', num_scale_pipeline),
        ('num', num_pipeline),
    ]
)
# Bare expression so the notebook displays the assembled pipeline.
full_pipeline

In [7]:
# Load the dataset from the working directory.
df = pd.read_csv('rings.csv')

# Input columns handed to the preprocessing pipeline.
features = [
    'matter', 'injection_params', 'weight', 'size',
    'is_defect', 'platemark', 'type',
]

# Model inputs and regression target.
X = df[features]
y = df['price_before_takeoff']

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## ExtraTreesRegressor

In [9]:
# Preprocessing followed by an extra-trees ensemble.
# NOTE(review): criterion='mae' is the older spelling; newer scikit-learn
# releases call it 'absolute_error' -- confirm against the installed version.
pipET = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        (
            'ExtraTree',
            ExtraTreesRegressor(random_state=42, criterion='mae', n_jobs=-1),
        ),
    ]
)
pipET.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipET.predict(X_test)
y_true = y_test
get_metrics(y_true, y_pred)

MAE:   131.543
MSE:   338071.733
MAPE:  0.163


## RandomForestRegressor

In [10]:
# Preprocessing followed by a random-forest ensemble with default settings.
pipRF = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        (
            'RFR',
            RandomForestRegressor(random_state=42, n_jobs=-1),
        ),
    ]
)
pipRF.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipRF.predict(X_test)
y_true = y_test
get_metrics(y_true, y_pred)

MAE:   139.202
MSE:   326967.171
MAPE:  0.182


## Results

## XGBoost

## BaggingRegressor

## AdaBoostRegressor

## StackingRegressor