In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor,\
ExtraTreesRegressor, StackingRegressor


from xgboost import XGBRegressor

from sklearn import set_config
set_config(display='diagram')
import warnings
warnings.simplefilter('ignore')

In [2]:
def get_metrics(y_tr, y_pr):
    print("MAE:   %.3f"%mean_absolute_error(y_tr, y_pr))
    print("MSE:   %.3f"%mean_squared_error(y_tr, y_pr))
    print("MAPE:  %.3f"%mean_absolute_percentage_error(y_tr, y_pr))

### Custom classes for pipeline construction

In [40]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    The class provides basic functionality for retrieving
    a subset of columns from the dataset.
    """
    
    def __init__(self, feature_names):
        """
        Initialize class instance by setting
        a list of columns to retrieve from the dataset.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.feature_names = feature_names
        
    def fit(self, X, y=None):
        """
        Fit FeatureSelector to X, but really do nothing.
        Return self.
        """
        return self
    
    def transform(self, X, y=None):
        """
        Transform X using feature selection. 
        Return column-subset of X.
        """
        return X[self.feature_names]

In [41]:
class PlatemarkTransformer(BaseEstimator, TransformerMixin):
    """
    The class provides functionality for converting matter
    columns to year values. Extracts year from string.
    """
    
    def __init__(self, drop=False):
        """
        Initializes class instance by setting convert options. 
        
        Parameters
        ----------
        drop: bool, 
            if True, removes the original columns from the dataset.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.drop = drop
    
    def fit(self, X, y=None):
        """
        Fit DateTransformer to X, but really do nothing.
        Return self.
        """
        return self
    
    def transform(self, X, y=None):
        """
        Transfor X using the parameters set in the constructor.
        Return transformed dataframe.
        """
        X['matter'] = X['matter'].add(X['platemark'])
        X = pd.get_dummies(X, columns=['matter'])
        if self.drop:
            X.drop('platemark', axis=1, inplace=True)
        return X

In [62]:
class ParseInjection(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        i = 0
        for inj in X.injection_params:
            if inj == "None":
                i += 1
                continue
            stones = dict()
            inj_strs = (inj.replace(', ', '|').replace('. ', '|')
                        .replace(';', '|').split('|'))
            for inj_substr in inj_strs:
                str_split = inj_substr.split()
                if str_split[0].isdigit() and (str_split[1].isalpha() or 
                                               str_split[1] in ["Кер.кольцо", "Гор.хр"]):
                    stones.setdefault(str_split[1].lower(), 0)
                    stones[str_split[1].lower()] += int(str_split[0])
                elif any([True if "БРКр" in x else False for x in str_split]):
                    for sub in str_split:
                        if "БРКр" in sub:
                            stones.setdefault("бриллиант", 0)
                            br_num = sub.split("БРКр")[0]
                            stones["бриллиант"] += (int(br_num) if "Родий" not in sub 
                                                    else int(br_num.split("Родий")[1]))
                elif str_split[0][1:] in ["Сапфир", "Изумруд"]:
                    stones.setdefault(str_split[0][1:].lower(), 0)
                    stones[str_split[0][1:].lower()] += int(str_split[0][0])
                elif str_split[0].isdigit() and str_split[2] in ["Сапфир", "Изумруд"]:
                    stones.setdefault(str_split[2].lower(), 0)
                    stones[str_split[2].lower()] += int(str_split[0])
                elif len(str_split) > 2 and str_split[2] == "Эмаль":
                    stones.setdefault(str_split[2].lower(), 0)
                    stones[str_split[2].lower()] += int(str_split[0])
            for name, num in stones.items():
                name = name if name not in ["бр", "брилл"] else "бриллиант"
                X.loc[X.index[i], name] = num
            i += 1
        return X.drop(["injection_params", "куб"], axis=1)

In [75]:
plate_pipeline = Pipeline(
    steps=[
        ('plate_selector', FeatureSelector(['matter', 'platemark'])),
        ('plate_transformer', PlatemarkTransformer(drop=True))
    ]
)

cat_ohe = OneHotEncoder(sparse=False)
cat_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(['matter', 'type'])),
        ('cat_encoder', cat_ohe)
    ]
)

num_scale_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(['weight', 'size'])),
        ('standard_scaler', MinMaxScaler())
    ]
)

num_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(['is_defect']))
    ]
)
 
injection_pipeline = Pipeline(
    steps=[
        ("injection_selector", FeatureSelector(X.columns[6:])),
        ("injection_imp", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0))
    ]
)

In [78]:
full_pipeline = FeatureUnion(transformer_list=[
    ('plate', plate_pipeline),
    ('cat', cat_pipeline),
    ('num_scale', num_scale_pipeline),
    ('num', num_pipeline),
    ("injection", injection_pipeline)
])
full_pipeline

In [80]:
df = pd.read_csv('rings.csv', index_col=0)

In [81]:
df

Unnamed: 0,id,matter,injection_params,weight,size,is_defect,platemark,type,price_before_takeoff,price_after_discount
0,Т132013977,gold,"3 фианит 0,8 Круг",1.86,15.0,1,585,wedding,1062.86,340.12
1,Т132013977,gold,"3 фианит 0,8 Круг",1.90,15.0,0,585,wedding,865.87,389.64
2,Т132013977,gold,"3 фианит 0,8 Круг",1.90,15.0,0,585,wedding,865.87,389.64
3,Т132013977,gold,"3 фианит 0,8 Круг",1.72,15.0,0,585,wedding,783.84,352.73
4,Т132013977,gold,"3 фианит 0,8 Круг",1.72,15.0,0,585,wedding,783.84,352.73
...,...,...,...,...,...,...,...,...,...,...
9143,Z1-8827,silver,,1.09,17.5,0,925,engagement,64.29,38.57
9144,Z1-8827,silver,,1.16,17.0,0,925,engagement,68.46,41.08
9145,89010096,silver,1 Фианит SWAROVSKI Круг 6,1.51,16.5,0,925,decorative,82.85,49.71
9146,89010096,silver,1 Фианит SWAROVSKI Круг 6,1.54,16.0,0,925,decorative,84.53,50.72


In [82]:
features = ['matter', 'injection_params', 'weight', 'size',
       'is_defect', 'platemark', 'type']

X, y = df[features], df['price_before_takeoff']

In [83]:
pi = ParseInjection().fit(X)

In [84]:
X = pi.transform(X)

In [85]:
X.columns[6:]

Index(['фианит', 'бриллиант', 'кер.кольцо', 'cvd', 'эмаль', 'топаз',
       'наношпинель', 'цирконий', 'оникс', 'сапфир', 'изумруд', 'жемчуг',
       'кристалл', 'гранат', 'гор.хр', 'цитрин', 'корунд', 'аметист'],
      dtype='object')

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## ExtraTreesRegressor

In [113]:
X.columns.size

24

In [135]:
params = {
    "max_depth": range(1, 14, 4),
    'n_estimators': range(50, 126, 25),
    'max_features': range(1, 25, 4),
    'min_samples_leaf': range(1, 17, 5),
    'min_samples_split': range(1, 27, 5),
}

extra_grid = GridSearchCV(
    estimator=ExtraTreesRegressor(random_state=42),
    param_grid=params,
    n_jobs=-1,
    scoring=("neg_mean_squared_error"),
    refit=True,
)

In [136]:
pipET = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('ExtraTree', extra_grid)
    ]
)
pipET.fit(X_train, y_train)

In [137]:
extra_grid.best_params_

{'max_depth': 13,
 'max_features': 17,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 50}

In [138]:
get_metrics(y_test, pipET.predict(X_test))

MAE:   136.148
MSE:   249054.478
MAPE:  0.204


n_est: 100
MAE:   95.152
MSE:   185614.020
MAPE:  0.128

max_features: 21
MAE:   96.271
MSE:   204223.732
MAPE:  0.129

min_samples_leaf: 20
MAE:   154.250
MSE:   314368.952
MAPE:  0.239

min_samples_split: 15
MAE:   105.070
MSE:   207418.744
MAPE:  0.142

In [88]:
X_train

Unnamed: 0,matter,weight,size,is_defect,platemark,type,фианит,бриллиант,кер.кольцо,cvd,...,оникс,сапфир,изумруд,жемчуг,кристалл,гранат,гор.хр,цитрин,корунд,аметист
8482,silver,1.69,18.5,0,925,decorative,,,,,...,,,,,,,,,,
5225,gold,2.44,22.5,0,585,wedding,,,,,...,,,,,,,,,,
3328,gold,2.29,21.0,0,585,wedding,,,,,...,,,,,,,,,,
8428,gold,1.30,17.5,0,585,engagement,,1.0,,,...,,,,,,,,,,
4031,silver,6.47,16.0,0,925,decorative,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,gold,2.51,19.0,0,585,wedding,,,,,...,,,,,,,,,,
5191,gold,2.10,19.5,0,585,wedding,,,,,...,,,,,,,,,,
5390,gold,2.65,20.0,0,585,wedding,,,,,...,,,,,,,,,,
860,gold,2.46,17.0,0,375,wedding,,,,,...,,,,,,,,,,


In [90]:
predict = pipET.predict(X_test)

In [93]:
mean_absolute_percentage_error(y_test, predict)

0.1235029474627417

In [94]:
get_metrics(y_test, predict)

MAE:   89.912
MSE:   169044.868
MAPE:  0.124


In [139]:
pipET = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('ExtraTree', ExtraTreesRegressor(random_state=42, criterion='mae', n_jobs=-1))
    ]
)
pipET.fit(X_train, y_train)
y_true, y_pred = y_test, pipET.predict(X_test)
get_metrics(y_true, y_pred)

MAE:   89.912
MSE:   169044.868
MAPE:  0.124


## RandomForest

In [95]:
pipRF = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('RFR', RandomForestRegressor(random_state=42, n_jobs=-1))
    ]
)
pipRF.fit(X_train, y_train)
y_true, y_pred = y_test, pipRF.predict(X_test)
get_metrics(y_true, y_pred)

MAE:   98.076
MSE:   167436.033
MAPE:  0.143


In [96]:
params = {
    'n_estimators': np.arange(10, 200, 50),
    'criterion': ('mse', 'mae'),
    'max_depth': np.arange(1, 16, 5)
}

pip = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('gc', GridSearchCV(RandomForestRegressor(random_state=42), params, n_jobs=-1,
                                scoring='accuracy', cv=5, refit=True, verbose=2))
    ]
)
pip.fit(X_train, y_train)
y_true, y_pred = y_test, pip.predict(X_test)
print('Best params found:\n', pip['gc'].best_params_)
get_metrics(y_true, y_pred)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best params found:
 {'criterion': 'mse', 'max_depth': 1, 'n_estimators': 10}
MAE:   397.112
MSE:   610167.235
MAPE:  1.286


Best params found:
 {'criterion': 'mse', 'max_depth': 1, 'n_estimators': 10}
MAE:   345.615
MSE:   582424.397
MAPE:  0.481

## Results

## XGBoost

In [99]:
pip = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('XGBReg', XGBRegressor(random_state=42, eta=0.01, max_depth=3, reg_lambda=0))
    ]
)
pip.fit(X_train, y_train)
y_true, y_pred = y_test, pip.predict(X_test)
#print('Best params found:\n', pip['gc'].best_params_)
get_metrics(y_true, y_pred)

MAE:   394.171
MSE:   605497.079
MAPE:  0.340


## BaggingRegressor

## AdaBoostRegressor

## StackingRegressor