In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn import set_config
set_config(display='diagram')

In [2]:
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)

In [3]:
df = pd.read_csv('rings.csv')
df = df[df.columns[2:]]
df = df[df.matter != 'silver_and_gold']
df.drop('price_after_discount', axis=1, inplace=True)

In [4]:
df.drop('injection_params', axis=1, inplace=True)

In [5]:
X, y = df[df.columns[:df.columns.shape[0] - 1]], df[df.columns[-1]]
X.info(), y

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9140 entries, 0 to 9147
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   matter     9140 non-null   object 
 1   weight     9140 non-null   float64
 2   size       9140 non-null   float64
 3   is_defect  9140 non-null   int64  
 4   platemark  9140 non-null   object 
 5   type       9140 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 499.8+ KB


(None,
 0       1062.86
 1        865.87
 2        865.87
 3        783.84
 4        783.84
          ...   
 9143      64.29
 9144      68.46
 9145      82.85
 9146      84.53
 9147      85.06
 Name: price_before_takeoff, Length: 9140, dtype: float64)

In [6]:
cats = cat_selector(X)
for cat in cats:
    print(X[cat].unique())

['gold' 'silver']
['585' '375' '925']
['wedding' 'decorative' 'engagement' 'pechatki']


In [7]:
nums = num_selector(X)
for num in nums:
    print(X[num].isna().sum())

0
0
0


In [8]:
cat_linear_processor = OneHotEncoder()
num_linear_processor = StandardScaler()
preproc = make_column_transformer(
    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
preproc

In [9]:
from xgboost import XGBRegressor

xgb_pipe = make_pipeline(preproc, XGBRegressor())
xgb_pipe

In [10]:
from sklearn.linear_model import LinearRegression

lr_pipe = make_pipeline(preproc, LinearRegression())
lr_pipe

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0)

In [12]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' :[10, 50, 100, 250, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1]    
}
cv = GridSearchCV(xgb_pipe, params, scoring='neg_mean_absolute_error', return_train_score=False)

In [13]:
xgb_pipe.fit(X_train, y_train)
pr = xgb_pipe.predict(X_test)

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

In [15]:
mean_squared_error(pr, y_test)

645641.5316700914

In [16]:
mean_absolute_percentage_error(pr, y_test)

0.4059887252155697

In [17]:
mean_absolute_error(pr, y_test)

189.25062205141728

In [18]:
lr_pipe.fit(X_train, y_train)
pr = lr_pipe.predict(X_test)

In [19]:
mean_squared_error(pr, y_test)

771993.3405949288

In [20]:
mean_absolute_percentage_error(pr, y_test)

1908170833167950.5

In [21]:
mean_absolute_error(pr, y_test)

380.56034139874043