In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from functions import *

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor, make_column_selector

# 2. Model selection

We will apply the lessons learned during the data exploration to our dataset, before looking into potential regression algorithms for our specific problem.

In [3]:
# Load the diamonds dataset; the first CSV column is used as the row index.
diam_data = pd.read_csv(filepath_or_buffer='diamonds.csv', index_col=0)
diam_data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Our ultimate target is the price, which can be recovered as $\text{price} = (\text{price}/\text{carat}) \times \text{carat}$, the carat being the weight of the diamond. For our model, however, the target will be the price per carat.

In [4]:
# Regression target: price per carat, computed element-wise for every row.
y = diam_data['price'] / diam_data['carat']
y.head(n=5)

1    1417.391304
2    1552.380952
3    1421.739130
4    1151.724138
5    1080.645161
dtype: float64

In [5]:
# Feature matrix: every column of the dataset except the raw price.
X = diam_data.drop(labels='price', axis='columns')
X.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


We know that one required step is the log-transformation of our target variable.

In [6]:
# Wrapper that fits the regressor on log(target) and maps its predictions
# back to the original scale with exp.
target_transform = TransformedTargetRegressor(
    func=np.log,          # forward transform applied to the target
    inverse_func=np.exp,  # inverse transform applied to the predictions
    regressor=None,       # placeholder: to be set for each model
)

Another step includes binning the 'carat' column, before using it in the model.

Scikit-learn does not have a simple function to achieve this with given bin edges, so we will create one that can be used in a pipeline.

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

In [8]:
class CustomBinDiscretizer(BaseEstimator, TransformerMixin):
    """Bin every column of a DataFrame with ``pandas.cut`` and fixed bin edges.

    The constructor mirrors the keyword arguments of ``pandas.cut`` so that
    the transformer can be configured (and cloned / grid-searched) like any
    other scikit-learn estimator.

    Parameters
    ----------
    bins : int, sequence of scalars, or IntervalIndex
        Bin specification forwarded to ``pandas.cut``.
    right : bool, default True
        Whether the bins include their right edge.
    labels : array-like, False or None, default None
        Labels of the returned bins; ``False`` returns integer bin indicators.
    retbins : bool, default False
        Kept for signature compatibility only. ``transform`` must return a
        DataFrame, so the bin edges are never returned and this flag is
        ignored when calling ``pandas.cut``.
    precision : int, default 3
        Precision used when building the bin labels.
    include_lowest : bool, default False
        Whether the first interval is left-inclusive.
    duplicates : {'raise', 'drop'}, default 'raise'
        Behaviour when the bin edges are not unique.
    ordered : bool, default True
        Whether the resulting labels are ordered.
    """

    def __init__(self,
                 bins,
                 right: bool = True,
                 labels=None,
                 retbins: bool = False,
                 precision: int = 3,
                 include_lowest: bool = False,
                 duplicates: str = 'raise',
                 ordered: bool = True):
        # scikit-learn convention: store every argument unchanged under its own name.
        self.bins = bins
        self.right = right
        self.labels = labels
        self.retbins = retbins
        self.precision = precision
        self.include_lowest = include_lowest
        self.duplicates = duplicates
        self.ordered = ordered

    def fit(self, X, y=None):
        """Do nothing (the bins are user-supplied) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` in which every column is replaced by its bins.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns to discretize; all of them are binned with the same edges.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Same index and column names as ``X``. Each column holds the output
            of ``pandas.cut`` (values falling outside the bins become NaN).

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Only pandas dataframes can be used as inputs for this function")

        # Build the keyword arguments explicitly rather than forwarding
        # `self.__dict__`: any extra attribute set on the instance would
        # otherwise reach `pd.cut` as an unexpected keyword. `retbins` is
        # forced to False because a (values, bins) tuple cannot be stored
        # in a column.
        cut_kwargs = dict(
            bins=self.bins,
            right=self.right,
            labels=self.labels,
            retbins=False,
            precision=self.precision,
            include_lowest=self.include_lowest,
            duplicates=self.duplicates,
            ordered=self.ordered,
        )

        X_new = X.copy()
        for col in X.columns:
            # Replace the whole column so that it takes the dtype produced by
            # `pd.cut` instead of being cast into the original column's dtype.
            X_new[col] = pd.cut(x=X.loc[:, col].values, **cut_kwargs)
        return X_new

In [9]:
# Discretizer for the carat column: six bins, the last one open-ended.
discretizer = CustomBinDiscretizer(
    bins=[0, 0.5, 0.75, 1, 1.5, 2, np.inf],
    labels=False,  # no labels: use the integer bin indicators directly
)

Depending on the algorithm used, we may also need additional steps:
- Encoding (ordinally, or one-hot) categorical variables
- Standard-scaling numerical values

In [10]:
#Define ordinal order for the encoded categorical variables
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [11]:
#Initialize encoders
oh_encoder = OneHotEncoder()
or_encoder = OrdinalEncoder(categories = [cut_order, clarity_order])

In [12]:
#Initialize the standard scaler (default settings) for the numerical features
scaler = StandardScaler()

In [13]:
# Route each group of columns to its own transformer; every column that is
# not listed below is dropped from the output.
preprocess = ColumnTransformer(
    transformers=[
        ('discretizer', discretizer, ['carat']),          # binned carat
        ('oh_encoder', oh_encoder, ['color']),            # one-hot encoded color
        ('or_encoder', or_encoder, ['cut', 'clarity']),   # ordinally encoded cut and clarity
        ('scaler', scaler, ['depth', 'table', 'x']),      # standard-scaled numerical columns
    ],
    remainder='drop',
)

We also need to impute the missing 'x' values (encoded as 0 in the data), using iterative imputation from scikit-learn.

In [14]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [15]:
#Values equal to 0 are treated as missing; random_state fixes the seed of the imputer
imputer = IterativeImputer(missing_values=0, random_state=50)

In [16]:
# Run the imputer on 'carat' and 'x' together; all other columns are passed
# through unchanged.
# NOTE(review): a ColumnTransformer returns an array by default, while
# CustomBinDiscretizer only accepts DataFrames - confirm that pandas output
# is configured wherever this step is chained with `preprocess`.
prep_imputer = ColumnTransformer(
    transformers=[
        ('imputer', imputer, ['carat', 'x']),
    ],
    remainder='passthrough',
)

In [17]:
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor