In [2]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv("./data/gemstone.csv")

In [8]:
data.drop(labels = 'id', axis=1, inplace=True)

In [18]:
X = data.drop(labels='price', axis=1)
y = data[['price']]

In [22]:
categorical_cols = X.select_dtypes(include='object').columns

In [25]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [23]:
numerical_cols = X.select_dtypes(exclude='object').columns

In [24]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [26]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [27]:
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler ## Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [28]:
num_pipeline=Pipeline(

    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler())


    ]

)

In [29]:
cat_pipeline=Pipeline(

    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))


    ]

    
)

In [30]:
preprocessor=ColumnTransformer(

    [

        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]



)

In [31]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [32]:
preprocessor.fit_transform(X_train)



array([[ 0.56167397,  0.31982286, -0.6510053 , ...,  4.        ,
         3.        ,  4.        ],
       [-0.60408689, -0.39681436,  1.58987791, ...,  2.        ,
         1.        ,  2.        ],
       [-0.52081826, -0.25348692, -1.09918195, ...,  4.        ,
         4.        ,  1.        ],
       ...,
       [ 1.87315495,  0.7498052 , -0.20282866, ...,  2.        ,
         4.        ,  4.        ],
       [-1.04124722,  1.03646009, -0.20282866, ...,  1.        ,
         0.        ,  1.        ],
       [-0.54163542, -0.68346925, -0.20282866, ...,  4.        ,
         0.        ,  2.        ]])

In [34]:
preprocessor.transform(X_test)

array([[-8.33075636e-01,  1.76495416e-01,  1.58987791e+00, ...,
         3.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       [ 2.91050915e-01, -9.70124145e-01,  6.93524626e-01, ...,
         3.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 6.24125449e-01, -3.96814364e-01, -2.02828660e-01, ...,
         3.00000000e+00,  6.00000000e+00,  3.00000000e+00],
       ...,
       [-1.04124722e+00,  8.21468919e-01, -2.02828660e-01, ...,
         2.00000000e+00,  2.00000000e+00,  3.00000000e+00],
       [ 4.36771024e-01, -1.81823197e-01,  6.93524626e-01, ...,
         3.00000000e+00,  3.00000000e+00,  2.00000000e+00],
       [ 4.15953865e-01, -1.01840275e-14,  1.14170127e+00, ...,
         3.00000000e+00,  1.00000000e+00,  1.00000000e+00]])

In [33]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [35]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [38]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.561674,0.319823,-0.651005,0.704838,0.692859,0.723432,4.0,3.0,4.0
1,-0.604087,-0.396814,1.589878,-0.533052,-0.486169,-0.533329,2.0,1.0,2.0
2,-0.520818,-0.253487,-1.099182,-0.409263,-0.376688,-0.410385,4.0,4.0,1.0
3,-0.999613,0.248159,-1.099182,-1.178523,-1.134635,-1.107067,4.0,4.0,4.0
4,0.228599,0.964796,-0.651005,0.413049,0.330729,0.477544,2.0,4.0,3.0


In [39]:
X_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.833076,0.176495,1.589878,-0.904419,-0.898829,-0.86118,3.0,1.0,2.0
1,0.291051,-0.970124,0.693525,0.536838,0.532848,0.409242,3.0,1.0,1.0
2,0.624125,-0.396814,-0.202829,0.819784,0.734967,0.709771,3.0,6.0,3.0
3,0.832297,-1.328443,2.934408,0.978942,0.886556,0.737092,1.0,1.0,2.0
4,-1.041247,-0.253487,0.693525,-1.29347,-1.21043,-1.243672,2.0,3.0,7.0
