In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("./data/train.csv")
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
data = data.drop("id", axis=1)

In [4]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [7]:
data.shape

(193573, 10)

In [6]:
X = data.iloc[:,:-1]
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [8]:
y = data.iloc[:,-1]
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [30]:
categorical_X_features = X.select_dtypes(include="object").columns
numerical_X_features = X.select_dtypes(exclude="object").columns

In [31]:
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.preprocessing import StandardScaler # To scale features
from sklearn.preprocessing import OrdinalEncoder # To encode categorical features

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [53]:
cut_categories = ["Fair","Good","Very Good","Premium", "Ideal"]
color_categories = ["D", "E", "F", "G", "H", "I", "J"]
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"] 

In [54]:
numerical_pipeline = Pipeline(

        steps = [
            ("Imputer", SimpleImputer()),
            ("scaler", StandardScaler())
        ]
)

In [55]:
categorical_pipeline = Pipeline(

        steps = [
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ordinalencoder", OrdinalEncoder(categories = [cut_categories, color_categories, clarity_categories]))
        ]
)

In [56]:
preprocessor = ColumnTransformer(
    [
        ("numerical_pipeline", numerical_pipeline, numerical_X_features),
        ("categorical_pipeline", categorical_pipeline, categorical_X_features)
    ]
)

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

In [58]:
preprocessor.fit_transform(X_train)

array([[-1.06044536,  0.90546103, -0.63974199, ...,  4.        ,
         1.        ,  6.        ],
       [-0.15298696,  0.07463378,  0.92396069, ...,  3.        ,
         2.        ,  2.        ],
       [ 0.45198531,  1.92091656, -0.63974199, ...,  1.        ,
         5.        ,  2.        ],
       ...,
       [-0.58511   , -1.31007831,  3.53013183, ...,  1.        ,
         1.        ,  2.        ],
       [-0.47707924,  0.44389034, -1.68221045, ...,  4.        ,
         5.        ,  4.        ],
       [ 0.71125914, -0.20230864, -0.63974199, ...,  4.        ,
         2.        ,  6.        ]])