# Diamond Data preprocessing using Pipeline and ColumnTransformer

In [21]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler , PowerTransformer , OneHotEncoder , OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [22]:
# Location of the raw diamonds CSV, held in one named constant so it is easy to find.
# NOTE(review): this is an absolute, machine-specific path; the notebook will only run
# elsewhere after this constant is edited.
DIAMONDS_CSV_PATH = r"C:\Users\user\Downloads\diamonds.csv\diamonds.csv"

diamond_data = pd.read_csv(DIAMONDS_CSV_PATH)
diamond_data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [23]:
# Preview the first rows of the raw frame (head() shows five rows by default).
diamond_data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [24]:
# Drop the unnamed column; in the preview above it appears to be just a 1-based row counter.
# The result is rebound (no inplace=True) and errors='ignore' is passed so that this cell
# is idempotent: running it a second time no longer raises KeyError for the missing column.

diamond_data = diamond_data.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [25]:

# (rows, columns) of the frame after the unnamed column has been dropped.
diamond_data.shape

(53940, 10)

In [26]:
# Per-column dtype and non-null count, plus the frame's memory usage.
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [27]:

# Count the missing (NaN) values in each column.

diamond_data.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [28]:
# Basic summary statistics of the numeric columns, transposed so each row is one column.
# NOTE(review): the table below reports a minimum of 0.0 for x, y and z. If these are
# physical dimensions, a zero is presumably a stand-in for a missing value - confirm,
# because isna() does not flag it and the imputers used later only act on NaN by default.

diamond_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
carat,53940.0,0.79794,0.474011,0.2,0.4,0.7,1.04,5.01
depth,53940.0,61.749405,1.432621,43.0,61.0,61.8,62.5,79.0
table,53940.0,57.457184,2.234491,43.0,56.0,57.0,59.0,95.0
price,53940.0,3932.799722,3989.439738,326.0,950.0,2401.0,5324.25,18823.0
x,53940.0,5.731157,1.121761,0.0,4.71,5.7,6.54,10.74
y,53940.0,5.734526,1.142135,0.0,4.72,5.71,6.54,58.9
z,53940.0,3.538734,0.705699,0.0,2.91,3.53,4.04,31.8


# Splitting The Data Into Train And Test

In [29]:
# Hold out 20% of the rows for testing. 'price' is the target; every other column is a
# feature. random_state is fixed so the split is reproducible from run to run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    diamond_data.drop(columns='price'),
    diamond_data['price'],
    test_size=0.20,
    random_state=100,
)

In [30]:
# Sanity check: each feature frame should have as many rows as its target vector.
X_Train.shape , X_Test.shape , len(Y_Train) , len(Y_Test)

((43152, 9), (10788, 9), 43152, 10788)

In [31]:
# Column names, for reference when grouping them by type in the next cell.
diamond_data.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [32]:

# Feature columns grouped by the preprocessing branch each one is sent through.
# Ordered category: encoded as a single integer column.
X_Train_Odinal_catg = ['cut']
# Unordered categories: one-hot encoded.
X_Train_Nominal_catg = ['color', 'clarity']
# Numeric measurements: imputed, scaled and power-transformed.
X_Train_continous = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Continuous Columns Pipeline

In [33]:

# Numeric branch, applied in this order:
#   1. fill missing values with the column median,
#   2. scale with RobustScaler (default settings),
#   3. apply PowerTransformer (default settings).
cont_pipeline = Pipeline(
    steps=[
        ('SimpleImputer', SimpleImputer(strategy='median')),
        ('RobustScaler', RobustScaler()),
        ('PowerTransformer', PowerTransformer()),
    ]
)

# Categorical Ordinal Column Pipeline

In [34]:
# Share of rows in each 'cut' level (normalize=True returns fractions instead of counts).
diamond_data['cut'].value_counts(normalize = True)

Ideal        0.399537
Premium      0.255673
Very Good    0.223990
Good         0.090953
Fair         0.029848
Name: cut, dtype: float64

In [37]:
# Ordinal branch: fill missing values with the most frequent level, then map each level
# to its position in the explicit list below (Ideal -> 0, ..., Fair -> 4).
# NOTE(review): this list is in the same order as the value_counts output above; it is
# presumably also meant as a best-to-worst quality ranking - confirm that is intended.
cut_levels = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

cat_pipeline_ordinal = Pipeline(
    steps=[
        ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder', OrdinalEncoder(categories=[cut_levels])),
    ]
)

# Categorical Nominal Column Pipeline

In [38]:
# Nominal branch: fill missing values with the most frequent level, then one-hot encode.
# drop='first' omits the first category of every feature, and sparse_output=False makes
# the encoder return a dense array.
cat_pipeline_nominal = Pipeline(
    steps=[
        ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
        ('OneHotEncoder', OneHotEncoder(sparse_output=False, drop='first')),
    ]
)

# ColumnTransformer To Combine All The Pipelines


In [39]:

# Route each column group to its own branch. Output columns follow the order in which
# the branches are listed here: ordinal, then nominal, then continuous.
branch_specs = [
    ('cat_pipeline_ordinal', cat_pipeline_ordinal, X_Train_Odinal_catg),
    ('cat_pipeline_nominal', cat_pipeline_nominal, X_Train_Nominal_catg),
    ('cont_pipeline', cont_pipeline, X_Train_continous),
]

# remainder='passthrough' forwards any column not named in a branch unchanged.
pre_col_Transformer = ColumnTransformer(
    transformers=branch_specs,
    remainder='passthrough',
)

# One Final Pipeline

In [40]:
# Wrap the column transformer in a one-step Pipeline so that the whole preprocessing
# stage is driven through a single object.
final_pipeline = Pipeline(steps=[('pre_col_Transformer', pre_col_Transformer)])

# Train Data Preprocessing

In [41]:
# Fit every preprocessing step on the training split and display the transformed result.
final_pipeline.fit_transform(X_Train)   # array output of the train data

array([[ 1.        ,  0.        ,  0.        , ...,  1.91252925,
         1.83624526,  1.78259616],
       [ 0.        ,  0.        ,  0.        , ...,  0.23735783,
         0.31296124,  0.27883258],
       [ 0.        ,  0.        ,  0.        , ..., -0.0339054 ,
         0.00840006, -0.06939276],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.54065231,
         0.48842032,  0.49722525],
       [ 1.        ,  0.        ,  0.        , ...,  1.04144837,
         1.01317964,  1.07851817],
       [ 0.        ,  0.        ,  0.        , ..., -1.14426046,
        -1.20858117, -1.12102348]])

In [42]:

# Pandas version of the transformed training data.
# The pipeline is fitted on X_Train here; the resulting array is then labelled with the
# generated feature names and the original row index so rows still line up with Y_Train.

train_matrix = final_pipeline.fit_transform(X_Train)
X_Train_processed = pd.DataFrame(
    train_matrix,
    columns=final_pipeline.get_feature_names_out(),
    index=X_Train.index,
)

In [43]:
# Preview the processed training frame; column names carry the branch name as a prefix.
X_Train_processed.head()

Unnamed: 0,cat_pipeline_ordinal__cut,cat_pipeline_nominal__color_E,cat_pipeline_nominal__color_F,cat_pipeline_nominal__color_G,cat_pipeline_nominal__color_H,cat_pipeline_nominal__color_I,cat_pipeline_nominal__color_J,cat_pipeline_nominal__clarity_IF,cat_pipeline_nominal__clarity_SI1,cat_pipeline_nominal__clarity_SI2,cat_pipeline_nominal__clarity_VS1,cat_pipeline_nominal__clarity_VS2,cat_pipeline_nominal__clarity_VVS1,cat_pipeline_nominal__clarity_VVS2,cont_pipeline__carat,cont_pipeline__depth,cont_pipeline__table,cont_pipeline__x,cont_pipeline__y,cont_pipeline__z
27187,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.872687,-1.081114,0.763221,1.912529,1.836245,1.782596
3118,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.264667,0.158309,-0.662113,0.237358,0.312961,0.278833
49238,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.119152,-0.466062,-1.180787,-0.033905,0.0084,-0.069393
53575,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.045189,0.651916,-0.607288,0.104072,0.0271,0.122581
29795,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.339541,-0.120638,0.358524,-1.362183,-1.4256,-1.371937


# Test Data Preprocessing

In [44]:
# transform() only (no fit): the test split is processed with what was fitted on X_Train.
final_pipeline.transform(X_Test)  # array output of the test data

array([[ 0.        ,  1.        ,  0.        , ..., -0.25452028,
        -0.27365404, -0.30064513],
       [ 0.        ,  0.        ,  0.        , ...,  0.9378924 ,
         0.98458125,  0.93261927],
       [ 0.        ,  0.        ,  0.        , ..., -0.5577705 ,
        -0.52187576, -0.42801054],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.08595299,
         0.01776223,  0.12258051],
       [ 1.        ,  1.        ,  0.        , ...,  0.47508878,
         0.46387557,  0.27883258],
       [ 1.        ,  0.        ,  0.        , ...,  0.66895839,
         0.64800224,  0.52385557]])

In [45]:
# Pandas version of the transformed test data.
# Only transform() is called, so the statistics fitted on the training split are reused.

test_matrix = final_pipeline.transform(X_Test)
X_Test_processed = pd.DataFrame(
    test_matrix,
    columns=final_pipeline.get_feature_names_out(),
    index=X_Test.index,
)

In [46]:
# Preview the processed test frame; it has the same columns as the processed train frame.
X_Test_processed.head()

Unnamed: 0,cat_pipeline_ordinal__cut,cat_pipeline_nominal__color_E,cat_pipeline_nominal__color_F,cat_pipeline_nominal__color_G,cat_pipeline_nominal__color_H,cat_pipeline_nominal__color_I,cat_pipeline_nominal__color_J,cat_pipeline_nominal__clarity_IF,cat_pipeline_nominal__clarity_SI1,cat_pipeline_nominal__clarity_SI2,cat_pipeline_nominal__clarity_VS1,cat_pipeline_nominal__clarity_VS2,cat_pipeline_nominal__clarity_VVS1,cat_pipeline_nominal__clarity_VVS2,cont_pipeline__carat,cont_pipeline__depth,cont_pipeline__table,cont_pipeline__x,cont_pipeline__y,cont_pipeline__z
52264,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.330152,-0.189985,-0.092975,-0.25452,-0.273654,-0.300645
21073,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.942294,-0.189985,-1.180787,0.937892,0.984581,0.932619
42161,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.527575,1.007773,0.358524,-0.55777,-0.521876,-0.428011
35974,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.852471,-0.808669,-0.607288,-0.790641,-0.749498,-0.860004
7641,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287305,0.581044,0.358524,0.246087,0.295786,0.320543


# DIAGRAM PIPELINE

In [47]:
# Bare expression: the notebook renders the pipeline object's representation
# (presumably scikit-learn's diagram view, as the heading suggests).
final_pipeline