In [1]:
import pandas as pd

In [2]:
# Load the raw gemstone dataset. read_csv already returns a DataFrame, so the
# extra pd.DataFrame(...) wrapper is unnecessary.
df1 = pd.read_csv(r"/content/gemstone.csv")

In [3]:
# Preview the leading rows to check column names and value formats.
df1.head(n=11)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619.0
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387.0
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772.0
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666.0
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453.0
5,5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506.0
6,6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229.0
7,7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224.0
8,8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886.0
9,9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421.0


In [4]:
# Remove row-identifier columns so they are not treated as features.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once 'id' is
# already gone.
# NOTE(review): the preview above lists an 'Unnamed: 0' column that mirrors
# 'id'. If it is a real column (a saved index) rather than just the rendered
# DataFrame index, it is dropped here as well — confirm against the CSV header.
df1 = df1.drop(columns=["id", "Unnamed: 0"], errors="ignore")

In [5]:
# Separate the regression target from the predictors.
Y = df1["price"]               # target column
X = df1.drop(columns="price")  # every remaining column is a feature

In [6]:
# Partition the feature names by dtype: object-dtype columns are handled as
# categorical, every other column as numerical.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [7]:
# Display which feature columns were classified as categorical.
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
# Frequency of each cut grade (value_counts lists the most common first).
df1.loc[:, "cut"].value_counts()

cut
Ideal        56127
Premium      30054
Very Good    22780
Good          7057
Fair          1198
Name: count, dtype: int64

In [9]:
# Frequency of each color grade, re-sorted so the rarest grade comes first.
(
    df1.loc[:, "color"]
    .value_counts()
    .sort_values()
)

color
J     3855
I    10632
D    14780
H    18780
F    20756
E    21663
G    26750
Name: count, dtype: int64

In [10]:

# Frequency of each clarity grade, re-sorted so the rarest grade comes first.
(
    df1.loc[:, "clarity"]
    .value_counts()
    .sort_values()
)

clarity
I1        313
IF       2600
VVS1     6427
VVS2     9572
SI2     18365
VS1     18609
VS2     29069
SI1     32261
Name: count, dtype: int64

In [11]:
# Explicit level orderings handed to the ordinal encoder; a level's position
# in its list becomes its encoded rank.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
# Color grades are single letters, listed alphabetically from D through J.
color_categories = list("DEFGHIJ")
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

In [12]:
from sklearn.impute import SimpleImputer # Fills in missing values
from sklearn.preprocessing import StandardScaler # Feature scaling (used below for both numeric and encoded categorical columns)
from sklearn.preprocessing import OrdinalEncoder # Encodes categorical levels as numeric ranks in a given order
# Pipeline utilities
from sklearn.pipeline import Pipeline # Chains preprocessing steps into a single estimator
from sklearn.compose import ColumnTransformer # Applies a separate pipeline to each group of columns

In [13]:
# Numerical features: fill missing values with the column median, then scale.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Column name -> ordered levels. Looking each ordering up by column name
# (rather than relying on list position) keeps every ordering attached to the
# right column even if the order of categorical_columns changes.
category_orders = {
    "cut": cut_categories,
    "color": color_categories,
    "clarity": clarity_categories,
}

# Categorical features: fill missing values with the most frequent level,
# encode levels as ordinal ranks, then scale the resulting codes.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "OrdinalEncoder",
            OrdinalEncoder(
                # One ordering per column, in the same order as the columns
                # that the ColumnTransformer below passes to this pipeline.
                categories=[category_orders[col] for col in categorical_columns]
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its own pipeline. With the default
# remainder setting, columns not listed in either group are dropped.
preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)