In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
diamond_df = pd.read_csv(r"C:\Data\diamonds.csv")
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [43]:
diamond_df.shape

(53940, 10)

In [44]:
diamond_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [45]:
diamond_df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [46]:
diamond_df.cut.unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [47]:
diamond_df.color.unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [48]:
diamond_df.clarity.unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

## 1.Data Prepration 

In [49]:
# Identifying the inputs (X) and output (y)

y = diamond_df['price']

X = diamond_df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

### Seprating the numerical & categorical data from X_train

In [50]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
26250,1.63,Ideal,G,VS1,61.7,55.0,7.56,7.6,4.68
31510,0.34,Ideal,G,VS2,62.2,57.0,4.47,4.44,2.77
40698,0.4,Ideal,E,VVS2,61.7,56.0,4.73,4.77,2.93
42634,0.58,Premium,H,SI1,62.1,55.0,5.38,5.35,3.33
47714,0.63,Very Good,D,SI1,62.8,57.0,5.4,5.46,3.41


In [51]:
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [52]:
X_train_cate = X_train.select_dtypes(include=['object'])

X_train_cate.head()

Unnamed: 0,cut,color,clarity
26250,Ideal,G,VS1
31510,Ideal,G,VS2
40698,Ideal,E,VVS2
42634,Premium,H,SI1
47714,Very Good,D,SI1


In [53]:
X_train_num = X_train.select_dtypes(include=['float64','int64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
26250,1.63,61.7,55.0,7.56,7.6,4.68
31510,0.34,62.2,57.0,4.47,4.44,2.77
40698,0.4,61.7,56.0,4.73,4.77,2.93
42634,0.58,62.1,55.0,5.38,5.35,3.33
47714,0.63,62.8,57.0,5.4,5.46,3.41


In [54]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
26250,1.752033,-0.036583,-1.100099,1.628446,1.621835,1.609092
31510,-0.965068,0.312362,-0.203888,-1.123365,-1.126106,-1.084595
40698,-0.838691,-0.036583,-0.651994,-0.891821,-0.839137,-0.858946
42634,-0.459561,0.242573,-1.100099,-0.312961,-0.334768,-0.294823
47714,-0.354247,0.731095,-0.203888,-0.29515,-0.239112,-0.181999


In [55]:
X_train_num.describe()

Unnamed: 0,carat,depth,table,x,y,z
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,0.798186,61.752419,57.455001,5.731423,5.734968,3.539049
std,0.474776,1.432909,2.231643,1.12291,1.149965,0.709074
min,0.2,43.0,43.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,5.7,5.71,3.52
75%,1.04,62.5,59.0,6.54,6.54,4.04
max,5.01,79.0,79.0,10.74,58.9,31.8


In [56]:
print("Number of Numerical Features:", scaler.n_features_in_)
print("Mean of each column:", scaler.mean_)
print("Std of each column:", np.sqrt(scaler.var_))

Number of Numerical Features: 6
Mean of each column: [ 0.79818618 61.75241935 57.45500093  5.73142334  5.73496756  3.53904894]
Std of each column: [0.47477076 1.43289254 2.23161764 1.12289688 1.1499519  0.70906532]


### Applying OneHotEncoding on Categorical Columns

In [58]:
X_train_cate['cut'].value_counts(normalize=True)

Ideal        0.400723
Premium      0.254936
Very Good    0.223535
Good         0.090911
Fair         0.029894
Name: cut, dtype: float64

In [60]:
X_train_cate['color'].value_counts(normalize=True)

G    0.206572
E    0.181938
F    0.179482
H    0.153481
D    0.126298
I    0.100204
J    0.052025
Name: color, dtype: float64

In [62]:
X_train_cate['clarity'].value_counts(normalize=True)

SI1     0.242329
VS2     0.226826
SI2     0.170096
VS1     0.150816
VVS2    0.094712
VVS1    0.067737
IF      0.033463
I1      0.014020
Name: clarity, dtype: float64

In [69]:
from sklearn.preprocessing import OneHotEncoder

encoder_ = OneHotEncoder(drop='first', sparse=False)

X_train_cate_ohe = pd.DataFrame(encoder_.fit_transform(X_train_cate), 
                               columns=encoder_.get_feature_names_out(X_train_cate.columns), 
                               index = X_train_cate.index)

X_train_cate_ohe.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
26250,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31510,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
40698,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
42634,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
47714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [70]:
encoder_.categories_

[array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
 array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
 array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
       dtype=object)]

In [72]:
encoder_infrequent_categories_

NameError: name 'encoder_infrequent_categories_' is not defined

In [74]:
# OneHotEncoding the categorical features

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_cate_ohe = pd.DataFrame(encoder.fit_transform(X_train_cate), 
                               columns=encoder.get_feature_names_out(X_train_cate.columns), 
                               index = X_train_cate.index)

X_train_cate_ohe.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
26250,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
31510,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
40698,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
42634,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
47714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Applying Label Encoding on Categorical Columns

In [81]:
X_train_cat_le = pd.DataFrame(index=X_train_cate.index)

X_train_cat_le.head()

26250
31510
40698
42634
47714


In [79]:
X_train_cate.cut.unique()

array(['Ideal', 'Premium', 'Very Good', 'Good', 'Fair'], dtype=object)

In [83]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_cat_le['cut'] = X_train_cate['cut'].apply(lambda x : cut_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut
26250,4
31510,4
40698,4
42634,5
47714,3


In [85]:
X_train_cate.color.unique()

array(['G', 'E', 'H', 'D', 'F', 'I', 'J'], dtype=object)

In [87]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_cat_le['color'] = X_train_cate['color'].apply(lambda x : color_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color
26250,4,4
31510,4,4
40698,4,6
42634,5,3
47714,3,7


In [89]:
X_train_cate.clarity.unique()

array(['VS1', 'VS2', 'VVS2', 'SI1', 'IF', 'VVS1', 'SI2', 'I1'],
      dtype=object)

In [90]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_cat_le['clarity'] = X_train_cate['clarity'].apply(lambda x : clarity_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
26250,4,4,5
31510,4,4,4
40698,4,6,6
42634,5,3,3
47714,3,7,3


### Concatinating the Encoded Categorical Features and Rescaled Numerical Features:

In [92]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
26250,1.752033,-0.036583,-1.100099,1.628446,1.621835,1.609092,4,4,5
31510,-0.965068,0.312362,-0.203888,-1.123365,-1.126106,-1.084595,4,4,4
40698,-0.838691,-0.036583,-0.651994,-0.891821,-0.839137,-0.858946,4,6,6
42634,-0.459561,0.242573,-1.100099,-0.312961,-0.334768,-0.294823,5,3,3
47714,-0.354247,0.731095,-0.203888,-0.29515,-0.239112,-0.181999,3,7,3


In [93]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
10176,1.1,Ideal,H,SI2,62.0,55.0,6.61,6.65,4.11
16083,1.29,Ideal,H,SI1,62.6,56.0,6.96,6.93,4.35
13420,1.2,Premium,I,SI1,61.1,58.0,6.88,6.8,4.18
20407,1.5,Ideal,F,SI1,60.9,56.0,7.43,7.36,4.5
8909,0.9,Very Good,F,VS2,61.7,57.0,6.17,6.21,3.82


In [94]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10788 entries, 10176 to 47950
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    10788 non-null  float64
 1   cut      10788 non-null  object 
 2   color    10788 non-null  object 
 3   clarity  10788 non-null  object 
 4   depth    10788 non-null  float64
 5   table    10788 non-null  float64
 6   x        10788 non-null  float64
 7   y        10788 non-null  float64
 8   z        10788 non-null  float64
dtypes: float64(6), object(3)
memory usage: 842.8+ KB


In [95]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
10176,Ideal,H,SI2
16083,Ideal,H,SI1
13420,Premium,I,SI1
20407,Ideal,F,SI1
8909,Very Good,F,VS2


In [96]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
10176,1.1,62.0,55.0,6.61,6.65,4.11
16083,1.29,62.6,56.0,6.96,6.93,4.35
13420,1.2,61.1,58.0,6.88,6.8,4.18
20407,1.5,60.9,56.0,7.43,7.36,4.5
8909,0.9,61.7,57.0,6.17,6.21,3.82


In [97]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
10176,0.635704,0.172784,-1.100099,0.78242,0.795714,0.805216
16083,1.035897,0.591517,-0.651994,1.094114,1.039202,1.14369
13420,0.846332,-0.455316,0.244217,1.022869,0.926154,0.903938
20407,1.478216,-0.594894,-0.651994,1.512674,1.413131,1.355236
8909,0.214448,-0.036583,-0.203888,0.390576,0.413089,0.396227


In [98]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

10176
16083
13420
20407
8909


In [99]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
10176,4,3,2
16083,4,3,3
13420,5,2,3
20407,4,5,3
8909,3,5,4


In [39]:
# split into train and test : Data Prerocessing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


In [40]:
# split into train and test : Data Prerocessing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40455, 9) (40455,)
(13485, 9) (13485,)


In [41]:
# split into train and test : Data Prerocessing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(43152, 9) (43152,)
(10788, 9) (10788,)
