In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.shape

(53940, 10)

In [4]:
y=df['price']
X=df[['carat','cut','color','clarity','depth','table','x','y','z']]

In [7]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,random_state=100)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(37758, 9) (16182, 9)
(37758,) (16182,)


In [9]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29477,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52542,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [10]:
X_train_numerical=df[['carat','depth','table','x','y','z']]
X_train_numerical.head()

#or

#X_rain_num = X_train.select_dtypes(include=['int64', 'float64'])
#y_rain_num.head()

Unnamed: 0,carat,depth,table,x,y,z
0,0.23,61.5,55.0,3.95,3.98,2.43
1,0.21,59.8,61.0,3.89,3.84,2.31
2,0.23,56.9,65.0,4.05,4.07,2.31
3,0.29,62.4,58.0,4.2,4.23,2.63
4,0.31,63.3,58.0,4.34,4.35,2.75


In [11]:
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [13]:
X_train_categorical=X_train.select_dtypes(include='object')
X_train_categorical.head()

Unnamed: 0,cut,color,clarity
29477,Ideal,F,VS1
52542,Very Good,I,VVS2
8725,Very Good,D,SI1
15577,Very Good,H,VVS2
19651,Premium,H,SI2


# Scaling the numerical feature

In [14]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_numerical_rescaled=pd.DataFrame(scaler.fit_transform(X_train_numerical), 
                                    columns = X_train_numerical.columns, 
                                    index = X_train_numerical.index)

X_train_numerical_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
0,-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129
1,-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175
2,-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175
3,-1.071587,0.454133,0.242928,-1.364971,-1.317305,-1.28772
4,-1.029394,1.082358,0.242928,-1.240167,-1.212238,-1.117674


In [15]:
X_train_numerical.describe()

Unnamed: 0,carat,depth,table,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,6.54,6.54,4.04
max,5.01,79.0,95.0,10.74,58.9,31.8


In [16]:
print("Number of Numerical Features:", scaler.n_features_in_)
print("Mean of each column:", scaler.mean_)
print("Std of each column:", np.sqrt(scaler.var_))

Number of Numerical Features: 6
Mean of each column: [ 0.79793975 61.74940489 57.45718391  5.73115721  5.73452595  3.53873378]
Std of each column: [0.47400685 1.43260804 2.23446985 1.12175035 1.14212409 0.70569231]


# d. Applying OneHotEncoding on Categorical Columns

In [24]:
X_train_categorical['cut'].value_counts(normalize=True)

Ideal        0.400339
Premium      0.254224
Very Good    0.224006
Good         0.092033
Fair         0.029398
Name: cut, dtype: float64

In [25]:
X_train_categorical['clarity'].value_counts(normalize=True)

SI1     0.242094
VS2     0.226283
SI2     0.169765
VS1     0.152153
VVS2    0.095212
VVS1    0.068515
IF      0.032602
I1      0.013375
Name: clarity, dtype: float64

In [26]:
X_train_categorical['color'].value_counts(normalize=True)

G    0.210419
E    0.181736
F    0.178479
H    0.153583
D    0.122888
I    0.100244
J    0.052651
Name: color, dtype: float64

In [28]:
from sklearn.preprocessing import OneHotEncoder
encoder_=OneHotEncoder(drop='first', min_frequency=3000, sparse=False)
X_train_categorical_ohe = pd.DataFrame(encoder_.fit_transform(X_train_categorical), 
                               columns=encoder_.get_feature_names_out(X_train_categorical.columns), 
                               index = X_train_categorical.index)

X_train_categorical_ohe.head()



Unnamed: 0,cut_Ideal,cut_Premium,cut_Very Good,cut_infrequent_sklearn,color_E,color_F,color_G,color_H,color_I,color_infrequent_sklearn,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS2,clarity_infrequent_sklearn
29477,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52542,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8725,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15577,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19651,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
encoder_.categories_

[array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
 array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
 array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
       dtype=object)]

In [31]:
encoder_.infrequent_categories_

[array(['Fair'], dtype=object),
 array(['J'], dtype=object),
 array(['I1', 'IF', 'VVS1'], dtype=object)]

In [32]:
# OneHotEncoding the categorical features

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_categorical_ohe = pd.DataFrame(encoder.fit_transform(X_train_categorical), 
                               columns=encoder.get_feature_names_out(X_train_categorical.columns), 
                               index = X_train_categorical.index)

X_train_categorical_ohe.head()



Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
29477,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52542,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8725,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15577,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19651,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# e. Applying Label Encoding on Categorical Columns

In [33]:
X_train_categorical_le = pd.DataFrame(index=X_train_categorical.index)

X_train_categorical_le.head()

29477
52542
8725
15577
19651


In [34]:
X_train_categorical.cut.unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [35]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_categorical_le['cut'] = X_train_categorical['cut'].apply(lambda x : cut_encoder[x])

X_train_categorical_le.head()

Unnamed: 0,cut
29477,4
52542,3
8725,3
15577,3
19651,5


In [36]:
X_train_categorical.color.unique()

array(['F', 'I', 'D', 'H', 'G', 'E', 'J'], dtype=object)

In [37]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_categorical_le['color'] = X_train_categorical['color'].apply(lambda x : color_encoder[x])

X_train_categorical_le.head()

Unnamed: 0,cut,color
29477,4,5
52542,3,2
8725,3,7
15577,3,3
19651,5,3


In [38]:
X_train_categorical.clarity.unique()

array(['VS1', 'VVS2', 'SI1', 'SI2', 'VVS1', 'VS2', 'I1', 'IF'],
      dtype=object)

In [40]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_categorical_le['clarity'] = X_train_categorical['clarity'].apply(lambda x : clarity_encoder[x])

X_train_categorical_le.head()

Unnamed: 0,cut,color,clarity
29477,4,5,5
52542,3,2,6
8725,3,7,3
15577,3,3,6
19651,5,3,2


# f. Concatinating the Encoded Categorical Features and Rescaled Numerical Features:

In [41]:
X_train_transformed = pd.concat([X_train_numerical_rescaled, X_train_categorical_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-1.198168,-0.174092,-1.099672,-1.587837,-1.536196,-1.571129,4.0,6.0,2.0
1,-1.240361,-1.360738,1.585529,-1.641325,-1.658774,-1.741175,5.0,6.0,3.0
2,-1.198168,-3.385019,3.375663,-1.498691,-1.457395,-1.741175,2.0,6.0,5.0
3,-1.071587,0.454133,0.242928,-1.364971,-1.317305,-1.28772,,,
4,-1.029394,1.082358,0.242928,-1.240167,-1.212238,-1.117674,2.0,1.0,2.0


# g. Preparing Test Data

In [42]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
52264,0.57,Ideal,E,VS2,61.5,57.0,5.35,5.32,3.28
21073,1.16,Ideal,G,VS1,61.5,55.0,6.75,6.81,4.17
42161,0.51,Ideal,G,SI1,63.2,58.0,5.05,5.08,3.2
35974,0.42,Ideal,F,VS1,60.6,56.0,4.83,4.87,2.94
7641,0.8,Premium,G,IF,62.6,58.0,5.89,5.93,3.7


In [43]:
X_test.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [44]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
52264,Ideal,E,VS2
21073,Ideal,G,VS1
42161,Ideal,G,SI1
35974,Ideal,F,VS1
7641,Premium,G,IF


In [45]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.2
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.8,62.6,58.0,5.89,5.93,3.7


In [46]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,-0.480879,-0.174092,-0.204605,-0.339788,-0.362943,-0.366638
21073,0.763829,-0.174092,-1.099672,0.908262,0.941644,0.894535
42161,-0.607459,1.012555,0.242928,-0.607227,-0.573078,-0.480002
35974,-0.79733,-0.802316,-0.652139,-0.803349,-0.756946,-0.848435
7641,0.004346,0.593739,0.242928,0.141603,0.17115,0.228522


In [47]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

52264
21073
42161
35974
7641


In [48]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
52264,4,6,4
21073,4,4,5
42161,4,4,3
35974,4,5,5
7641,5,4,8


In [49]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_le], axis=1)

X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52264,-0.480879,-0.174092,-0.204605,-0.339788,-0.362943,-0.366638,4,6,4
21073,0.763829,-0.174092,-1.099672,0.908262,0.941644,0.894535,4,4,5
42161,-0.607459,1.012555,0.242928,-0.607227,-0.573078,-0.480002,4,4,3
35974,-0.79733,-0.802316,-0.652139,-0.803349,-0.756946,-0.848435,4,5,5
7641,0.004346,0.593739,0.242928,0.141603,0.17115,0.228522,5,4,8


# Linear Regression

In [50]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train_transformed, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [57]:
X_train_transformed.isnull().sum()

carat          0
depth          0
table          0
x              0
y              0
z              0
cut        16182
color      16182
clarity    16182
dtype: int64

In [58]:
import pandas as pd

# Assuming X_train_transformed is a DataFrame
# Fill missing values in 'cut' column with mode
X_train_transformed['cut'].fillna(X_train_transformed['cut'].mode().iloc[0], inplace=True)
X_train_transformed['color'].fillna(X_train_transformed['color'].mode().iloc[0], inplace=True)
X_train_transformed['clarity'].fillna(X_train_transformed['clarity'].mode().iloc[0], inplace=True)


In [59]:
X_train_transformed.isnull().sum()

carat      0
depth      0
table      0
x          0
y          0
z          0
cut        0
color      0
clarity    0
dtype: int64

In [60]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train_transformed, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [53940, 37758]