In [1]:
import pandas as pd

Model Training

In [2]:
# Load the diamonds dataset from the CSV file under data/ into a DataFrame.
df=pd.read_csv("data/diamonds.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# "Unnamed: 0" is a leftover row counter from the CSV, not a feature; remove it.
df = df.drop(columns=["Unnamed: 0"])

In [5]:
# Display the frame after the "Unnamed: 0" column has been dropped.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [7]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

146

In [8]:
# Keep the first occurrence of every duplicated row and discard the repeats.
# The surviving rows keep their original index labels (no reset_index here).
df = df.drop_duplicates(keep="first")

In [9]:
# Re-count duplicates to confirm none remain after drop_duplicates().
df.duplicated().sum()

0

In [10]:
# Separate the predictors from the target.
# Y is kept as a single-column DataFrame (not a Series).
X = df.drop(columns=["price"])
Y = df.loc[:, ["price"]]

In [11]:
# Display the feature frame (every column except price).
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [12]:
# Display the target frame (the price column only).
Y

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335
...,...
53935,2757
53936,2757
53937,2757
53938,2757


In [13]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [14]:
# Level orderings for the three categorical features. These lists are passed to
# OrdinalEncoder(categories=...), so a label's position in its list becomes its
# integer code.
cut_categories = "Fair|Good|Very Good|Premium|Ideal".split("|")
color_categories = list("DEFGHIJ")
clarity_categories = "I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF".split()

In [17]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

In [19]:
# Numerical features: fill missing values with the column median, then
# standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# One ordering per categorical column, listed in the order the encoder will
# see the columns (cut, color, clarity).
ordinal_levels = [cut_categories, color_categories, clarity_categories]

# Categorical features: fill missing values with the most frequent label, map
# labels to ordinal codes, then standardise those codes.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder", OrdinalEncoder(categories=ordinal_levels)),
        ("scaler", StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline; the transformer names
# become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_cols),
        ("cat_pipeline", cat_pipeline, categorical_cols),
    ]
)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [21]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split so no test statistics leak into training.
# NOTE: X_train / X_test are overwritten, so this cell is not safe to re-run
# without re-running the split first.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

In [22]:
# Display the preprocessed training features.
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.051584,-0.033284,0.243924,-1.284193,-1.278314,-1.277823,0.980476,-0.937929,-0.639483
1,0.429992,1.435995,0.691446,0.465214,0.525171,0.681416,-0.810878,-0.937929,-0.029483
2,0.916795,0.036682,-0.651121,1.036449,0.989434,1.012758,0.980476,-0.349957,1.190516
3,-1.030418,0.876270,-0.203599,-1.284193,-1.260458,-1.191386,-0.810878,-0.937929,-0.639483
4,0.218338,1.016201,-0.651121,0.375959,0.346608,0.494136,-0.810878,1.413958,-0.639483
...,...,...,...,...,...,...,...,...,...
37650,-0.818765,1.226098,-1.546165,-0.918245,-0.876548,-0.773607,-1.706556,2.001930,2.410515
37651,-1.030418,0.876270,-0.203599,-1.230639,-1.260458,-1.162574,0.084799,0.238015,-1.249483
37652,0.239503,0.946236,-0.203599,0.295629,0.346608,0.436511,-1.706556,2.001930,-1.859482
37653,0.472322,0.316545,1.586491,0.608023,0.668021,0.681416,-1.706556,2.001930,-1.249483


In [23]:
# Display the preprocessed test features.
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.030418,1.785824,-1.098643,-1.284193,-1.305098,-1.133761,-1.706556,1.413958,-0.639483
1,-0.183804,0.946236,-0.203599,-0.070319,-0.108727,0.018732,-0.810878,0.825987,-0.639483
2,-0.522450,-0.313146,-0.651121,-0.391638,-0.349787,-0.399047,0.980476,1.413958,-0.639483
3,1.636417,-1.082769,1.138968,1.571982,1.516195,1.372913,0.084799,-0.349957,-1.249483
4,0.556984,-0.033284,-0.651121,0.759757,0.730518,0.739041,0.980476,0.825987,-0.639483
...,...,...,...,...,...,...,...,...,...
16134,1.678748,-1.642494,1.586491,1.688014,1.632261,1.401725,0.084799,-1.525900,-1.249483
16135,-0.204969,1.086167,-0.651121,-0.079244,-0.028374,0.076357,-1.706556,-0.349957,1.800515
16136,0.726307,-0.662975,0.243924,0.947194,0.900153,0.825478,0.084799,-0.349957,-1.249483
16137,-0.522450,-1.152734,0.243924,-0.347010,-0.385500,-0.485484,0.084799,-0.349957,-0.639483


In [24]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [25]:
# Baseline: ordinary least squares fitted on the preprocessed training data.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [26]:
# Intercept term of the fitted baseline model.
regressor.intercept_

array([3928.71730182])

In [27]:
# Coefficients of the fitted baseline model, one per input feature column.
regressor.coef_

array([[ 5160.67602012,  -121.55709618,   -63.07364595, -1168.73269142,
          145.14252022,    -6.80040089,   136.53534809,  -546.34323226,
          815.72120944]])

In [29]:
import numpy as np
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        A ``(mae, rmse, r2_square)`` tuple: mean absolute error, root mean
        squared error and the R^2 score, each as a plain scalar.
    """
    # No trailing commas after these calls: `x = f(),` binds a 1-tuple, which
    # previously made MAE print as `(805.9,)` and turned RMSE into a
    # one-element array.
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [36]:
# Candidate regressors to compare, all with default hyper-parameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

# Parallel lists filled in the same order as `models`:
# fitted estimators, their names, and their R2 scores on the test split.
trained_model_list=[]
model_list=[]
r2_list=[]

for model_name,model in models.items():
    # Fit on the preprocessed training split and keep the fitted estimator
    # (this list was declared but never populated before).
    model.fit(X_train,y_train)
    trained_model_list.append(model)

    # Score on the held-out test split.
    y_pred=model.predict(X_test)
    mae,rmse,r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics below come from X_test / y_test, so label them as test
    # performance rather than training performance.
    print('Model Test Performance')
    print('RMSE :- ',rmse)
    print("MAE :- ",mae)
    print("R2 score",r2_square)

    r2_list.append(r2_square)

    print('-'*35)
    print("\n")










LinearRegression
Model Training Performance
RMSE :-  [1228.33742334]
MAE :-  (805.9253773190505,)
R2 score 0.9051115842202947
-----------------------------------




Lasso
Model Training Performance
RMSE :-  [1227.17334886]
MAE :-  (806.8005526586711,)
R2 score 0.9052913472750835
-----------------------------------


Ridge
Model Training Performance
RMSE :-  [1228.28666852]
MAE :-  (806.0188660784271,)
R2 score 0.9051194256230943
-----------------------------------


ElasticNet
Model Training Performance
RMSE :-  [1642.54023995]
MAE :-  (1078.1266417701347,)
R2 score 0.8303281228188145
-----------------------------------


