## Model Training

In [56]:
import pandas as pd

In [57]:
# Load the full gemstone dataset, then keep a random subset to work with.
df1=pd.read_csv("data/gemstone.csv")
# random_state pins the subset so a fresh "Restart & Run All" draws the same
# rows (and therefore reproduces the same split and metrics further down).
# min(...) keeps the cell working if the CSV holds fewer than 20000 rows.
df=df1.sample(n=min(20000,len(df1)),random_state=42)
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
184396,184396,0.26,Very Good,E,VVS1,63.0,59.0,4.00,4.04,2.54,506
117785,117785,0.60,Good,F,SI2,63.7,61.0,5.25,5.28,3.35,1058
164524,164524,0.90,Very Good,E,SI1,62.2,61.0,6.11,6.06,3.79,4064
4849,4849,1.21,Ideal,G,SI2,62.8,57.0,6.79,6.75,4.24,5491
35348,35348,1.06,Very Good,G,VS2,59.9,56.0,6.58,6.62,3.95,6279
...,...,...,...,...,...,...,...,...,...,...,...
2202,2202,0.85,Good,E,VS2,59.9,57.0,6.13,6.19,3.68,4372
112610,112610,0.91,Good,D,SI1,63.6,57.0,6.10,6.07,3.88,4229
43161,43161,0.31,Ideal,I,SI1,61.2,54.0,4.38,4.41,2.69,443
60229,60229,2.12,Ideal,H,SI1,62.2,55.0,8.20,8.27,5.12,18275


In [58]:
# 'id' is dropped before modelling so it is not used as a feature.
# NOTE: this reassigns df, so re-running the cell on its own raises KeyError.
df=df.drop(columns=['id'])

In [59]:
# Re-display the frame to confirm the 'id' column is gone.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
184396,0.26,Very Good,E,VVS1,63.0,59.0,4.00,4.04,2.54,506
117785,0.60,Good,F,SI2,63.7,61.0,5.25,5.28,3.35,1058
164524,0.90,Very Good,E,SI1,62.2,61.0,6.11,6.06,3.79,4064
4849,1.21,Ideal,G,SI2,62.8,57.0,6.79,6.75,4.24,5491
35348,1.06,Very Good,G,VS2,59.9,56.0,6.58,6.62,3.95,6279
...,...,...,...,...,...,...,...,...,...,...
2202,0.85,Good,E,VS2,59.9,57.0,6.13,6.19,3.68,4372
112610,0.91,Good,D,SI1,63.6,57.0,6.10,6.07,3.88,4229
43161,0.31,Ideal,I,SI1,61.2,54.0,4.38,4.41,2.69,443
60229,2.12,Ideal,H,SI1,62.2,55.0,8.20,8.27,5.12,18275


In [60]:
# Independent (feature) and dependent (target) variables.
# Y is kept as a one-column DataFrame rather than a Series.
X=df.drop(columns=['price'])
Y=df.loc[:,['price']]

In [61]:
# Split the feature names by dtype: object columns go to the categorical
# pipeline, everything else to the numerical pipeline.
categorical_cols=X.select_dtypes(include=['object']).columns
numerical_cols=X.select_dtypes(exclude=['object']).columns

In [62]:
# Custom rank order for each ordinal feature; the position in each list is the
# integer code the OrdinalEncoder assigns (first entry -> 0).
# NOTE(review): cut and clarity appear to run from lowest to highest grade,
# while color runs D -> J, which in usual diamond grading is best -> worst.
# That only flips the sign of a linear coefficient, but confirm it is intended.
cut_categories=[
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
clarity_categories=[
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]
color_categories=[
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]

In [63]:
from sklearn.impute import SimpleImputer            #handle missing values
from sklearn.preprocessing import StandardScaler   #handle feature scaling
from sklearn.preprocessing import OrdinalEncoder   # if categorical data has ranks in it we use ordinal encoding otherwise OneHotEncoding
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### numerical pipeline

In [64]:
# Numerical features: fill missing values with the column median, then
# standardise each column.
num_imputer=SimpleImputer(strategy='median')
num_scaler=StandardScaler()
num_pipeline=Pipeline(steps=[('imputer',num_imputer),('scaler',num_scaler)])

### categorical pipeline

In [65]:
# Categorical features: fill missing values with the most frequent label,
# map labels to their rank, then standardise the resulting codes.
# The `categories` lists are positional: they must be given in the same order
# as the columns this pipeline receives (cut, color, clarity in this frame).
ordinal_encoder=OrdinalEncoder(
    categories=[cut_categories,color_categories,clarity_categories]
)
cat_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',ordinal_encoder),
    ('scaler',StandardScaler()),
])

### joining numerical and categorical pipeline

In [71]:
# Route each column group through its own pipeline; the outputs are
# concatenated in the order listed here (numerical first, then categorical).
preprocessor=ColumnTransformer(
    transformers=[
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols),
    ]
)

### Train-Test split

In [70]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable for a given input frame.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [72]:
# Fit the preprocessor on the training split only, then wrap the transformed
# array back into a DataFrame labelled with the generated feature names.
# NOTE: X_train is overwritten, so this cell is not safe to re-run on its own.
X_train_transformed=preprocessor.fit_transform(X_train)
X_train=pd.DataFrame(
    X_train_transformed,
    columns=preprocessor.get_feature_names_out(),
)

In [73]:
# Apply the already-fitted preprocessor to the test split (transform only, no
# refitting) and label the columns with the generated feature names.
# NOTE: X_test is overwritten, so this cell is not safe to re-run on its own.
X_test_transformed=preprocessor.transform(X_test)
X_test=pd.DataFrame(
    X_test_transformed,
    columns=preprocessor.get_feature_names_out(),
)

In [74]:
# Peek at the first rows of the preprocessed training features.
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.626054,-1.315354,-0.115515,-0.498609,-0.487373,-0.586439,0.870156,-0.312061,0.025997
1,-0.820583,0.435213,-1.15076,-0.832465,-0.868826,-0.819468,0.870156,-1.546026,-1.984822
2,0.735651,0.435213,-0.633137,0.872908,0.83863,0.899125,0.870156,1.538886,-0.644276
3,0.454664,0.711619,1.437353,0.62026,0.566164,0.651531,-0.134477,-1.546026,-1.314549
4,-1.036727,0.988024,0.402108,-1.256553,-1.286608,-1.183577,-1.139111,1.538886,0.025997


### Model training

In [76]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [78]:
# Baseline: ordinary least squares fitted on the preprocessed training split.
regression=LinearRegression()
regression.fit(X=X_train,y=y_train)

In [79]:
# Fitted coefficients of the baseline model, one per column of the
# preprocessed X_train.
regression.coef_

array([[ 6389.33199352,  -128.9540756 ,   -47.4466781 , -2356.14086117,
           63.15220757,    46.14138   ,    85.42294269,  -470.72293107,
          656.34543402]])

In [80]:
# Fitted intercept (bias) term of the baseline model.
regression.intercept_

array([3948.9085])

In [86]:
import numpy as np
def evaluate_model(true,predicted):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R^2 score, in that order. R^2 is returned as produced
        by ``r2_score``; callers scale it themselves if they want a percentage.
    """
    mae=mean_absolute_error(true,predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2_square=r2_score(true,predicted)
    return mae,rmse,r2_square

### Training multiple models

In [89]:
# Candidate regressors, all with their default hyper-parameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}
# Parallel lists: model_list[k] is the name whose R^2 score is r2_list[k].
model_list=[]
r2_list=[]


# Fit each candidate on the training split and score it on the held-out
# test split.
for model_name,model in models.items():
    model.fit(X_train,y_train)
    #making prediction
    y_pred=model.predict(X_test)
    mae,rmse,r2_square=evaluate_model(y_test,y_pred)
    print(model_name)
    model_list.append(model_name)
    # NOTE: the label below says "Training", but the numbers are computed on
    # X_test / y_test, i.e. they are test-split metrics. The text is left
    # unchanged so it still matches the recorded cell output.
    print("Model Training Performance")
    print("RMSE: ",rmse)
    print("MAE: ",mae)
    print("R2_SCORE: ",r2_square*100)
    # Store the computed score; the original appended the r2_score function
    # object itself, leaving r2_list without any numeric scores.
    r2_list.append(r2_square)
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE:  1004.5963818022544
MAE:  671.2294366813018
R2_SCORE:  93.63833183354775


Lasso
Model Training Performance
RMSE:  1004.4434530723522
MAE:  672.1611580436492
R2_SCORE:  93.64026854723684


Ridge
Model Training Performance
RMSE:  1004.3970102242563
MAE:  671.3773366304135
R2_SCORE:  93.64085664846337


ElasticNet
Model Training Performance
RMSE:  1525.0267465457912
MAE:  1059.6731536864113
R2_SCORE:  85.33970485196635


