# Model Training 

In [1]:
import pandas as pd 


In [7]:
df = pd.read_csv('./dataset/gemstone.csv')

In [8]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
# Drop the row-identifier column; it carries no predictive signal.
# Note: this rebinds `df`, so re-running the cell after `id` is already gone
# raises a KeyError -- reload the CSV first in that case.
df = df.drop(columns=['id'])

In [10]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [78]:
# Separate the input features (X) from the target column (Y).
# Y is selected with a list so it stays a one-column DataFrame rather than a Series.
X = df.drop(columns=['price'])
Y = df[['price']]

In [79]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [80]:
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [81]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numerical. Both results are pandas Index
# objects and are used further down as the column selectors of the ColumnTransformer.
categorical_cols= X.select_dtypes(include="object").columns
numerical_cols= X.select_dtypes(exclude="object").columns

In [82]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [83]:
numerical_cols


Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [84]:
# Custom ranking for each ordinal variable.
# These lists are passed to OrdinalEncoder(categories=...): a value's position in
# its list becomes its integer code (first entry -> 0, second -> 1, ...).
# NOTE(review): cut and clarity appear to be ordered worst -> best, whereas on the
# usual D-to-J colour grading scale D is the top grade, so color_categories looks
# like it runs best -> worst. Confirm this direction is intended.

cut_categories= ['Fair','Good','Very Good','Premium','Ideal']
clarity_categories= ['I1','SI2','SI1','VS2', 'VS1','VVS2', 'VVS1', 'IF' ]
color_categories =['D', 'E','F','G','H','I','J']

In [85]:
# to fill the missing values with different strategies
from sklearn.impute import SimpleImputer
# scaling the values 
from sklearn.preprocessing import StandardScaler
# importing the ordinal ranking library 
from sklearn.preprocessing import OrdinalEncoder

## pipelines

from sklearn.pipeline import Pipeline 

# for grouping the pipeline
from sklearn.compose import ColumnTransformer

In [86]:
## Numerical pipeline 
# Step 1 fills missing values with the column median,
# step 2 standardises each column (StandardScaler).
num_pipeline= Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())

    ]
)

#categorical Pipeline 
# Step 1 fills missing values with the most frequent category,
# step 2 maps each category to its rank in the custom lists,
# step 3 standardises the resulting integer codes.
# The order of the lists given to `categories` must match the order of the
# columns this pipeline receives (categorical_cols: cut, color, clarity).
cat_pipeline= Pipeline(
    steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler', StandardScaler())

    ]

)

# Route each group of columns through its own pipeline. The transformer names
# ('num_pipeline', 'cat_pipeline') become the prefixes of the output feature
# names, e.g. 'num_pipeline__carat'.
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [87]:
## train test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test= train_test_split(X,Y,test_size=0.30, random_state=42)


In [88]:
# Fit the preprocessor on the training features only, transform them, and wrap
# the resulting array in a DataFrame labelled with the transformer's output names.
# NOTE: this rebinds X_train to the transformed frame. The new frame gets a fresh
# 0..n-1 index (the original row labels are not carried over, while y_train keeps
# them), and the cell cannot be re-run without re-running the train/test split
# first, because the raw column names no longer exist in X_train.
X_train= pd.DataFrame(preprocessor.fit_transform(X_train),columns= preprocessor.get_feature_names_out())

In [89]:
y_train

Unnamed: 0,price
11504,1181
95284,7418
184777,12755
5419,1020
45466,445
...,...
119879,1410
103694,15064
131932,7209
146867,816


In [90]:
# Only transform (never fit) the test data: it is imputed, encoded and scaled with
# the statistics (medians, most frequent values, means, standard deviations) that
# the preprocessor learned from the training split.
# Fitting the preprocessor on the test set would leak test-set information into
# the pipeline -- it would be like giving the model the question paper in advance.
X_test= pd.DataFrame(preprocessor.transform(X_test),columns= preprocessor.get_feature_names_out())

In [91]:
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.629077,0.258230,-0.120630,-0.600482,-0.581521,-0.572248,0.874100,-1.552614,-0.648127
1,2.605374,-2.148014,-0.120630,2.126042,2.198832,1.959219,-1.137644,0.294987,-1.314417
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.469110,-0.131772,-0.936747,2.017037
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-0.131772,1.526720,2.017037
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,0.874100,0.910853,-0.648127
...,...,...,...,...,...,...,...,...,...
58067,0.255007,0.535873,0.921902,0.416340,0.369414,0.425874,-1.137644,1.526720,-1.314417
58068,-0.607514,0.535873,-0.641897,-0.528495,-0.554351,-0.499920,0.874100,-1.552614,0.018164
58069,-0.823144,-0.019414,-0.641897,-0.834441,-0.862273,-0.847093,0.874100,0.294987,2.017037
58070,0.901897,-0.667249,1.443168,1.046230,0.967145,0.932167,-0.131772,1.526720,-0.648127


In [52]:
## Model Training 

In [92]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [93]:
regression = LinearRegression()


In [94]:
regression.fit(X_train, y_train)

In [95]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [96]:
regression.intercept_

array([3976.8787389])

In [99]:
# Fit an ordinary-least-squares baseline (fit() returns the estimator itself)
# and predict prices for the held-out test features.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [100]:
y_pred

array([[ 1616.03275998],
       [15104.13631181],
       [ 1727.49228115],
       ...,
       [ 1878.27425152],
       [ 6295.06951547],
       [ 5976.94207688]])

In [114]:
# let's create a function that gives the metrics of the model
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned position-by-position with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is computed exactly once and only used to derive the RMSE.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)

    return mae, rmse, r2


In [117]:
### Train multiple models 

models ={
    'LinearRegression' : LinearRegression(), 
    'Lasso' : Lasso(), 
    'Ridge' : Ridge(),
    'ElasticNet' : ElasticNet()
}

model_list=[]
r2_list=[]

# Fit every candidate regressor on the training split and score it on the
# held-out test split. Note that the printed header says "training performance",
# but the metrics are computed from y_test and the predictions on X_test.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # making the predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2 = evaluate_model(y_test, y_pred)

    # Keep names and scores in parallel lists (same order as `models`).
    model_list.append(model_name)
    r2_list.append(r2)

    print(model_name)
    print('Model training performance are as below :')
    print(f'RMSE is {rmse}')
    print(f'MAE is {mae}')
    print(f'R2_score is {r2*100}')

    print('='*35)
    print('\n')

LinearRegression
Model training performance are as below :
RMSE is 1014.6296630375463
MAE is 675.0758270067483
R2_score is 93.62906819996049


Lasso
Model training performance are as below :
RMSE is 1014.659130275064
MAE is 676.2421173665508
R2_score is 93.62869814082755


Ridge
Model training performance are as below :
RMSE is 1014.6343233534411
MAE is 675.1077629781329
R2_score is 93.62900967491632


ElasticNet
Model training performance are as below :
RMSE is 1533.3541245902313
MAE is 1060.9432977143008
R2_score is 85.44967219374031




In [118]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']