In [245]:
import pandas as pd 
import numpy as np

In [246]:
data=pd.read_csv("./data/gemstone.csv")

In [247]:
data.drop(labels=["id"],axis=1,inplace=True)

In [248]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [249]:
X=data.drop(labels=["price"],axis=1)

In [250]:
y=data[["price"]]

In [251]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [252]:
categorical_cols=X.select_dtypes(include='object').columns

In [253]:
numerical_cols=X.select_dtypes(exclude='object').columns

In [254]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [255]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [256]:
X.select_dtypes(include="object")

Unnamed: 0,cut,color,clarity
0,Premium,F,VS2
1,Very Good,J,SI2
2,Ideal,G,VS1
3,Ideal,G,VS1
4,Premium,G,VS2
...,...,...,...
193568,Ideal,D,VVS2
193569,Premium,G,VVS2
193570,Very Good,F,SI1
193571,Very Good,D,SI1


In [257]:
# Define the custom ranking for each ordinal variable 
cut_categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories=['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
categories=[cut_categories,color_categories,clarity_categories]

In [258]:
from sklearn.impute import SimpleImputer ##Handling missing values
from sklearn.preprocessing import StandardScaler ## handling feature scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [259]:
# Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [260]:
##train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [261]:
preprocessor.fit_transform(X_train)

array([[-0.97543926, -0.84960654, -0.12153081, ...,  0.87407553,
         1.52872212,  1.35273128],
       [ 0.2351953 ,  1.83363716, -0.12153081, ..., -2.14455824,
        -0.93507064, -0.64678628],
       [ 0.49461699,  0.81585507,  0.39980029, ..., -0.13213573,
         0.29682574,  0.68622543],
       ...,
       [ 0.45138004,  1.55606023, -0.6428619 , ..., -2.14455824,
         0.29682574, -0.64678628],
       [ 0.66756478, -1.77486298,  1.44246248, ...,  0.87407553,
         0.29682574,  0.68622543],
       [ 0.25681377,  0.81585507, -0.12153081, ...,  0.87407553,
         0.29682574, -0.64678628]])

In [262]:
preprocessor.fit_transform(X_test)

array([[-0.56047339, -0.94666761, -0.63268646, ..., -0.1347776 ,
        -0.93190323,  0.00931367],
       [-0.17167285,  0.99091296, -0.11194742, ..., -1.13988934,
         0.91757164, -0.65513801],
       [-1.05727407,  0.25278703, -0.11194742, ...,  0.87033413,
         0.91757164,  2.66712039],
       ...,
       [-0.19327288, -3.34557688,  1.45026972, ..., -0.1347776 ,
         2.15055489,  0.00931367],
       [-0.81967374, -0.20854168, -0.11194742, ...,  0.87033413,
         0.30108002, -0.65513801],
       [ 2.61473099, -0.76213612,  1.45026972, ..., -0.1347776 ,
         2.15055489,  0.00931367]])

In [263]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [264]:
#scaling dataset
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [265]:
#checking for scaling
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


In [266]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
import xgboost as xgb
from lightgbm import LGBMRegressor

#for metric evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [267]:
#function to evaluate model using mae,rmse and R2 score  
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [268]:
## Training  multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'Lightgbm':LGBMRegressor(),
    'XGboost':xgb.XGBRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 1013.9047094344004
MAE: 674.0255115796832
R2 score 93.68908248567512


Lasso
Model Training Performance
RMSE: 1013.8784226767013
MAE: 675.0716923362158
R2 score 93.68940971841704


Ridge
Model Training Performance
RMSE: 1013.9059272771643
MAE: 674.0555800798212
R2 score 93.68906732505938


Elasticnet
Model Training Performance
RMSE: 1533.4162456064048
MAE: 1060.7368759154729
R2 score 85.56494831165182


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1228
[LightGBM] [Info] Number of data points in the train set: 135501, number of used features: 9
[LightGBM] [Info] Start training from score 3970.766290
Lightgbm
Model Training Performance
RMSE: 578.6769211933583
MAE: 298.50019838441443
R2 score 97.94424882293809


XGboost
Model Trainin

In [269]:
#printing list of models used.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet', 'Lightgbm', 'XGboost']

In [270]:
trained_model_list

[]

In [271]:
r2_list

[0.9368908248567511,
 0.9368940971841704,
 0.9368906732505938,
 0.8556494831165181,
 0.979442488229381,
 0.9788343218502168]