In [58]:
import pandas as pd
import numpy as np

In [59]:
data = pd.read_csv("./data/gemstone.csv")

In [60]:
data.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [61]:
data.drop(labels=["id"],axis=1,inplace=True)

In [62]:
data.duplicated().sum()

34

In [63]:
data = data.drop_duplicates()

In [64]:
X=data.drop(labels=["price"],axis=1)

In [65]:
y=data[["price"]]

In [66]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [67]:
print(categorical_cols)
print(numerical_cols)

Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [68]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler ## Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [69]:
cut_categories=["Fair","Good","Very Good","Premium","Ideal"]
clarity_categories=["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]
color_categories=["D","E","F","G","H","I","J"]

In [70]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler())
    ]
)

In [71]:
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
)

In [72]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)


In [73]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=30)

In [74]:
preprocessor.fit_transform(X_train)

array([[ 2.57516974,  0.10845663, -0.20045303, ...,  4.        ,
         4.        ,  1.        ],
       [-0.87815872,  0.10845663, -1.10204094, ...,  4.        ,
         2.        ,  3.        ],
       [-0.8363002 , -0.89643192,  0.25034093, ...,  2.        ,
         0.        ,  3.        ],
       ...,
       [ 0.46131414,  0.18023439, -0.65124699, ...,  4.        ,
         0.        ,  5.        ],
       [-1.04559283,  1.04156743, -0.20045303, ...,  1.        ,
         0.        ,  1.        ],
       [-1.04559283, -0.03509888, -1.10204094, ...,  4.        ,
         4.        ,  5.        ]])

In [76]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [77]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.fit_transform(X_test),columns=preprocessor.get_feature_names_out())


In [78]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.57517,0.108457,-0.200453,2.13679,2.115004,2.04448,4.0,4.0,1.0
1,-0.878159,0.108457,-1.102041,-0.925939,-0.987762,-0.89272,4.0,2.0,3.0
2,-0.8363,-0.896432,0.250341,-0.863796,-0.81787,-0.878994,2.0,0.0,3.0
3,1.507777,0.539123,0.701135,1.417715,1.363902,1.399394,3.0,1.0,2.0
4,0.419456,4.343344,-2.003629,0.30803,0.255133,0.768033,0.0,4.0,1.0


linear regression

ridge regression

lasso regression

elastic net

In [79]:
## Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [82]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,mse,rmse,r2_square

In [89]:
## Train multiple models

models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae, mse, rmse, r2_square = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('RMSE:',rmse)
    print('MAE:',mae)
    print('R2 score:',r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1431.3257529098735
MAE: 824.2675500441032
R2 score: 87.56621987215061


Lasso
Model Training Performance
RMSE: 1287.960710649326
MAE: 819.1271840504527
R2 score: 89.93227271859382


Ridge
Model Training Performance
RMSE: 1414.7102836077293
MAE: 823.6893635128258
R2 score: 87.85321810314748


ElasticNet
Model Training Performance
RMSE: 1654.5197945755547
MAE: 1085.0354577954308
R2 score: 83.38615526569491




  model = cd_fast.enet_coordinate_descent(


In [90]:
r2_list

[0.8756621987215061,
 0.8993227271859382,
 0.8785321810314748,
 0.8338615526569492]