In [1]:
#Importing required libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #Handling features scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal Encoding 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('dataset/gemstone.csv')

In [3]:
#Top 5 records of dataset
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
#Shape of dataset
df.shape

(193573, 11)

In [5]:
#Columns of dataset
df.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [6]:
#Dropping 'id' column from dataset which is not usefull
df.drop('id', axis = 1, inplace= True)

In [7]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [8]:
#Independent and Dependent Variables
X= df.drop('price', axis = 1)
y= df[['price']]

In [9]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [10]:
y.head()

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453


In [11]:
X.shape, y.shape

((193573, 9), (193573, 1))

In [12]:
#Separate categorical features and numerical features
categorical_columns= X.dtypes[X.dtypes=='object'].index
numerical_columns= X.dtypes[X.dtypes!='object'].index

print("Categorical Features: ", categorical_columns)
print("Numerical Features: ", numerical_columns)

Categorical Features:  Index(['cut', 'color', 'clarity'], dtype='object')
Numerical Features:  Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [13]:
#Unique values in categorical features
for col in categorical_columns:
  print(col, df[col].unique())
  print("="*25)

cut ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
color ['F' 'J' 'G' 'E' 'D' 'H' 'I']
clarity ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [14]:
#Define custom ranking for each ordinal variable
cut_categories= ['Fair', 'Good', "Very Good", 'Premium', 'Ideal']
color_categories= ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories= ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [15]:
#Numerical pipeline
num_pipeline= Pipeline(
    steps= [
       ('imputer', SimpleImputer(strategy= 'median')),
       ('scaler', StandardScaler()) 
    ]
)


#Categorical Pipeline
cat_pipeline= Pipeline(
    steps= [
        ('imputer', SimpleImputer(strategy= 'most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories= [cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)

preprocessor= ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [16]:
#Train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(X,y, test_size= 0.3, random_state= 42)

In [17]:
x_train.shape, x_test.shape

((135501, 9), (58072, 9))

In [18]:
y_train.shape, y_test.shape

((135501, 1), (58072, 1))

In [19]:
#Transforming dataset
x_train= pd.DataFrame(preprocessor.fit_transform(x_train), columns= preprocessor.get_feature_names_out())
x_test= pd.DataFrame(preprocessor.transform(x_test), columns= preprocessor.get_feature_names_out())

In [20]:
x_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [21]:
x_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.629077,0.25823,-0.12063,-0.600482,-0.581521,-0.572248,0.8741,-1.552614,-0.648127
1,2.605374,-2.148014,-0.12063,2.126042,2.198832,1.959219,-1.137644,0.294987,-1.314417
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.46911,-0.131772,-0.936747,2.017037
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-0.131772,1.52672,2.017037
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,0.8741,0.910853,-0.648127


### Model Training 

In [22]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
#Performance of the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [23]:
reg= LinearRegression()
#train the model 
reg.fit(x_train,y_train)

In [24]:
#predict the model
y_pred = reg.predict(x_test) 
y_pred

array([[ 1616.03275998],
       [15104.13631181],
       [ 1727.49228115],
       ...,
       [ 1878.27425152],
       [ 6295.06951547],
       [ 5976.94207688]])

In [25]:
#Coefficient and intercept
print(reg.coef_)
print(reg.intercept_)

[[ 6432.97591819  -132.34206204   -70.48787525 -1701.38593925
   -494.17005097   -76.32351645    68.80035873  -464.67990411
    652.10059539]]
[3976.8787389]


### Create the function for evaluating metrics of the models

In [26]:
def evalute_models(true, predicted):
  mae= mean_absolute_error(true, predicted)
  mse = mean_squared_error(true, predicted)
  rmse= np.sqrt(mean_squared_error(true, predicted))
  score= r2_score(true, predicted)
  return mae, rmse, score

In [27]:
evalute_models(y_test, y_pred)

(675.0758270067483, 1014.6296630375463, 0.9362906819996049)

In [31]:
#Training multiple models

models= {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet()
}

model_list= []
r2_score_list= []

for i in range(len(list(models))):
  model= list(models.values())[i]
  #Train the model
  model.fit(x_train, y_train)
  #Make Prediction
  y_predict = model.predict(x_test)
  
  mae, rmse, score= evalute_models(y_test, y_predict)
  print(list(models.keys())[i])

  model_list.append(list(models.keys())[i])
  r2_score_list.append(score)

  print("Model Training Performance")
  print("RMSE: ", rmse*100)
  print("MAE: ", mae*100)
  print("R2 Score: ", score*100)
  print("="*30)

Linear Regression
Model Training Performance
RMSE:  101462.96630375463
MAE:  67507.58270067483
R2 Score:  93.62906819996049
Ridge
Model Training Performance
RMSE:  101463.43233534414
MAE:  67510.77629781366
R2 Score:  93.6290096749163
Lasso
Model Training Performance
RMSE:  101465.91302750638
MAE:  67624.2117366551
R2 Score:  93.62869814082755
ElasticNet
Model Training Performance
RMSE:  153335.41245902312
MAE:  106094.32977143009
R2 Score:  85.44967219374031


In [32]:
model_list

['Linear Regression', 'Ridge', 'Lasso', 'ElasticNet']

In [34]:
## Results
pd.DataFrame({'Model_Name': model_list, "Scores": r2_score_list}).sort_values(by='Scores', ascending= False)

Unnamed: 0,Model_Name,Scores
0,Linear Regression,0.936291
1,Ridge,0.93629
2,Lasso,0.936287
3,ElasticNet,0.854497
