In [2]:
import pandas as pd

## Model Training

In [2]:
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df = df.drop(labels=['id'],axis=1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
## Independent and Dependent Features
X = df.drop(labels=['price'],axis=1)
y = df[["price"]]
print(X,"\n",y)

        carat        cut color clarity  depth  table     x     y     z
0        1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1        2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2        0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3        0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4        1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
...       ...        ...   ...     ...    ...    ...   ...   ...   ...
193568   0.31      Ideal     D    VVS2   61.1   56.0  4.35  4.39  2.67
193569   0.70    Premium     G    VVS2   60.3   58.0  5.75  5.77  3.47
193570   0.73  Very Good     F     SI1   63.1   57.0  5.72  5.75  3.62
193571   0.34  Very Good     D     SI1   62.9   55.0  4.45  4.49  2.81
193572   0.71       Good     E     SI2   60.8   64.0  5.73  5.71  3.48

[193573 rows x 9 columns] 
         price
0       13619
1       13387
2        2772
3         666
4       14453
...       ...
193568   1130
193569 

In [7]:
## Segregate Numerical and Categorical columns

numerical_columns = X.columns[X.dtypes!='object'].to_list()
categorical_columns = X.columns[X.dtypes=='object'].to_list()
print("Numerical Columns: ",numerical_columns)
print("Categorical Columns: ",categorical_columns)

Numerical Columns:  ['carat', 'depth', 'table', 'x', 'y', 'z']
Categorical Columns:  ['cut', 'color', 'clarity']


In [8]:
# Define the custom ranking for each ordinal variable

cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [9]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler, OrdinalEncoder ## Handling Feature Scaling & Ordinal Encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer ## For combining two pipelines

## Data Transformation Component

In [10]:
# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Step 1: Fill missing values with the median of the column
        ('scaler', StandardScaler())  # Step 2: Scale the numerical features to have a mean of 0 and a standard deviation of 1
    ]
)

# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Step 1: Fill missing values with the most frequent category in the column
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),  # Step 2: Encode categorical variables into numerical values based on predefined categories
        ('scaler', StandardScaler())  # Step 3: Scale the encoded categorical features (may or may not be necessary depending on the model)
    ]
)

# Preprocessor: Combines the numerical and categorical pipelines into one
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),  # Apply the numerical pipeline to the specified numerical columns
    ('cat_pipeline', cat_pipeline, categorical_columns)  # Apply the categorical pipeline to the specified categorical columns
])


In [11]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=30)

print(X_train.head(),'\n\n',X_test.head(),'\n\n\n',y_train.head(),'\n\n',y_test.head())

        carat        cut color clarity  depth  table     x     y     z
33748    1.01       Fair     F     VS2   65.3   56.0  6.21  6.16  4.05
189776   0.40      Ideal     G     SI1   61.9   57.0  4.74  4.78  2.94
17703    1.13  Very Good     H     SI1   61.6   59.0  6.63  6.71  4.12
151542   0.70      Ideal     I     VS2   62.1   56.0  5.66  5.71  3.53
3795     0.54      Ideal     H     VS1   61.2   58.0  5.20  5.24  3.20 

         carat        cut color clarity  depth  table     x     y     z
70432    0.53    Premium     E     VS2   60.8   56.0  5.24  5.21  3.19
64839    0.71  Very Good     H     SI1   62.9   57.0  5.67  5.69  3.56
185316   0.30      Ideal     H      IF   62.1   57.0  4.27  4.29  2.66
84658    1.24    Premium     G     VS2   61.6   61.0  6.88  6.82  4.21
31953    0.36    Premium     E     VS1   60.4   58.0  4.60  4.63  2.80 


         price
33748    4804
189776    700
17703    5581
151542   2147
3795     1583 

         price
70432    1607
64839    2211
185316    76

In [12]:
# Convert transformed training and testing data into DataFrames with appropriate column names
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

# Display the first few rows of the transformed data
print(X_train.head(), '\n\n', X_test.head())


   num_pipeline__carat  num_pipeline__depth  num_pipeline__table  \
0             0.473767             3.220176            -0.641637   
1            -0.845697             0.075806            -0.120391   
2             0.733333            -0.201638             0.922103   
3            -0.196781             0.260769            -0.641637   
4            -0.542870            -0.571564             0.400856   

   num_pipeline__x  num_pipeline__y  num_pipeline__z  cat_pipeline__cut  \
0         0.445161         0.398319         0.747248          -3.151612   
1        -0.880410        -0.854100        -0.862659           0.873627   
2         0.823895         0.897472         0.848774          -1.138993   
3        -0.050801        -0.010079        -0.006943           0.873627   
4        -0.465605        -0.436627        -0.485564           0.873627   

   cat_pipeline__color  cat_pipeline__clarity  
0            -0.318068               0.019657  
1             0.297795              -0.64666

## Model Training

In [29]:
# Importing necessary regression models and evaluation metrics from scikit-learn

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

regression=LinearRegression() # Initialize the Linear Regression model
regression.fit(X_train,y_train) # Train the model on the training data

In [20]:
print("Coefficients :\n",regression.coef_,"\nIntercept :",regression.intercept_)

Coefficients :
 [[ 6434.78387743  -132.64285991   -70.72462747 -1744.76501257
   -471.40887345   -70.2527634     71.08211437  -462.11164254
    651.0562136 ]] 
Intercept : [3968.35483093]


In [27]:
y_pred = regression.predict(X_test)

In [38]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_score_ = r2_score(true,predicted)
    return mae,mse,rmse,r2_score_

evaluate_model(y_test,y_pred)

(677.565219031756, 1042258.6481907823, 1020.9106955021983, 0.9363482324438801)

## Model Evaluation

In [41]:
## Train Multiple Models

models ={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred = model.predict(X_test)

    mae,mse,rmse,r2_score_ = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE :",rmse)
    print("MSE :",mse)
    print("MAE :",mae)
    print("R2 Score :",r2_score_)

    r2_list.append(r2_score_)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE : 1020.9106955021983
MSE : 1042258.6481907823
MAE : 677.565219031756
R2 Score : 0.9363482324438801


Lasso
Model Training Performance
RMSE : 1020.8783432223119
MSE : 1042192.5916603324
MAE : 678.6412565975306
R2 Score : 0.9363522665815954


Ridge
Model Training Performance
RMSE : 1020.9104624024321
MSE : 1042258.1722427477
MAE : 677.5922524952764
R2 Score : 0.9363482615104978


ElasticNet
Model Training Performance
RMSE : 1539.931726100097
MSE : 2371389.721049624
MAE : 1062.6828823951396
R2 Score : 0.8551768818889254


