In [54]:
# Importing numerical computation libraries
import numpy as np
import pandas as pd
import math

import pickle  # pickle is a module which helps us to store our files in bytes/ binary format so that we can use it later where ever we want to.

# Visualization libraries for EDA
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning utilities
from sklearn.pipeline import Pipeline  
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import  RobustScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor 

# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

## Data Loading

In [2]:
# Read the diamonds dataset from a CSV file given by a relative path
df=pd.read_csv("diamonds.csv")
# Preview the first five rows
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Data Cleaning and Manipulation

In [3]:
# Checking dimensions: (number of rows, number of columns)
df.shape

(53940, 10)

In [4]:
# Understanding the data type and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [5]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical features
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [6]:
# Counting the missing values in each column
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [7]:
# Rename the measurement columns in one pass. Every label is looked up in the
# mapping exactly once, so the old "depth" becomes "depth_percentage" while
# "z" takes over the name "depth" without the two colliding.
df = df.rename(
    columns={
        "depth": "depth_percentage",
        "x": "length",
        "y": "width",
        "z": "depth",
    }
)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth_percentage,table,price,length,width,depth
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [8]:
# Collect the rows where at least one of length, width or depth is recorded as 0
has_zero_dimension = df[["length", "width", "depth"]].eq(0).any(axis="columns")
df_zero = df[has_zero_dimension]
df_zero.head()

Unnamed: 0,carat,cut,color,clarity,depth_percentage,table,price,length,width,depth
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0


In [9]:
# Row labels of the records collected in df_zero
df_zero.index

Index([ 2207,  2314,  4791,  5471, 10167, 11182, 11963, 13601, 15951, 24394,
       24520, 26123, 26243, 27112, 27429, 27503, 27739, 49556, 49557, 51506],
      dtype='int64')

In [10]:
# Remove the rows collected in df_zero (records with a zero length, width or depth).
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# labels are already gone from df, and a plain drop() would raise KeyError.
df = df.drop(index=df_zero.index, errors="ignore")
df.shape

(53920, 10)

## Pipeline

In [11]:
# Target is the price column; the features are every remaining column
y = df["price"]
x = df.drop(columns=["price"])

In [12]:
# Group the feature names by dtype: numeric columns vs. object (string) columns
num_cols = x.select_dtypes(include=np.number).columns
cat_cols = x.select_dtypes(include=object).columns

In [13]:
# Category orderings handed to the OrdinalEncoder: within each list the values
# are encoded as 0, 1, 2, ... in the order written here.
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

# One list per categorical feature, in the order: cut, color, clarity
ordinal_categories = [cut_order, color_order, clarity_order]

In [14]:
# Categorical branch: fill missing entries with the most frequent value, then
# encode each column to integers using the orders in ordinal_categories.
# Values that were not seen while fitting are encoded as -1.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
cat_pipeline = Pipeline(
    steps=[
        ("missing_value_treatment", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

cat_pipeline

In [15]:
# Numerical branch: a single step that fills missing values with the column median
num_imputer = SimpleImputer(strategy="median")
num_pipeline = Pipeline(steps=[("missing_value_treatment", num_imputer)])

num_pipeline

In [16]:
# Send each group of columns through its own preprocessing pipeline and
# combine the outputs (numerical block first, then the categorical block)
column_branches = [
    ("numerical_columns", num_pipeline, num_cols),
    ("categorical_columns", cat_pipeline, cat_cols),
]
data_preprocess = ColumnTransformer(transformers=column_branches)

data_preprocess

In [17]:
# Complete preprocessing + Random Forest model pipeline.
# random_state is fixed so the forest's bootstrap sampling is repeatable;
# without it every fit, and every metric reported below, changes from run to
# run. 69 is the same seed that train_test_split uses in this notebook.
pipeline = Pipeline(steps=[
    ("data_preprocessing", data_preprocess),
    ("model", RandomForestRegressor(random_state=69)),
])

pipeline

In [18]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=69,
)

In [19]:
# Fit the preprocessing steps and the Random Forest model on the training dataset
pipeline.fit(x_train, y_train)

In [20]:
# Generate predictions on the test dataset using the fitted Random Forest pipeline
y_pred = pipeline.predict(x_test)
y_pred

array([ 1879.5 ,  1673.93,  1630.2 , ..., 13629.35,   839.15,   634.43])

In [21]:
# Wrap the raw prediction array in a Series that reuses y_test's index, so
# actual and predicted values line up row by row during evaluation
y_pred_series = pd.Series(data=y_pred, index=y_test.index)

In [22]:
# Place actual and predicted prices side by side; `keys` supplies the column labels
result = pd.concat(
    [y_test, y_pred_series],
    axis="columns",
    keys=["Actual", "Predicted"],
)
result

Unnamed: 0,Actual,Predicted
48271,1956,1879.50
45703,1698,1673.93
44432,1590,1630.20
7954,4320,4668.09
52523,2528,2403.70
...,...,...
25215,13813,13410.57
52597,2542,2580.08
25499,14294,13629.35
32430,792,839.15


In [23]:
# Mean Absolute Error (MAE) of the baseline model on the test set
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_series)
mae

270.6685291304578

In [24]:
# Mean Squared Error (MSE) of the baseline model on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_series)
mse

298284.1031549665

In [25]:
# Root Mean Squared Error (RMSE) of the baseline model on the test set
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred_series)
rmse

546.1539189230143

In [26]:
# Generating predictions on the training data with the fitted pipeline
# (used below to compare the train R² against the test R²)
y_train_pred = pipeline.predict(x_train)

In [27]:
# R² score of the baseline model on the training data
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
r2_train

0.9974222908304066

In [28]:
# R² score of the baseline model on the test data
r2_test = r2_score(y_true=y_test, y_pred=y_pred_series)
r2_test

0.9812802710834528

## Hyperparameter Tuning

In [41]:
# Define the hyperparameter grid.
# The "model__" prefix routes each setting to the "model" step of the pipeline.
hyper_param = {
    'model__n_estimators':[50,60,70,100],
    'model__max_depth': [5,10,15],
    'model__min_samples_split': [2,5,10],
    'model__min_samples_leaf': [1,2,5]
}

# The grid holds 4 * 3 * 3 * 3 = 108 combinations, so n_iter=108 evaluates
# every one of them; the random part only decides the order of the candidates.
# random_state pins that order so cv_results_ is laid out identically on every run.
# scoring is the NEGATIVE RMSE, so best_score_ and .score() are <= 0 and
# values closer to 0 are better.
random_search = RandomizedSearchCV(pipeline,
                                 param_distributions=hyper_param,
                                 n_iter=108,
                                 cv=5,
                                 scoring="neg_root_mean_squared_error",
                                 n_jobs = -1,
                                 verbose = 1,
                                 random_state=69
)

In [42]:
# Run the cross-validated search on the training data only
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [43]:
# Report the winning hyperparameters and the pipeline refitted with them
print("Best parameters set found on train set")
print(random_search.best_params_)
print()
print(random_search.best_estimator_)
print()
# The search is scored with "neg_root_mean_squared_error", so .score() returns
# the NEGATIVE RMSE (closer to 0 is better). The label says so explicitly to
# keep the number from being misread as R² or a positive error.
print('Score on Test Data (negative RMSE): ', random_search.score(x_test, y_test))

Best parameters set found on train set
{'model__n_estimators': 100, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_depth': 15}

Pipeline(steps=[('data_preprocessing',
                 ColumnTransformer(transformers=[('numerical_columns',
                                                  Pipeline(steps=[('missing_value_treatment',
                                                                   SimpleImputer(strategy='median'))]),
                                                  Index(['carat', 'depth_percentage', 'table', 'length', 'width', 'depth'], dtype='object')),
                                                 ('categorical_columns',
                                                  Pipeline(steps=[('missing_value_treatment',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                          

In [44]:
# Retrieve the best pipeline (preprocessing + Random Forest) found by RandomizedSearchCV
best_model = random_search.best_estimator_

# Generate predictions on the test dataset using the tuned model
y_pred_best = best_model.predict(x_test)

In [45]:
# Wrap the tuned model's predictions in a Series that reuses y_test's index,
# so actual and predicted values line up row by row during evaluation
y_pred_series_best = pd.Series(data=y_pred_best, index=y_test.index)

In [46]:
# Place actual prices next to the tuned model's predictions; `keys` supplies the column labels
result_best = pd.concat(
    [y_test, y_pred_series_best],
    axis="columns",
    keys=["Actual", "Predicted"],
)
result_best

Unnamed: 0,Actual,Predicted
48271,1956,1886.922398
45703,1698,1683.295602
44432,1590,1651.811539
7954,4320,4640.194343
52523,2528,2414.070783
...,...,...
25215,13813,13390.555452
52597,2542,2659.358283
25499,14294,13841.600554
32430,792,847.317818


In [47]:
# Mean Absolute Error (MAE) of the tuned model on the test set
mae_best = mean_absolute_error(y_true=y_test, y_pred=y_pred_series_best)
mae_best

271.5657620107594

In [48]:
# Mean Squared Error (MSE) of the tuned model on the test set
mse_best = mean_squared_error(y_true=y_test, y_pred=y_pred_series_best)
mse_best

294708.85465759755

In [49]:
# Root Mean Squared Error (RMSE) of the tuned model on the test set
rmse_best = root_mean_squared_error(y_true=y_test, y_pred=y_pred_series_best)
rmse_best

542.870937385303

In [50]:
# Generating predictions on the training data with the tuned model
# (used below to compare the train R² against the test R²)
y_train_pred_best = best_model.predict(x_train)

In [51]:
# R² score of the tuned model on the training data
r2_train_best = r2_score(y_true=y_train, y_pred=y_train_pred_best)
r2_train_best

0.9937251268899111

In [52]:
# R² score of the tuned model on the test data
r2_test_best = r2_score(y_true=y_test, y_pred=y_pred_series_best)
r2_test_best

0.9815046467104881

## Pickle 

In [55]:
# Serialize the tuned pipeline (preprocessing + model) to a binary pickle file
with open("Diamond_Price_Prediction.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)