Author: Siddharth Yadav

Date: 17-12-23

Dataset: Diamonds [link](https://www.kaggle.com/datasets/shivam2503/diamonds)

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [15]:
from pathlib import Path

IMAGES_PATH = Path() / "images" / "Diamonds dataset"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [4]:
df = pd.read_csv("diamonds.csv", sep=',', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [29]:
cut_mapping = {'Fair': 1,
               'Good': 2,
               'Very Good': 3,
               'Premium': 4,
               'Ideal': 5}
df['cut'] = df['cut'].replace(cut_mapping)

color_mapping = {
    'J': 1,
    'I': 2,
    'H': 3,
    'G': 4,
    'F': 5,
    'E': 6,
    'D': 7
}
df['color'] = df['color'].replace(color_mapping)

clarity_mapping = {
    'I1': 0,
    'SI2': 1,
    'SI1': 2,
    'VS2': 3,
    'VS1': 4,
    'VVS2': 5,
    'VVS1': 6,
    'IF': 7}
df['clarity'] = df['clarity'].replace(clarity_mapping)


In [30]:
df.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,3.904097,4.405803,3.05102,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.1166,1.701105,1.647136,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,1.0,1.0,0.0,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,3.0,3.0,2.0,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,4.0,4.0,3.0,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,5.0,6.0,4.0,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,5.0,7.0,7.0,79.0,95.0,18823.0,10.74,58.9,31.8


In [31]:
X, y = df.drop(columns=['price']), df['price']

In [32]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1,0.23,5,6,1,61.5,55.0,3.95,3.98,2.43
2,0.21,4,6,2,59.8,61.0,3.89,3.84,2.31
3,0.23,2,6,4,56.9,65.0,4.05,4.07,2.31
4,0.29,4,2,3,62.4,58.0,4.2,4.23,2.63
5,0.31,2,1,1,63.3,58.0,4.34,4.35,2.75


In [33]:
y.head()

1    326
2    326
3    327
4    334
5    335
Name: price, dtype: int64

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42)

# SVM Regression Model

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score

svm_reg = make_pipeline(StandardScaler(),
                        LinearSVR(epsilon=0.5, random_state=42))
svm_rmses = -cross_val_score(svm_reg, X_train, y_train,
                             scoring="neg_root_mean_squared_error", cv=5)



In [40]:
svm_rmses

array([1642.98049986, 1672.11118571, 1700.4863424 , 1716.06549702,
       1685.24247849])

In [51]:
from sklearn.metrics import mean_squared_error

svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_train)
svm_rmse = mean_squared_error(y_train, y_pred,
                              squared=False)
svm_rmse



1616.5310182434216

## Fine-Tuning

In [60]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform

svm_full_pipeline = Pipeline([
    ("standard_scaler", StandardScaler()),
    ("svm", LinearSVR(random_state=42)),
])

svm_param_distribs = {
    'svm__C': np.logspace(-3, 5, 17),
}

svm_rnd_search = RandomizedSearchCV(
    svm_full_pipeline, svm_param_distribs, n_iter=10, cv=3,
    scoring="neg_root_mean_squared_error", random_state=42)

svm_rnd_search.fit(X_train, y_train)



In [63]:
svm_cv_res = pd.DataFrame(svm_rnd_search.cv_results_)
svm_cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
svm_cv_res = svm_cv_res[["param_svm__C", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
svm_cv_res.columns = ["C"] + score_cols
svm_cv_res[score_cols] = -svm_cv_res[score_cols].round().astype(np.int64)
svm_cv_res.head()

Unnamed: 0,C,split0,split1,split2,mean_test_rmse
9,31.622777,1322,1342,1377,1347
4,316.227766,1320,1353,1378,1350
5,10000.0,1313,1363,1384,1353
7,3162.27766,1318,1365,1381,1355
6,10.0,1340,1353,1393,1362


# Decision Tree Regressor

In [42]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(
    StandardScaler(),
    DecisionTreeRegressor(random_state=42))
tree_rmses = -cross_val_score(tree_reg, X_train, y_train,
                              scoring="neg_root_mean_squared_error", cv=5)

In [43]:
tree_rmses

array([804.10164178, 761.33345418, 781.76344544, 790.28879708,
       773.61354419])

# Random Forest Regressor

In [44]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(StandardScaler(),
                           RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_root_mean_squared_error", cv=5)

In [45]:
forest_rmses

array([601.01371563, 567.39407733, 555.72274671, 574.39084223,
       551.6087629 ])

In [48]:
forest_reg.fit(X_train, y_train)
y_pred = forest_reg.predict(X_train)
forest_rmse = mean_squared_error(y_train, y_pred,
                                 squared=False)
forest_rmse

208.1548993230987

Training error is much lower than the validation error, which usually meanns the model has overfit the training set.

## Fine-Tuning

In [49]:
from scipy.stats import randint

full_pipeline = Pipeline([
    ("standard_scaler", StandardScaler()),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

param_distribs = {"random_forest__max_features": randint(low=2, high=20)}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
    scoring="neg_root_mean_squared_error", random_state=42)

rnd_search.fit(X_train, y_train)

In [50]:
cv_res = pd.DataFrame(rnd_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res = cv_res[["param_random_forest__max_features", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
cv_res.columns = ["max_features"] + score_cols
cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)
cv_res.head()

Unnamed: 0,max_features,split0,split1,split2,mean_test_rmse
0,8,597,545,570,571
4,8,597,545,570,571
7,5,599,548,567,571
1,16,599,548,570,572
2,12,599,548,570,572
