In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

In [2]:
all_data = pd.read_csv('diamonds.csv')

In [3]:
all_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [4]:
all_data.info() # no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   x        53940 non-null  float64
 7   y        53940 non-null  float64
 8   z        53940 non-null  float64
 9   price    53940 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [5]:
all_data.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,5.731157,5.734526,3.538734,3932.799722
std,0.474011,1.432621,2.234491,1.121761,1.142135,0.705699,3989.439738
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,950.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,2401.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,5324.25
max,5.01,79.0,95.0,10.74,58.9,31.8,18823.0


In [6]:
(all_data == 0).sum() #size can not be 0

carat       0
cut         0
color       0
clarity     0
depth       0
table       0
x           8
y           7
z          20
price       0
dtype: int64

In [7]:
all_data = all_data.drop(all_data.index[all_data['z']==0])
(all_data == 0).sum() 

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [8]:
all_data['size'] = all_data['x']*all_data['y']*all_data['z']
cat_attribs = all_data.select_dtypes(['object']).columns.tolist()
num_attribs = all_data.select_dtypes(exclude=['object']).columns.tolist()
num_attribs.remove('price')

In [9]:
for cat in cat_attribs:
    print(cat,all_data[cat].unique())

cut ['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']
color ['E' 'I' 'J' 'H' 'F' 'G' 'D']
clarity ['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']


In [10]:
cat_pipeline = Pipeline([
    ('encode', OrdinalEncoder())
])

In [11]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])

In [12]:
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, cat_attribs),
    ('num', num_pipeline, num_attribs),
])

In [13]:
X_train, X_test = train_test_split(all_data, test_size=0.3)

y_train = X_train['price']
y_test = X_test['price']

In [14]:
models = {
    'svr_model': SVR(),
    'lin_model': LinearRegression(),
    'tree_model': DecisionTreeRegressor(),
    'lasso': Lasso(),
}

In [15]:
final_model = Pipeline([
    ('preprocessing', preprocessing),
    ('model_regression', models['svr_model'])
])

In [16]:
for model_id, model in enumerate(models):
    final_model.set_params(model_regression = models[model])
    final_model.fit(X_train, y_train)
    print(f'{model}: {final_model.score(X_test, y_test)}') #coefficient of determination of the prediction

svr_model: 0.5674602894404237
lin_model: 0.8833812024210451
tree_model: 0.9659637282247366
lasso: 0.8833933398892917


In [17]:
final_model.set_params(model_regression = models['tree_model'])
final_model.fit(X_train, y_train)

In [18]:
y_pred = final_model.predict(X_test)

In [19]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean squared error = {mse}\nRoot mean squared error = {rmse}\nAbsolute mean error = {mae}')

Mean squared error = 544705.8547385015
Root mean squared error = 738.041905814637
Absolute mean error = 358.5407084569733


In [21]:
joblib.dump(final_model,'Diamonds_price_prediction.plk')

['Diamonds_price_prediction.plk']