In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
df = pd.read_csv('diamonds.zip')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
Unnamed: 0    53940 non-null int64
carat         53940 non-null float64
cut           53940 non-null object
color         53940 non-null object
clarity       53940 non-null object
depth         53940 non-null float64
table         53940 non-null float64
price         53940 non-null int64
x             53940 non-null float64
y             53940 non-null float64
z             53940 non-null float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [109]:
# unnamed is useless.

df.drop('Unnamed: 0', axis=1,inplace=True)

Let's jump to categorical data first

In [23]:
cat_list = ['cut','color', 'clarity']
for cat in cat_list:
    print(f"column {cat}, unique values: {df[cat].unique()}\n")

column cut, unique values: ['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']

column color, unique values: ['E' 'I' 'J' 'H' 'F' 'G' 'D']

column clarity, unique values: ['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']



In [110]:
# cut, color and clarity are oridinal according to the description in Kaggle ==> use factorize method (better than ordical encoder)
# if there were any nominal category, I would have used pd.get_dummies or OneHotEncoding

cut = pd.Categorical(df['cut'], categories=['Fair','Good','Very Good','Premium','Ideal'], ordered=True)
labels_cut, unique = pd.factorize(cut, sort=True)
df['cut'] = labels_cut

color = pd.Categorical(df['color'], categories=['J','I','H','G','F','E','D'], ordered=True)
labels_color, unique = pd.factorize(color, sort=True)
df['color'] = labels_color

clarity = pd.Categorical(df['clarity'], categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'], ordered=True)
labels_clarity, unique = pd.factorize(clarity, sort=True)
df['clarity'] = labels_clarity

In [25]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,3,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75


In [28]:
df.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,2.904097,3.405803,3.05102,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.1166,1.701105,1.647136,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,0.0,0.0,0.0,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,2.0,2.0,2.0,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,3.0,3.0,3.0,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,4.0,5.0,4.0,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,4.0,6.0,7.0,79.0,95.0,18823.0,10.74,58.9,31.8


min of x, y and z are zero which doesn't make sense, and even more weird that the depth min is not zero!
let's see how many rows we have we either x, y or z being zero

In [111]:
len(df[(df['x']==0) | (df['y']==0) | (df['z']==0)])

20

not many compare to all the instances. there is no harm in dropping these 20 instances.

In [112]:
df = df[(df['x']!=0) & (df['y']!=0) & (df['z']!=0)]

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53920 entries, 0 to 53939
Data columns (total 10 columns):
carat      53920 non-null float64
cut        53920 non-null int64
color      53920 non-null int64
clarity    53920 non-null int64
depth      53920 non-null float64
table      53920 non-null float64
price      53920 non-null int64
x          53920 non-null float64
y          53920 non-null float64
z          53920 non-null float64
dtypes: float64(6), int64(4)
memory usage: 4.5 MB


In [37]:
df.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
carat,1.0,-0.134953,-0.29136,-0.352757,0.028259,0.181646,0.921592,0.977779,0.953991,0.961048
cut,-0.134953,1.0,0.020517,0.189153,-0.218073,-0.433306,-0.053491,-0.126232,-0.122181,-0.150647
color,-0.29136,0.020517,1.0,-0.025783,-0.047373,-0.026481,-0.172431,-0.270671,-0.263915,-0.270011
clarity,-0.352757,0.189153,-0.025783,1.0,-0.067457,-0.160256,-0.146789,-0.372865,-0.359015,-0.37025
depth,0.028259,-0.218073,-0.047373,-0.067457,1.0,-0.295733,-0.010729,-0.025017,-0.029069,0.095023
table,0.181646,-0.433306,-0.026481,-0.160256,-0.295733,1.0,0.127245,0.196097,0.184493,0.152483
price,0.921592,-0.053491,-0.172431,-0.146789,-0.010729,0.127245,1.0,0.887231,0.867864,0.868206
x,0.977779,-0.126232,-0.270671,-0.372865,-0.025017,0.196097,0.887231,1.0,0.974918,0.975435
y,0.953991,-0.122181,-0.263915,-0.359015,-0.029069,0.184493,0.867864,0.974918,1.0,0.956744
z,0.961048,-0.150647,-0.270011,-0.37025,0.095023,0.152483,0.868206,0.975435,0.956744,1.0


x, y and z are highly corrolated to price and also to themselves.
I will try two different approches, first I will leave them as-is and persue.
second, I will remove them and add the volume(x*y*z) as a new feature (feature engineering)

In [113]:
# saving a copy of the data frame for the second approach

df_second_approach = df.copy()

In [114]:
# Alright, time to start the first approach

X = df.drop('price',axis=1)
y = df['price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

feature scaling, different methods to choose between, 1.StandardScaler, 2.MinMax Scaler, 3.MaxAbs Scaler and 4.Robust Scaler (which is more robust for datasets with many outliers)

In [115]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [48]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,BaggingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost
from sklearn import metrics

In [59]:
# definning a function for easier implimentation of the ML models

def machine_learning_score(regressor,X_train,y_train,X_test,y_test):
    regressor.fit(X_train,y_train)
    predictions = regressor.predict(X_test)
    print(f"Mean absolute error {regressor}: {metrics.mean_absolute_error(y_test,predictions)}")
    print(f"Mean squared error {regressor}: {metrics.mean_squared_error(y_test,predictions)}")
    print(f"Root Mean squared error {regressor}: {np.sqrt(metrics.mean_squared_error(y_test,predictions))}")
    print(f"The explained variance score {regressor}: {metrics.explained_variance_score(y_test,predictions)} \n \n")
    sns.distplot((y_test-predictions),bins=50)
#   sns.scatterplot(x=y_test,y=predictions)

In [52]:
# definning a function only for R2 sscore

def machine_learning_r2(regressor,X_train,y_train,X_test,y_test):
    regressor.fit(X_train,y_train)
    predictions = regressor.predict(X_test)
    return metrics.explained_variance_score(y_test,predictions)

In [61]:
# instanciating all the models that we are going to apply

lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
svr = SVR()
knr = KNeighborsRegressor()
dtree = DecisionTreeRegressor()
rfr = RandomForestRegressor()
abr = AdaBoostRegressor(n_estimators=1000)
mlpr = MLPRegressor()
xgb = xgboost.XGBRegressor()

# results_dict = {'lr':0, 'lasso':0, 'ridge':0, 'svr':0, 'knr':0, 'dtree':0, 'rfr':0, 'abr':0, 'mlpr':0, 'xgb':0}

models_list = [lr., lasso, ridge, svr, knr, dtree, rfr, abr, mlpr, xgb]

for model in models_list:
    print(f'{model} R2 score is: {machine_learning_r2(model,X_train,y_train,X_test,y_test)} \n')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) R2 score is: 0.9073363813262023 



  positive)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) R2 score is: 0.9083661849738621 

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001) R2 score is: 0.9074148430346514 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False) R2 score is: 0.561147561001033 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform') R2 score is: 0.9674525954137738 

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=Non



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False) R2 score is: 0.9809868574600056 

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=1000, random_state=None) R2 score is: 0.9353877215471815 



  if getattr(data, 'base', None) is not None and \


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False) R2 score is: 0.9468849164140898 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosi

now let's check the R2 score, if take the second approach

In [69]:
# now second approach approach
#creating the volume column

df_second_approach['volume'] = df[['x','y','z']].apply(lambda x: x[0]*x[1]*x[2], axis=1)

In [70]:
df_second_approach.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43,38.20203
1,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31,34.505856
2,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31,38.076885
3,0.29,3,1,3,62.4,58.0,334,4.2,4.23,2.63,46.72458
4,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75,51.91725


In [71]:
#dropping the x,y,z columns

df_second_approach.drop(['x','y','z'],axis=1,inplace=True)

In [72]:
df_second_approach.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
0,0.23,4,5,1,61.5,55.0,326,38.20203
1,0.21,3,5,2,59.8,61.0,326,34.505856
2,0.23,1,5,4,56.9,65.0,327,38.076885
3,0.29,3,1,3,62.4,58.0,334,46.72458
4,0.31,1,0,1,63.3,58.0,335,51.91725


In [73]:
X_2 = df_second_approach.drop('price',axis=1)
y_2 = df_second_approach['price']

from sklearn.model_selection import train_test_split
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.3, random_state=42)

In [74]:
ss2 = StandardScaler()
X_train_2 = ss2.fit_transform(X_train_2)
X_test_2 = ss2.transform(X_test_2)

In [75]:
# instanciating all the models that we are going to apply

lr_2 = LinearRegression()
lasso_2 = Lasso()
ridge_2 = Ridge()
svr_2 = SVR()
knr_2 = KNeighborsRegressor()
dtree_2 = DecisionTreeRegressor()
rfr_2 = RandomForestRegressor()
abr_2 = AdaBoostRegressor(n_estimators=1000)
mlpr_2 = MLPRegressor()
xgb_2 = xgboost.XGBRegressor()

# results_dict = {'lr':0, 'lasso':0, 'ridge':0, 'svr':0, 'knr':0, 'dtree':0, 'rfr':0, 'abr':0, 'mlpr':0, 'xgb':0}

models_list_2 = [lr_2, lasso_2, ridge_2, svr_2, knr_2, dtree_2, rfr_2, abr_2, mlpr_2, xgb_2]

for model in models_list:
    print(f'{model} R2 score is: {machine_learning_r2(model,X_train_2,y_train_2,X_test_2,y_test_2)} \n')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) R2 score is: 0.9087441517978085 

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) R2 score is: 0.9087437315956916 

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001) R2 score is: 0.9087449965338639 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False) R2 score is: 0.5205842028839572 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform') R2 score is: 0.9647246406891171 

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,

  if getattr(data, 'base', None) is not None and \


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False) R2 score is: 0.9594929182687618 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosi

it looks like the first approach had higher R2 scores when comparing to the second approach.

ok out of all the models, top five models based on R2 score are: RandomForestRegressor, XGBRegressor, KNeighborsRegressor, DecisionTreeRegressor and MLPRegressor.
let's see if we can improve the R2 score by performing hyperparameter tuning using GridSearchCV

In [82]:
from sklearn.model_selection import GridSearchCV

In [120]:
# Random Forest GridSearch

params_dict = {'n_estimators':[5,10,20,30,50,80,100], 'n_jobs':[-1],'max_features':['auto','sqrt','log2']}
rfr_GridSearch = GridSearchCV(estimator=RandomForestRegressor(), param_grid=params_dict,scoring='r2')
rfr_GridSearch.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_features': ['auto', '

In [121]:
rfr_GridSearch.best_params_

{'max_features': 'auto', 'n_estimators': 100, 'n_jobs': -1}

In [124]:
rfr_GridSearch.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [127]:
rfr_Grid_Best = RandomForestRegressor(max_features='auto',n_estimators=100,n_jobs=-1)

In [128]:
machine_learning_r2(rfr_Grid_Best,X_train,y_train,X_test,y_test)

0.9848396822883224