In [1]:
import os
import numpy as np
import pandas as pd

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the diamond listings CSV (path is relative to the notebook's working directory).
path = './data/diamonds_20200620.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14705 entries, 0 to 14704
Data columns (total 30 columns):
carat                    14705 non-null float64
clarity                  14705 non-null object
color                    14705 non-null object
culet                    14705 non-null object
cut                      14705 non-null object
date                     14705 non-null object
dateSet                  14705 non-null object
depth                    14705 non-null float64
detailsPageUrl           14705 non-null object
fluorescence             14705 non-null object
hasVisualization         14705 non-null bool
id                       14705 non-null object
imageUrl                 0 non-null float64
lxwRatio                 14705 non-null float64
measurements             14705 non-null object
polish                   14705 non-null object
price                    14705 non-null int64
pricePerCarat            14705 non-null int64
quickShip                14705 non-null bool
sell

### Columns used for modeling
* Response variable: price
* Numerical: carat, depth, lxwRatio, fluorescence, measurements (need transform), table
* Categorical: clarity, color, culet, cut, polish, symmetry
* Might be helpful: date, sellingIndex

### Columns used for visualization
detailsPageUrl, hasVisualization, quickShip, id, skus, sold, v360BaseUrl, visualizationImageUrl, willArriveForHoliday

### Columns not used
dateSet, imageUrl, shapeCode (only RD used), shapeName

In [4]:
# peek at the most expensive diamond(s) in the data set (ties are all shown)
df.loc[df['price'].eq(df['price'].max())]

Unnamed: 0,carat,clarity,color,culet,cut,date,dateSet,depth,detailsPageUrl,fluorescence,...,shapeCode,shapeName,skus,sold,strikethroughPrice,symmetry,table,v360BaseUrl,visualizationImageUrl,willArriveForHoliday
14703,2.01,VVS2,F,Very Small,Ideal,Jun 15,Jun 16,58.5,./diamond-details/LD12192160,,...,RD,Round,LD12192160,False,,Excellent,60.0,https://bluenile.v360.in/17/imaged/gia-2195886...,https://bnsec.bluenile.com/bnsecure/diamondvis...,False
14704,2.01,IF,G,,Ideal,Jun 15,Jun 16,62.0,./diamond-details/LD13818467,,...,RD,Round,LD13818467,False,,Excellent,57.0,https://bluenile.v360.in/17/imaged/gia-7341590...,https://bnsec.bluenile.com/bnsecure/diamondvis...,False


In [5]:
# filter data based on price in range
#lower_limit, upper_limit = 1e+4, 3e+4
#df = df.loc[(df['price'] >= lower_limit) & (df['price'] <= upper_limit), :]

In [6]:
# Inspect the raw format of the measurements strings before parsing them.
df['measurements'].head(5)

0    7.45 x 7.40 x 4.44 mm
1    7.32 x 7.26 x 4.57 mm
2    6.52 x 6.49 x 4.02 mm
3    7.07 x 7.03 x 4.38 mm
4    7.06 x 7.12 x 4.44 mm
Name: measurements, dtype: object

In [7]:
# parse out measurement columns
measurements = df['measurements'].str.replace(' mm', '').str.split(' x ', expand=True)
measurements = measurements.apply(pd.to_numeric)
measurements.columns = ['measurements_length (mm)', 'measurements_width (mm)', 'measurements_height (mm)']
df = df.join(measurements)

In [8]:
# Add natural-log versions of price and carat as '<name>_ln' columns.
for source_col in ('price', 'carat'):
    df[source_col + '_ln'] = np.log(df[source_col])

In [9]:
# Numeric and categorical columns kept for the rest of the analysis.
cols_num = ['price', 'carat', 'depth', 'lxwRatio', 'table', 'sellingIndex', 'measurements_length (mm)', 'measurements_width (mm)', 'measurements_height (mm)', 'price_ln', 'carat_ln']
cols_cat = ['clarity', 'color', 'culet', 'cut', 'fluorescence', 'polish', 'symmetry']

# Take an explicit copy: later cells add columns to df, and assigning into a
# column-subset selection can raise SettingWithCopyWarning.
df = df[cols_num + cols_cat].copy()

In [10]:
# Summary statistics of the numeric columns: mean and median are close across variables.
# NOTE(review): the minimum of 'measurements_height (mm)' (1.51) is far below its
# 25th percentile (4.57), so that column may still deserve an outlier check.
df.describe()

Unnamed: 0,price,carat,depth,lxwRatio,table,sellingIndex,measurements_length (mm),measurements_width (mm),measurements_height (mm),price_ln,carat_ln
count,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0,14705.0
mean,17032.758177,1.831152,62.056307,1.007108,57.966066,0.260637,7.769472,7.769338,4.820979,9.711732,0.581426
std,4316.882473,0.399186,1.195928,0.004762,1.75522,0.179514,0.553215,0.555594,0.357458,0.249025,0.217868
min,10001.0,0.9,55.3,1.0,52.0,0.00087,6.06,6.11,1.51,9.21044,-0.105361
25%,14074.0,1.52,61.6,1.0,57.0,0.115079,7.37,7.37,4.57,9.552084,0.41871
50%,16338.0,1.82,62.3,1.01,58.0,0.217766,7.8,7.8,4.85,9.701249,0.598837
75%,19655.0,2.01,62.7,1.01,59.0,0.375305,8.09,8.1,5.04,9.886087,0.698135
max,29994.0,4.03,67.9,1.03,68.0,0.918382,10.19,10.19,6.39,10.308753,1.393766


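Mean and median are close across variables, so there are no signs of extreme outliers in price for this price range ($10k ~ $30k). One value worth a closer look is the minimum of `measurements_height (mm)` (1.51), which is far below its 25th percentile (4.57).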

In [11]:
# Pairwise Pearson correlations of the numeric columns. Selecting cols_num
# explicitly keeps this working on pandas versions where corr() no longer
# silently skips the object-typed (categorical) columns still present in df.
df[cols_num].corr()

Unnamed: 0,price,carat,depth,lxwRatio,table,sellingIndex,measurements_length (mm),measurements_width (mm),measurements_height (mm),price_ln,carat_ln
price,1.0,0.561932,-0.032942,-0.027882,0.023863,-0.140817,0.57829,0.574658,0.54572,0.988662,0.56255
carat,0.561932,1.0,0.120878,0.037067,0.076483,-0.082008,0.981486,0.982872,0.977004,0.570919,0.986561
depth,-0.032942,0.120878,1.0,0.247921,-0.542552,0.021396,-0.019229,-0.00971,0.253331,-0.032278,0.127632
lxwRatio,-0.027882,0.037067,0.247921,1.0,-0.035417,0.035493,0.000234,0.004438,0.069746,-0.024632,0.046031
table,0.023863,0.076483,-0.542552,-0.035417,1.0,0.058221,0.1421,0.139653,-0.010212,0.024228,0.082716
sellingIndex,-0.140817,-0.082008,0.021396,0.035493,0.058221,1.0,-0.077871,-0.079098,-0.069592,-0.14292,-0.068574
measurements_length (mm),0.57829,0.981486,-0.019229,0.000234,0.1421,-0.077871,1.0,0.995669,0.954046,0.592682,0.985138
measurements_width (mm),0.574658,0.982872,-0.00971,0.004438,0.139653,-0.079098,0.995669,1.0,0.956528,0.589251,0.986829
measurements_height (mm),0.54572,0.977004,0.253331,0.069746,-0.010212,-0.069592,0.954046,0.956528,1.0,0.559621,0.981815
price_ln,0.988662,0.570919,-0.032278,-0.024632,0.024228,-0.14292,0.592682,0.589251,0.559621,1.0,0.579936


Unsurprisingly, Carat and the measurement metrics (length, width, height) have the strongest correlations with Price among the raw features (roughly 0.55–0.58). The measurement metrics correlate almost perfectly with Carat (> 0.97), which makes sense as they all measure the diamond's size. We don't know how Selling Index is computed, but it seems weakly negatively correlated with Price (the higher the price, the harder to sell?).

In [None]:
# Diamond fluorescence itself is a debated topic, see https://www.leibish.com/diamond-fluorescence-article-245
# [TODO] Explore how different pairs of color + fluorescence may result in different prices
# For now we simplify each label down to its UV light intensity level
print(df['fluorescence'].unique())
def map_fluorescence(x):
    """Reduce a raw fluorescence label to its intensity level.

    Parameters
    ----------
    x : str, None or float NaN
        Raw fluorescence label. The exact string 'None', or a missing value,
        means no fluorescence.

    Returns
    -------
    str
        One of 'None', 'Faint', 'Medium', 'Strong', 'Very Strong'.

    Raises
    ------
    ValueError
        If ``x`` is not a string/missing value or contains none of the known
        intensity levels.
    """
    # Newer pandas versions parse the literal text "None" as NaN when reading a
    # CSV, so a missing value is treated the same as the 'None' label instead of
    # crashing on the substring checks below.
    if x is None or (isinstance(x, float) and x != x):
        return 'None'
    if not isinstance(x, str):
        raise ValueError(f'Unexpected value: {x!r}')
    if x == 'None':
        return 'None'
    # 'Very Strong' must be tested before 'Strong', which is a substring of it.
    for level in ('Faint', 'Medium', 'Very Strong', 'Strong'):
        if level in x:
            return level
    raise ValueError(f'Unexpected value: {x!r}')
# Store the reduced intensity level of every row in a new column and print the
# distinct levels that actually occur.
df['fluorescence_reduced'] = df['fluorescence'].map(map_fluorescence)
print(df['fluorescence_reduced'].unique())

In [None]:
# ranking: the bigger the better (each list runs from worst grade to best grade)
# NOTE: another cell in this notebook assigns `ranking` again with the opposite
# ordering; whichever cell ran last determines the value.
ranking = dict(
    clarity=['I2', 'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF', 'FL'],
    color=list('KJIHGFED'),
    # https://www.bluenile.com/education/diamonds/cut
    cut=['Good', 'Very Good', 'Ideal', 'Astor Ideal'],
    culet=['Medium', 'Small', 'Very Small', 'Pointed', 'None'],
    polish=['Good', 'Very Good', 'Excellent'],
    symmetry=['Good', 'Very Good', 'Excellent'],
    fluorescence_reduced=['Very Strong', 'Strong', 'Medium', 'Faint', 'None'],
)

In [None]:
# Convert the boolean hasVisualization flag to 0/1. The column is only present
# when df has not yet been narrowed to cols_num + cols_cat, so guard the cast
# to keep the notebook runnable top to bottom instead of raising KeyError.
if 'hasVisualization' in df.columns:
    df['hasVisualization'] = df['hasVisualization'].astype(int)

In [12]:
# Carat vs. price scatter, colored by clarity, with a box plot of carat and a
# histogram of price on the margins.
fig = px.scatter(
    data_frame=df,
    x='carat',
    y='price',
    color='clarity',
    opacity=0.5,
    marginal_x='box',
    marginal_y='histogram',
    title='Carat vs Price vs Clarity',
    template='plotly_dark',
    width=800,
    height=500,
)
fig.update_layout(margin={'l': 20, 'r': 20, 't': 20, 'b': 20})
fig.show()

In [13]:
# Histogram plots: check whether price and carat are skewed
fig2 = make_subplots(rows=1, cols=2)

cols = ['price', 'carat']
for idx, column in enumerate(cols):
    # Fill the subplot grid row by row, two panels per row (grid positions are 1-based).
    row_offset, col_offset = divmod(idx, 2)
    fig2.add_trace(
        go.Histogram(x=df[column], name=column.capitalize()),
        row=row_offset + 1,
        col=col_offset + 1,
    )
fig2.update_layout(
    title_text="Price and Carat Histograms",
    width=800,
    height=400,
    margin={'l': 20, 'r': 20, 't': 20, 'b': 20},
)
fig2.show()

In [15]:
@interact
def scatter_plot(color=cols_cat, symbol=cols_cat):
    """Interactive carat-vs-price scatter plot.

    ``interact`` turns each list-valued default into a dropdown, so both
    arguments arrive as a single column name chosen from ``cols_cat``.

    Parameters
    ----------
    color : str
        Column of ``df`` mapped to the marker hue.
    symbol : str
        Column of ``df`` mapped to the marker style.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(14, 10)
    # Order the hue levels by the grade ranking when one is defined for the
    # chosen column; columns without an entry in `ranking` (e.g. 'fluorescence')
    # get None, which lets seaborn pick its default order.
    sns.scatterplot(x='carat', y='price', palette='Blues', hue_order=ranking.get(color),
                    hue=color, style=symbol, data=df, ax=ax)

interactive(children=(Dropdown(description='color', options=('clarity', 'color', 'culet', 'cut', 'fluorescence…

In [16]:
@interact
def box_plot(y=cols_cat):
    """Interactive horizontal box plot of price grouped by a categorical column.

    ``interact`` renders ``y`` as a dropdown over ``cols_cat``; the selected
    column name is used for the grouping axis.
    """
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.boxplot(data=df, x='price', y=y, ax=ax)

interactive(children=(Dropdown(description='y', options=('clarity', 'color', 'culet', 'cut', 'fluorescence', '…

In [17]:
# ranking: the smaller the better (each list runs from best grade to worst grade)
# NOTE: this reassigns `ranking`, which an earlier cell defines with the
# opposite ordering; cells that run after this one see this version.
ranking = dict(
    clarity=['FL', 'IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1', 'I2'],
    color=list('DEFGHIJK'),
    # https://www.bluenile.com/education/diamonds/cut
    cut=['Astor Ideal', 'Ideal', 'Very Good', 'Good'],
    culet=['None', 'Pointed', 'Very Small', 'Small', 'Medium'],
    polish=['Excellent', 'Very Good', 'Good'],
    symmetry=['Excellent', 'Very Good', 'Good'],
    fluorescence_reduced=['None', 'Faint', 'Medium', 'Strong', 'Very Strong'],
)