# Introducing the diamond prices dataset

**Data Set Information:**
A dataset containing the prices and other features of almost 54,000 diamonds.

**Number of Attributes: 10**

Feature Information: A data frame with 53,940 rows and 10 variables:

price: price in US dollars

carat: weight of the diamond

cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color: diamond colour, from J (worst) to D (best)

clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x: length in mm

y: width in mm

z: depth in mm

depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y)

table: width of top of diamond relative to widest point


In [1]:
import numpy as np
import pandas as pd
import os

## Loading the data

In [2]:
# Directory and file name are kept separate so either can be changed on its own;
# the path is relative to the notebook's working directory.
DATA_DIR = '../data'
FILE_NAME = 'diamonds.csv'
# os.path.join inserts the platform's path separator between the two parts.
data_path = os.path.join(DATA_DIR, FILE_NAME)

In [3]:
# Read the CSV into a DataFrame, then display (rows, columns) as a quick check
# against the documented size of the dataset.
diamonds = pd.read_csv(data_path)
diamonds.shape

(53940, 10)

In [4]:
# Peek at the first five rows to see the column names and typical values.
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Checking summary statistics for numerical columns

In [5]:
# Summary statistics of the numeric columns; the min and max rows are the
# quickest place to spot implausible values.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


## Checking the rows where x is equal to zero

In [6]:
# List every diamond whose recorded length (x) is exactly zero.
diamonds.query('x == 0')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,0.0,6.62,0.0
11963,1.0,Very Good,H,VS2,63.3,53.0,5139,0.0,0.0,0.0
15951,1.14,Fair,G,VS1,57.5,67.0,6381,0.0,0.0,0.0
24520,1.56,Ideal,G,VS2,62.2,54.0,12800,0.0,0.0,0.0
26243,1.2,Premium,D,VVS1,62.1,59.0,15686,0.0,0.0,0.0
27429,2.25,Premium,H,SI2,62.8,59.0,18034,0.0,0.0,0.0
49556,0.71,Good,F,SI2,64.1,60.0,2130,0.0,0.0,0.0
49557,0.71,Good,F,SI2,64.1,60.0,2130,0.0,0.0,0.0


In [7]:
# Keep a row as long as at least one of x / y is positive: rows where both are
# zero are dropped, while a row missing only one of them is kept so that the
# gap can be filled in afterwards.
# .copy() makes the filtered result an independent frame, so the in-place
# assignment performed on it later cannot be treated as a write to a slice of
# the original DataFrame.
diamonds = diamonds.loc[(diamonds['x'] > 0) | (diamonds['y'] > 0)].copy()

In [8]:
# Inspect the row labelled 11182: its x is 0 but its y is populated, so it
# passed the x/y filter applied in the previous cell.
diamonds.loc[11182]

carat       1.07
cut        Ideal
color          F
clarity      SI2
depth       61.6
table         56
price       4954
x              0
y           6.62
z              0
Name: 11182, dtype: object

In [9]:
# Replace any remaining zero x with the median of the x column. Selecting by
# condition instead of by a hard-coded row label keeps the cell correct if the
# row labels change, and makes a re-run a no-op once no zeros are left.
# The right-hand side is evaluated first, so the median is taken over the
# column as it stands before the assignment.
diamonds.loc[diamonds['x'] == 0, 'x'] = diamonds['x'].median()

In [10]:
# Confirm that no row with x == 0 is left: the first element of the shape
# should be 0.
diamonds.query('x == 0').shape

(0, 10)

In [11]:
# Same check for the width column: list rows whose y is exactly zero.
diamonds.query('y == 0')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z


In [12]:
# And for the depth column: list rows whose z is exactly zero.
diamonds.query('z == 0')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2207,1.0,Premium,G,SI2,59.1,59.0,3142,6.55,6.48,0.0
2314,1.01,Premium,H,I1,58.1,59.0,3167,6.66,6.6,0.0
4791,1.1,Premium,G,SI2,63.0,59.0,3696,6.5,6.47,0.0
5471,1.01,Premium,F,SI2,59.2,58.0,3837,6.5,6.47,0.0
10167,1.5,Good,G,I1,64.0,61.0,4731,7.15,7.04,0.0
11182,1.07,Ideal,F,SI2,61.6,56.0,4954,5.7,6.62,0.0
13601,1.15,Ideal,G,VS2,59.2,56.0,5564,6.88,6.83,0.0
24394,2.18,Premium,H,SI2,59.4,61.0,12631,8.49,8.45,0.0
26123,2.25,Premium,I,SI1,61.3,58.0,15397,8.52,8.42,0.0
27112,2.2,Premium,H,SI1,61.2,59.0,17265,8.42,8.37,0.0


In [13]:
# Re-check the summary statistics after handling the zero x / y values;
# compare the min and max rows with the earlier summary.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53933.0,53933.0,53933.0,53933.0,53933.0,53933.0,53933.0
mean,0.797884,61.749333,57.457002,3932.155026,5.732007,5.73527,3.539193
std,0.473983,1.432501,2.234052,3988.700283,1.11966,1.140339,0.704592
min,0.2,43.0,43.0,326.0,3.73,3.68,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [14]:
# List the rows whose width (y) or depth (z) exceeds 30 mm.
diamonds.query('y > 30 or z > 30')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
24067,2.0,Premium,H,SI2,58.9,57.0,12210,8.09,58.9,8.06
48410,0.51,Very Good,E,VS1,61.8,54.7,1970,5.12,5.15,31.8
49189,0.51,Ideal,E,VS1,61.8,55.0,2075,5.15,31.8,5.12


In [15]:
# Keep only the rows where neither y nor z is above 30 mm, i.e. remove the
# rows listed by the previous cell.
diamonds = diamonds[~(diamonds['y'] > 30) & ~(diamonds['z'] > 30)]

In [17]:
# Summary statistics once more, after filtering out the rows with y or z
# above 30; the max row for y and z is the one to compare.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53930.0,53930.0,53930.0,53930.0,53930.0,53930.0,53930.0
mean,0.797873,61.749384,57.457107,3932.072353,5.731985,5.733812,3.538556
std,0.473965,1.432488,2.234057,3988.634982,1.119639,1.111498,0.693716
min,0.2,43.0,43.0,326.0,3.73,3.68,0.0
25%,0.4,61.0,56.0,949.25,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,10.54,6.98


### One-hot encoding with pandas

In [27]:
# One indicator column per distinct value of `cut`, each named with the
# 'cut_' prefix; only the first rows are displayed.
diamonds['cut'].pipe(pd.get_dummies, prefix='cut').head()

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,1,0,0,0


In [28]:
# Same encoding, but drop_first=True omits the indicator for the first level,
# leaving one column fewer than the number of distinct `cut` values.
diamonds['cut'].pipe(pd.get_dummies, prefix='cut', drop_first=True).head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good
0,0,1,0,0
1,0,0,1,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0


## Adding the one-hot encoded categorical features

In [None]:
# One-hot encode all three categorical features in a single pass instead of
# repeating the same concat line per column. The original categorical columns
# are kept; the indicator columns are appended in the order cut, color, clarity,
# each without its first level (drop_first=True).
categorical_features = ['cut', 'color', 'clarity']
dummies = pd.get_dummies(
    diamonds[categorical_features],
    columns=categorical_features,
    prefix=categorical_features,
    drop_first=True,
)
# Drop indicator columns left over from an earlier run of this cell before
# appending, so that re-running the cell does not duplicate them.
diamonds = pd.concat(
    [diamonds.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)