<a href="https://colab.research.google.com/github/Pedro-A-D-S/diamonds-price/blob/main/Diamonds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction

This project performs an exploratory data analysis (EDA) of diamond prices and fits a predictive model for the price.

## EDA

## Imports

In [None]:
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import warnings

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# SetUp

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Default color palette and background style for the seaborn plots.
sns.set_palette("winter")
sns.set_style("darkgrid")

# Functions

In [None]:
def plot_quantity(data, col):
  """Draw a count plot of `col`, bars ordered from most to least frequent.

  Args:
    data: DataFrame holding the column to count.
    col: Name of the categorical column.

  Returns:
    The matplotlib Axes the bars were drawn on.
  """
  # Size only this figure. The previous sns.set(rc=...) call returned None
  # and reset the global seaborn theme (palette/style) on every call.
  _, ax = plt.subplots(figsize = (10, 6))
  order = data[col].value_counts().index
  sns.countplot(x = col, data = data, order = order, ax = ax)
  ax.set_title(f'Quantity of diamonds by {col}', fontsize = 16)
  ax.set_xlabel(f'{col}', fontsize = 14)
  ax.set_ylabel('Quantity', fontsize = 14)
  return ax

In [None]:
def mean_price_by_cathegory(data, col):
  """Return the mean price of each category in `col`, highest first.

  Args:
    data: DataFrame with a numeric 'price' column and the column `col`.
    col: Name of the column to group by.

  Returns:
    DataFrame with columns [col, 'mean price'], rounded to 2 decimals and
    sorted by mean price in descending order.
  """
  # Select 'price' before aggregating: averaging the whole frame raises a
  # TypeError on pandas >= 2.0 when other columns hold text.
  mean_by_group = data.groupby(col)['price'].mean().round(2)
  return mean_by_group.sort_values(ascending = False).reset_index(name = 'mean price')

In [None]:
def median_price_by_cathegory(data, col):
  """Return the median price of each category in `col`, highest first.

  Args:
    data: DataFrame with a numeric 'price' column and the column `col`.
    col: Name of the column to group by.

  Returns:
    DataFrame with columns [col, 'median price'], rounded to 2 decimals and
    sorted by median price in descending order.
  """
  # Select 'price' before aggregating: taking the median of the whole frame
  # raises a TypeError on pandas >= 2.0 when other columns hold text.
  median_by_group = data.groupby(col)['price'].median().round(2)
  return median_by_group.sort_values(ascending = False).reset_index(name = 'median price')

In [None]:
def std_price_by_cathegory(data, col):
  """Return the standard deviation of price per category in `col`, highest first.

  Args:
    data: DataFrame with a numeric 'price' column and the column `col`.
    col: Name of the column to group by.

  Returns:
    DataFrame with columns [col, 'std of price'], rounded to 2 decimals and
    sorted by standard deviation in descending order.
  """
  # Select 'price' before aggregating: computing std over the whole frame
  # raises a TypeError on pandas >= 2.0 when other columns hold text.
  std_by_group = data.groupby(col)['price'].std().round(2)
  return std_by_group.sort_values(ascending = False).reset_index(name = 'std of price')

# Get Data

In [None]:
# Load the raw diamonds table; the path is relative to the working directory.
# NOTE(review): this local path will not exist when opened through the Colab badge — verify.
diamond = pd.read_csv('../data/01_raw/diamonds.csv')

In [None]:

# Preview the first rows of the table.
diamond.head()

# Description of dataset features

Carat : Diamond weight in carats (1 carat = 0.2 grams)

Cut : Describes the quality of the diamond's cut. Quality increases in order: Fair, Good, Very Good, Premium, Ideal.

Color : Color of the diamond, with D being the best and J being the worst.

Clarity: Diamond clarity refers to the lack of flaws and imperfections.

Depth : The total depth percentage of the diamond (see the detailed definition below).

Depth: The height of the diamond, measured from the culet (a culet is a flat face on the bottom of a gemstone) to the table, divided by the average girdle diameter. In circular diamonds, there is not one that is perfectly round. Therefore, each diameter is not the same and therefore the average value is taken.

Table : The width of the flat top facet of the diamond (the table), expressed as a percentage of its average diameter.

Price : Diamond price in dollars.

X : Diamond length in millimeters.

Y : Diamond width in millimeters.

Z : Diamond depth (height) in millimeters.

## Dataset description

In [None]:
# Summary statistics of the table, rounded to 2 decimals.
description = diamond.describe().round(2)
description

The summary gives several relevant pieces of information about the data; for example, the average price (mean) of a diamond is almost 4 thousand dollars.

However, when comparing the minimum and maximum prices, they have a wide range of values, as indicated by the high standard deviation (std).

Another aspect to take into account is the existence of rows where x, y or z is equal to 0; these are invalid, since no diamond can have a zero dimension.

## Removing outliers

In [None]:
# Count the rows that are exact duplicates of an earlier row.
diamond.duplicated().sum()

In [None]:
# Drop the duplicated rows, keeping the first occurrence of each.
diamond = diamond.drop_duplicates()

In [None]:
# Keep only the rows whose x, y and z are all non-zero.
diamond = diamond[(diamond[['x','y','z']] != 0).all(axis = 1)]

In [None]:
# Summary statistics again, after dropping duplicates and zero-sized rows.
diamond.describe().round(2)

Now the dataset no longer has zeros in its dimensions.

Removing outliers of numerical data

In [None]:
# series to filter on: the 'x' dimension
x = diamond['x']

# first and third quartiles and the interquartile range
# (Q1 has to be the 25th percentile for the 1.5 * IQR rule to hold)
Q1 = x.quantile(.25)
Q3 = x.quantile(.75)
IIQ = Q3 - Q1

# lower and upper fences: 1.5 * IQR below Q1 and above Q3
lower_limit = Q1 - 1.5 * IIQ
upper_limit = Q3 + 1.5 * IIQ

# keep only the rows inside the fences (both ends inclusive)
selection = (x >= lower_limit) & (x <= upper_limit)
diamond = diamond[selection]

In [None]:
# series to filter on: the 'y' dimension
x = diamond['y']

# first and third quartiles and the interquartile range
# (Q1 has to be the 25th percentile for the 1.5 * IQR rule to hold)
Q1 = x.quantile(.25)
Q3 = x.quantile(.75)
IIQ = Q3 - Q1

# lower and upper fences: 1.5 * IQR below Q1 and above Q3
lower_limit = Q1 - 1.5 * IIQ
upper_limit = Q3 + 1.5 * IIQ

# keep only the rows inside the fences (both ends inclusive)
selection = (x >= lower_limit) & (x <= upper_limit)
diamond = diamond[selection]

In [None]:
# series to filter on: the 'z' dimension
x = diamond['z']

# first and third quartiles and the interquartile range
# (Q1 has to be the 25th percentile for the 1.5 * IQR rule to hold)
Q1 = x.quantile(.25)
Q3 = x.quantile(.75)
IIQ = Q3 - Q1

# lower and upper fences: 1.5 * IQR below Q1 and above Q3
lower_limit = Q1 - 1.5 * IIQ
upper_limit = Q3 + 1.5 * IIQ

# keep only the rows inside the fences (both ends inclusive)
selection = (x >= lower_limit) & (x <= upper_limit)
diamond = diamond[selection]

In [None]:
# series to filter on: the 'table' column
x = diamond['table']

# first and third quartiles and the interquartile range
# (Q1 has to be the 25th percentile for the 1.5 * IQR rule to hold)
Q1 = x.quantile(.25)
Q3 = x.quantile(.75)
IIQ = Q3 - Q1

# lower and upper fences: 1.5 * IQR below Q1 and above Q3
lower_limit = Q1 - 1.5 * IIQ
upper_limit = Q3 + 1.5 * IIQ

# keep only the rows inside the fences (both ends inclusive)
selection = (x >= lower_limit) & (x <= upper_limit)
diamond = diamond[selection]

In [None]:
# series to filter on: the 'depth' column
x = diamond['depth']

# first and third quartiles and the interquartile range
# (Q1 has to be the 25th percentile for the 1.5 * IQR rule to hold)
Q1 = x.quantile(.25)
Q3 = x.quantile(.75)
IIQ = Q3 - Q1

# lower and upper fences: 1.5 * IQR below Q1 and above Q3
lower_limit = Q1 - 1.5 * IIQ
upper_limit = Q3 + 1.5 * IIQ

# keep only the rows inside the fences (both ends inclusive)
selection = (x >= lower_limit) & (x <= upper_limit)
diamond = diamond[selection]

## Getting information

In [None]:
# Count the missing values in each column.
diamond.isnull().sum()

The dataset has no null data.

## Extracting descriptive price statistics by category

# Categories by price

In [None]:
# Bar chart of how many diamonds fall in each color grade.
fig = px.histogram(
    data_frame = diamond,
    x = 'color',
    text_auto = '.3s',
    template = 'plotly_dark',
    range_y = [0, 11000],
)

# Most frequent color first; the y tick labels are hidden.
fig.update_xaxes(categoryorder = 'total descending')
fig.update_yaxes(showticklabels = False)

# Red bars with their values written above them.
fig.update_traces(textposition = 'outside', textfont_size = 16, marker_color = 'Red')

fig.update_layout(
    title = dict(text = "Quantity of Diamonds by Color",
                 y = 0.97, x = 0.5,
                 xanchor = 'center', yanchor = 'top'),
    xaxis_title = 'Colors',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family = "Arial",
    font_color = "White",
    font = dict(size = 18),
    title_font_family = "Arial",
    title_font_color = "White",
)

fig.show()

# Clarity

In [None]:
# Bar chart of how many diamonds fall in each clarity grade.
fig = px.histogram(
    data_frame = diamond,
    x = 'clarity',
    template = 'plotly_dark',
    text_auto = '.3s',
    range_y = [0, 10000],
)

# Most frequent clarity grade first.
fig.update_xaxes(categoryorder = 'total descending')

# Red bars with their values written above them.
fig.update_traces(textposition = 'outside', textfont_size = 16, marker_color = 'Red')

fig.update_layout(
    title = dict(text = "Diamonds by clarity",
                 y = 0.97, x = 0.5,
                 xanchor = 'center', yanchor = 'top'),
    xaxis_title = 'Clarity',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family = "Arial",
    font_color = "White",
    font = dict(size = 18),
    title_font_family = "Arial",
    title_font_color = "White",
)

fig.show()

The number of diamonds differs considerably between clarity grades; the bars are ordered from the most common grade to the least common one.

In [None]:
# Diamond counts per clarity grade, with one bar per color inside each grade.
fig = px.histogram(diamond, x='clarity', color="color", barmode='group',
                   template = 'plotly_dark',
                   text_auto = '.3s')

# Clarity grades ordered from the most to the least frequent.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("destribuition")
        'text': "Diamonds by clarity and color distribution",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Clarity',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Values written above the bars.
fig.update_traces(textposition = 'outside',textfont_size = 16)

fig.show()

There is a predominance of E and G colors in the clarity grades with the largest numbers of diamonds in the dataset; this predominance fades as the counts decrease.

In [None]:
# Diamond counts per clarity grade, with one bar per cut inside each grade.
fig = px.histogram(diamond, x='clarity', color="cut", barmode='group',
                   template = 'plotly_dark',
                   text_auto = '.3s', range_y = [0, 5000])

# Clarity grades ordered from the most to the least frequent.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("destribuition")
        'text': "Diamonds by clarity and cut distribution",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Clarity',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Values written above the bars.
fig.update_traces(textposition = 'outside',textfont_size = 16)

fig.show()

In [None]:
# Average price per clarity grade, most expensive grade first.
mean_price = (
    diamond.groupby(by = 'clarity')
    .agg({'price': 'mean'})
    .round(2)
    .reset_index()
    .sort_values(by = 'price', ascending = False)
)

# Each grade's share of the sum of the group means, and the running total of those shares.
mean_price['percent'] = (mean_price['price'].div(mean_price['price'].sum()) * 100).round(2)
mean_price['acc_sum'] = mean_price['percent'].cumsum().round(2)

In [None]:
# Bar chart of the mean price per clarity grade (from the mean_price table).
fig = px.bar(data_frame = mean_price, x = 'clarity', y = 'price', text_auto = '.3s',
             template = 'plotly_dark', range_y = [0, 5000])

# Grades ordered from the highest to the lowest mean price.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        'text': 'Mean Price by Clarity',
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Clarity',
    # the bars show prices, not counts
    yaxis_title = 'Mean price',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Red bars with their values written above them.
fig.update_traces(textposition = 'outside',textfont_size = 16, marker_color = 'Red')

fig.show()

As we can see, the mean price decreases with the decrease of quality in clarity.

# Cut

In [None]:
# Bar chart of how many diamonds fall in each cut category.
fig = px.histogram(
    data_frame = diamond,
    x = 'cut',
    template = 'plotly_dark',
    text_auto = '.3s',
    range_y = [0, 22000],
)

# Most frequent cut first.
fig.update_xaxes(categoryorder = 'total descending')

# Red bars with their values written above them.
fig.update_traces(textposition = 'outside', textfont_size = 16, marker_color = 'Red')

fig.update_layout(
    title = dict(text = "Diamonds by cut",
                 y = 0.97, x = 0.5,
                 xanchor = 'center', yanchor = 'top'),
    xaxis_title = 'Cut',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family = "Arial",
    font_color = "White",
    font = dict(size = 18),
    title_font_family = "Arial",
    title_font_color = "White",
)

fig.show()

The number of diamonds decreases as the quality of the cut decreases.

In [None]:
# Diamond counts per cut, with one bar per color inside each cut.
fig = px.histogram(diamond, x='cut', color="color", barmode='group',
                   text_auto = '.2s', template = 'plotly_dark')

# Cuts ordered from the most to the least frequent.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        # title and x label now describe what is plotted (cut split by color);
        # they were copied from the clarity chart
        'text': "Diamonds by cut and color distribution",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Cut',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Values written above the bars.
fig.update_traces(textposition = 'outside',textfont_size = 16)

fig.show()

This is how the colors are distributed in relation to the types of cuts. There is a predominance of G and E colors in the first cuts and this behavior is accentuated along the distributions.

In [None]:
# Per-cut price statistics, each rounded to 2 decimals and sorted from the
# highest to the lowest value. They are computed in this cell because the
# traces below need them and no earlier cell defines these frames.
price_by_cut = diamond.groupby('cut')['price']
diamond_mean_price_by_cut = (price_by_cut.mean().round(2)
                             .sort_values(ascending = False)
                             .reset_index(name = 'mean price'))
diamond_median_price_by_cut = (price_by_cut.median().round(2)
                               .sort_values(ascending = False)
                               .reset_index(name = 'median price'))
diamond_std_price_by_cut = (price_by_cut.std().round(2)
                            .sort_values(ascending = False)
                            .reset_index(name = 'std of price'))

# 2x2 grid: mean and median share the first row, std spans the second row.
fig = make_subplots(rows = 2,
                    cols = 2,
                    specs=[[{}, {}],
                    [{"colspan": 2}, None]],
                    subplot_titles = [
                        'Mean Price',  # 1. subplot title
                        'Median Price',# 2. subplot title
                        'Std Price'    # 3. subplot title
])

# add the 1st graph by specifying which row and column it will come to
fig.add_trace(go.Bar(x=diamond_mean_price_by_cut['cut'], 
                     y= diamond_mean_price_by_cut['mean price'], textposition='auto', name='Mean'), row = 1, col = 1)
# add the 2nd graph
fig.add_trace(go.Bar(x=diamond_median_price_by_cut['cut'], 
                     y= diamond_median_price_by_cut['median price'], textposition='auto', name='Median'), row = 1, col = 2)
# add the 3rd graph
fig.add_trace(go.Bar(x = diamond_std_price_by_cut['cut'], 
                     y = diamond_std_price_by_cut['std of price'], textposition='auto', name='Std'), row = 2, col = 1)

# Update xaxis properties
fig.update_xaxes(title_text="Cut", row=1, col=1)
fig.update_xaxes(title_text="Cut", row=1, col=2)
fig.update_xaxes(title_text="Cut", row=2, col=1)

# Update yaxis properties
fig.update_yaxes(title_text="Price (US$)", row=1, col=1)
fig.update_yaxes(title_text="Price (US$)", row=1, col=2)
fig.update_yaxes(title_text="Price (US$)", row=2, col=1)

fig.update_layout(title = 'Descriptive Statistics of Price (cut)', title_x = 0.5)

# Shown explicitly, like the other figures in the notebook.
fig.show()

As well as clarity, cut also intuitively has a linear relationship to price.

# Color

In [None]:
# Bar chart of how many diamonds fall in each color grade.
fig = px.histogram(
    data_frame = diamond,
    x = 'color',
    text_auto = '.3s',
    template = 'plotly_dark',
    range_y = [0, 10000],
)

# Most frequent color first; the y tick labels are hidden.
fig.update_xaxes(categoryorder = 'total descending')
fig.update_yaxes(showticklabels = False)

# Red bars with their values written above them.
fig.update_traces(textposition = 'outside', textfont_size = 16, marker_color = 'Red')

fig.update_layout(
    title = dict(text = "Quantity of Diamonds by Color",
                 y = 0.97, x = 0.5,
                 xanchor = 'center', yanchor = 'top'),
    xaxis_title = 'Colors',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family = "Arial",
    font_color = "White",
    font = dict(size = 18),
    title_font_family = "Arial",
    title_font_color = "White",
)

fig.show()

In [None]:
# Diamond counts per color, with one bar per cut inside each color.
fig = px.histogram(diamond, x='color', color="cut", barmode='group',
                   text_auto = True, template = 'plotly_dark')

# Colors ordered from the most to the least frequent.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("destribuition")
        'text': "Diamonds by color and cut distribution",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Colors',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Values written above the bars.
fig.update_traces(textposition = 'outside',textfont_size = 16)

fig.show()

In [None]:
# Diamond counts per color, with one bar per clarity grade inside each color.
fig = px.histogram(diamond, x='color', color="clarity", barmode='group',
                   text_auto = True, template = 'plotly_dark')

# Colors ordered from the most to the least frequent.
fig.update_xaxes(categoryorder = 'total descending')

fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("destribuition")
        'text': "Diamonds by color and clarity distribution",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Colors',
    yaxis_title = 'Quantity',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    font = dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White")

# Values written above the bars.
fig.update_traces(textposition = 'outside',textfont_size = 16)

fig.show()

As well as the two previous variables, color also shows a linear decrease with price.

# Correlations between variables

Considering that the descriptive statistics have linear relationships with the price, we will check the correlations between the variables and test the hypothesis raised by the study of the graphs.

In [None]:
# Pairwise correlations between the numeric columns only, rounded to 2 decimals.
# Text columns (cut, color, clarity) make DataFrame.corr() raise on pandas >= 2.0,
# so they are excluded explicitly.
correlation = diamond.select_dtypes(include = 'number').corr().round(2)

In [None]:
# Heatmap of the correlation matrix with the coefficient written in each cell.
fig = px.imshow(
    correlation,
    text_auto = True,
    template = 'plotly_dark',
)

# Centered title near the top of the figure.
fig.update_layout(
    title = dict(text = "Correlation Heatmap",
                 y = 0.97, x = 0.5,
                 xanchor = 'center', yanchor = 'top'),
)

fig.show()

In fact, it is possible to notice a strong correlation between the variables carat, x, y and z with the price as seen in the heatmap and by analyzing the regression lines of the variables in relation to the price.

# Frequency Distributions

Let us now study the distribution of diamonds present in the dataset with respect to the variables carat and depth.

In [None]:
# Frequency distribution of the carat weight.
fig = px.histogram(data_frame = diamond,
                   x = 'carat', template = 'plotly_dark')
fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("distribuition")
        'text': "Diamonds distribution by carat",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Carat',
    yaxis_title = 'Frequency',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    title_font_family="Arial",
    title_font_color= "White",
    font = dict(size = 18),
    legend_title_font_color = "green")

fig.update_traces(marker_color = 'Red')

fig.show()

In [None]:
# Frequency distribution of the depth column.
fig = px.histogram(data_frame = diamond,
                   x = 'depth', template = 'plotly_dark')
fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("distribuition")
        'text': "Diamonds distribution by depth",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Depth',
    yaxis_title = 'Frequency',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    title_font_family="Arial",
    title_font_color= "White",
    font = dict(size = 18),
    legend_title_font_color = "green")

fig.update_traces(marker_color = 'Red')

fig.show()

# Linear regression model

Having found a high correlation between the price and these variables, let's use the scikit-learn library to try to predict the price from them.

In [None]:
# Features: carat and the three dimensions. Target: price, kept as a one-column DataFrame.
X1 = diamond[['carat', 'x', 'y', 'z']]
y1 = diamond[['price']]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.2, random_state = 2811)

In [None]:
# Linear regression with scikit-learn's default settings.
model1 = LinearRegression()

In [None]:
# Fit on the training split only.
model1.fit(X1_train, y1_train)

In [None]:
# Score of the fitted model on the training split (for a regressor this is R², not a classification accuracy).
model1.score(X1_train, y1_train)

In [None]:
# Score of the fitted model on the held-out test split (R²).
model1.score(X1_test, y1_test)

In [None]:
# Keep both scores for the report printed below.
# NOTE(review): these are R² values; "accuracy" in the names is a loose label.
train_LR_accuracy = model1.score(X1_train, y1_train)
test_LR_accuracy = model1.score(X1_test, y1_test)

In [None]:
# Test-split score as a percentage; used later when the two models are compared.
percentage_test_LR_accuracy = (model1.score(X1_test, y1_test)) * 100

In [None]:
# Report both scores as percentages with two decimals.
print(f'Training Accuracy: {train_LR_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_LR_accuracy * 100:.2f}%')

# "Linearized" Linear Regression Model

It is possible to improve the accuracy of our model by linearizing the variables using the logarithm function as follows:

In [None]:
# Add a natural-log version of the target and of each feature, named log_<column>.
for source_col in ['price', 'carat', 'x', 'y', 'z']:
    diamond[f'log_{source_col}'] = np.log(diamond[source_col])

In [None]:
# Frequency distribution of the raw price.
fig = px.histogram(data_frame = diamond,
                   x = 'price', template = 'plotly_dark')
fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("distribuition")
        'text': "Diamonds distribution by price",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title = 'Price',
    yaxis_title = 'Frequency',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    title_font_family="Arial",
    title_font_color= "White",
    font = dict(size = 18),
    legend_title_font_color = "green")

fig.update_traces(marker_color = 'Red')

fig.show()

This is the price distribution before linearizing.

In [None]:
# Frequency distribution of the log-transformed price.
fig = px.histogram(data_frame = diamond,
                   x = 'log_price', template = 'plotly_dark')
fig.update_layout(
    title = {
        # spelling of the displayed title fixed ("distribuition")
        'text': "Diamonds distribution by price (log)",
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    # the axis shows log_price, not the raw price
    xaxis_title = 'log(Price)',
    yaxis_title = 'Frequency',
    width = 1200,
    height = 500,
    font_family="Arial",
    font_color="White",
    title_font_family="Arial",
    title_font_color= "White",
    font = dict(size = 18),
    legend_title_font_color = "green")

fig.update_traces(marker_color = 'Red')

fig.show()

This is the distribution of prices after linearization, much closer to a normal distribution than in the previous case.

In [None]:
# One regression panel per log feature, each plotted against log_price.
# NOTE(review): the first `ax` is overwritten on the next line, and sns.set(...) presumably
# resets the palette/style chosen in the setup cell — confirm this call is needed here.
ax = sns.set(rc = {'figure.figsize': (10, 6)})
ax = sns.pairplot(data = diamond, y_vars = 'log_price', x_vars = ['log_carat', 'log_x', 'log_y', 'log_z'], kind = 'reg')

In [None]:
# Log-transformed features and target (target kept as a one-column DataFrame).
X2 = diamond[['log_carat', 'log_x', 'log_y', 'log_z']]
y2 = diamond[['log_price']]

In [None]:
# Same 80/20 split and random_state as the first model.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 2811)

In [None]:
# Second linear regression, to be fitted on the log-transformed data.
model2 = LinearRegression()

In [None]:
# Fit on the log-transformed training split only.
model2.fit(X2_train, y2_train)

In [None]:
# Score (R²) on the training split, measured on log_price.
model2.score(X2_train, y2_train)

In [None]:
# Score (R²) on the held-out test split, measured on log_price.
model2.score(X2_test, y2_test)

In [None]:
# Scores as percentages with two decimals. The built-in round() is used
# because score() may return a plain Python float, which has no .round() method.
percentage_model2_train = round(model2.score(X2_train, y2_train) * 100, 2)
percentage_model2_test = round(model2.score(X2_test, y2_test) * 100, 2)

In [None]:
# Report the scores of the log-log model on both splits.
print(f'The accuracy of the linearized model in the training data was {percentage_model2_train}%.')
print(f'The accuracy of the linearized model in the testing data was {percentage_model2_test}%.')

## Comparing the models

In [None]:
# Difference between the two test scores, in percentage points. The built-in
# round() also works when the operands are plain Python floats.
variation = round(percentage_model2_test - percentage_test_LR_accuracy, 2)

In [None]:
# Report the gain of the log-log model over the plain model on the test split.
# NOTE(review): `variation` is a difference of two percentages, i.e. percentage points.
print(f'The accuracy of the model was increased by {variation}%.')