In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings('ignore')

In [None]:
# Location of the diamonds CSV. The default is the path this notebook was
# originally written against; set the DIAMONDS_CSV environment variable to
# load the file from somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    "DIAMONDS_CSV",
    "C:\\Users\\Vansh Patel\\Downloads\\EDACAI53\\Diamonds Prices2022.csv",
)
data = pd.read_csv(DATA_PATH)

In [None]:
# Report the frame's dimensions, then render a styled preview of its first rows.
frame_shape = data.shape
print(f"Shape Of The Dataset : {frame_shape}")
print("\nGlimpse Of The Dataset :")

preview_css = {
    "background-color": "#FDD667",
    "color": "black",
    "border": "1.5px solid black",
}
data.head().style.set_properties(**preview_css)

In [None]:
print("Informations Of The Dataset :\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

In [None]:
# Styled table of describe() called with its default arguments.
print("Summary Of The Dataset :")

summary_css = {
    "background-color": "#FDD667",
    "color": "black",
    "border": "1.5px solid black",
}
summary_table = data.describe()
summary_table.style.set_properties(**summary_css)

In [None]:
# Same styled summary, restricted to object-dtype columns and transposed.
object_summary = data.describe(include=object).T
object_summary.style.set_properties(
    **{
        "background-color": "#FDD667",
        "color": "black",
        "border": "1.5px solid black",
    }
)

In [None]:
# Plain-text print of the first ten rows of the dataset.
first_ten_rows = data.iloc[:10]
print(first_ten_rows)

In [None]:
# Plain-text print of the last ten rows of the dataset.
last_ten_rows = data.iloc[-10:]
print(last_ten_rows)

In [None]:
# Percentage of missing values in each of these columns, one number per line.
for column_name in ("cut", "color", "clarity", "table"):
    missing_rows = data[column_name].isnull().sum()
    print(float(missing_rows * 100.0 / len(data)))

In [None]:
# Keep only the rows where cut, color, clarity and table are all present.
# dropna(subset=...) does this in one pass on a new frame. It replaces a chain
# of filters that indexed the already-filtered frame with boolean masks built
# from the unfiltered `data`, which only worked because pandas silently
# reindexes a mismatched mask (the accompanying warning is hidden by the
# notebook's global warnings filter).
data1 = data.dropna(subset=["cut", "color", "clarity", "table"])

print(data1.describe())
# info() prints its own report and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
data1.info()
data1.isnull().sum()

In [None]:
# One-row styled table holding the number of missing values per column.
print("Null values of the Dataset :")

null_counts_row = data.isna().sum().to_frame().T
null_counts_row.style.set_properties(
    **{
        "background-color": "#FDD667",
        "color": "black",
        "border": "1.5px solid black",
    }
)

In [None]:
# Global seaborn look: style, theme colours, then the "poster" context at 0.7 font scale.
sns.set_style("white")
theme_rc = {
    "axes.facecolor": "#f2d4b1",
    "figure.facecolor": "#f2d4b1",
    "grid.color": "white",
}
sns.set(rc=theme_rc)
sns.set_context("poster", font_scale=0.7)

# Eight-colour palette that later cells index into and pass as a colormap.
palette = [
    "#c94727", "#ea5b17", "#e57716", "#f2a324",
    "#a2c0a6", "#7ac0a8", "#5e9786", "#557260",
]

# Render the palette as a strip of swatches.
palette_swatches = sns.color_palette(palette)
sns.palplot(palette_swatches)
plt.show()

In [None]:
# Histogram of price (30 bins) with a KDE overlay.
print("Let's have a look on the distribution of prices :")

fig, ax = plt.subplots(figsize=(20, 8))
p = sns.histplot(
    data["price"],
    color=palette[6],
    kde=True,
    bins=30,
    alpha=1,
    fill=True,
    edgecolor="black",
    linewidth=3,
    ax=ax,
)
# The first line artist on the axes is recoloured (the KDE curve from kde=True).
ax.lines[0].set_color("orange")
ax.set_title("\nDiamond's Price Distribution\n", fontsize=25)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlabel("\nPrice", fontsize=20)
ax.set_yscale("linear")
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
print("Let's have a look on the ratio of diamond's cut :")
plt.subplots(figsize=(12, 12))

# Count each cut once. value_counts() orders categories by descending
# frequency, and the legend labels are read from that same result, so each
# wedge is always paired with its own category instead of relying on a
# hand-typed label order matching the data.
cut_counts = data["cut"].value_counts()
labels = tuple(cut_counts.index)
size = 0.5  # passed below as the wedge width, which turns the pie into a ring

wedges, texts, autotexts = plt.pie(
    cut_counts.values,
    explode=(0,) * len(cut_counts),  # no wedge is pulled out
    textprops=dict(size=20, color="white"),
    autopct="%.2f%%",
    pctdistance=0.72,
    radius=.9,
    colors=["#3f4f45", "#5e9880", "#f5a126", "#ea5b17", "#6c3938"],
    shadow=True,
    wedgeprops=dict(width=size, edgecolor="black", linewidth=4),
    startangle=-165,
)

plt.legend(wedges, labels, title="Diamond's Cut", loc="center left",
           bbox_to_anchor=(1, 0, 0.5, 1), edgecolor="black")
plt.title("\nDiamond's Cut Ratio", fontsize=25)
plt.show()

In [None]:
# Horizontal count of diamonds per cut, most frequent first, with a boxed
# count label centred on every bar.
print("Let's have a look on the diamond's cut :")
plt.subplots(figsize=(20, 8))

cut_order = data["cut"].value_counts().index
cut_colors = ["#3f4f45", "#5e9880", "#f5a126", "#ea5b17", "#6c3938"]
p = sns.countplot(
    y=data["cut"],
    order=cut_order,
    palette=cut_colors,
    saturation=1,
    edgecolor="#1c1c1c",
    linewidth=5,
)

ax = p.axes
ax.set_title("\nDiamond's Cut\n", fontsize=25)
ax.set_ylabel("Cut", fontsize=20)
ax.set_xlabel("\nTotal", fontsize=20)
ax.set_yticklabels(p.get_yticklabels(), rotation=0)

label_box = {
    "boxstyle": "round",
    "pad": 0.4,
    "facecolor": "#e0b583",
    "edgecolor": "#1c1c1c",
    "linewidth": 4,
    "alpha": 1,
}
for container in p.containers:
    p.bar_label(container, label_type="center", padding=6, size=25,
                color="black", rotation=0, bbox=label_box)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Violin plot of price for every cut, ordered by how often the cut occurs.
print("Let's have a look on the price distribution of diamond's cut :")
plt.subplots(figsize=(25, 10))

cut_order = data["cut"].value_counts().index
p = sns.violinplot(
    x=data["cut"],
    y=data["price"],
    order=cut_order,
    palette=["#3f4f45", "#5e9880", "#f5a126", "#ea5b17", "#6c3938"],
    saturation=1,
    linewidth=4,
    edgecolor="black",
)
ax = p.axes
ax.set_title("\nDiamond's Price On Cut\n", fontsize=30)
ax.set_xlabel("\nCut", fontsize=25)
ax.set_ylabel("Price", fontsize=25)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
print("Let's have a look on the ratio of diamond's color :")
plt.subplots(figsize=(12, 12))

# Count each colour grade once. value_counts() orders categories by
# descending frequency, and the legend labels are read from that same result,
# so each wedge is always paired with its own category instead of relying on
# a hand-typed label order matching the data.
color_counts = data["color"].value_counts()
labels = tuple(color_counts.index)
size = 0.5  # passed below as the wedge width, which turns the pie into a ring

wedges, texts, autotexts = plt.pie(
    color_counts.values,
    explode=(0,) * len(color_counts),  # no wedge is pulled out
    textprops=dict(size=20, color="white"),
    autopct="%.2f%%",
    pctdistance=0.72,
    radius=.9,
    colors=["#3f4f45", "#5e9880", "#a2c0a6", "#f5a126", "#b05f0d", "#ea5b17", "#6c3938"],
    shadow=True,
    wedgeprops=dict(width=size, edgecolor="black", linewidth=4),
    startangle=-125,
)

plt.legend(wedges, labels, title="Diamond's Color", loc="center left",
           bbox_to_anchor=(1, 0, 0.5, 1), edgecolor="black")
plt.title("\nDiamond's Color Ratio", fontsize=25)
plt.show()

In [None]:
# Horizontal count of diamonds per colour grade, most frequent first, with a
# boxed count label centred on every bar.
print("Let's have a look on the diamond's color :")
plt.subplots(figsize=(20, 8))

color_order = data["color"].value_counts().index
color_colors = ["#3f4f45", "#5e9880", "#a2c0a6", "#f5a126", "#b05f0d", "#ea5b17", "#6c3938"]
p = sns.countplot(
    y=data["color"],
    order=color_order,
    palette=color_colors,
    saturation=1,
    edgecolor="#1c1c1c",
    linewidth=5,
)

ax = p.axes
ax.set_title("\nDiamond's Color\n", fontsize=25)
ax.set_ylabel("Color", fontsize=20)
ax.set_xlabel("\nTotal", fontsize=20)
ax.set_yticklabels(p.get_yticklabels(), rotation=0)

label_box = {
    "boxstyle": "round",
    "pad": 0.2,
    "facecolor": "#e0b583",
    "edgecolor": "#1c1c1c",
    "linewidth": 4,
    "alpha": 1,
}
for container in p.containers:
    p.bar_label(container, label_type="center", padding=6, size=25,
                color="black", rotation=0, bbox=label_box)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Violin plot of price for every colour grade, ordered by grade frequency.
print("Let's have a look on the price distribution of diamond's color :")
plt.subplots(figsize=(25, 10))

color_order = data["color"].value_counts().index
p = sns.violinplot(
    x=data["color"],
    y=data["price"],
    order=color_order,
    palette=["#3f4f45", "#5e9880", "#a2c0a6", "#f5a126", "#b05f0d", "#ea5b17", "#6c3938"],
    saturation=1,
    linewidth=4,
    edgecolor="black",
)
ax = p.axes
ax.set_title("\nDiamond's Price On Color\n", fontsize=30)
ax.set_xlabel("\nColor", fontsize=25)
ax.set_ylabel("Price", fontsize=25)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
print("Let's have a look on the ratio of diamond's clarity :")
plt.subplots(figsize=(12, 12))

# Count each clarity grade once. value_counts() orders categories by
# descending frequency, and the legend labels are read from that same result,
# so each wedge is always paired with its own category instead of relying on
# a hand-typed label order matching the data.
clarity_counts = data["clarity"].value_counts()
labels = tuple(clarity_counts.index)
size = 0.5  # passed below as the wedge width, which turns the pie into a ring

wedges, texts, autotexts = plt.pie(
    clarity_counts.values,
    explode=(0,) * len(clarity_counts),  # no wedge is pulled out
    textprops=dict(size=20, color="white"),
    autopct="%.2f%%",
    pctdistance=0.72,
    radius=.9,
    colors=["#3f4f45", "#557260", "#5e9880", "#a2c0a6", "#f5a126", "#e57716", "#ea5b17", "#6c3938"],
    shadow=True,
    wedgeprops=dict(width=size, edgecolor="black", linewidth=4),
    startangle=-230,
)

plt.legend(wedges, labels, title="Diamond's Clarity", loc="center left",
           bbox_to_anchor=(1, 0, 0.5, 1), edgecolor="black")
plt.title("\nDiamond's Clarity Ratio", fontsize=25)
plt.show()

In [None]:
# Horizontal count of diamonds per clarity grade, most frequent first, with a
# boxed count label centred on every bar.
print("Let's have a look on the diamond's clarity :")
plt.subplots(figsize=(20, 8))

clarity_order = data["clarity"].value_counts().index
clarity_colors = ["#3f4f45", "#557260", "#5e9880", "#a2c0a6", "#f5a126", "#e57716", "#ea5b17", "#6c3938"]
p = sns.countplot(
    y=data["clarity"],
    order=clarity_order,
    palette=clarity_colors,
    saturation=1,
    edgecolor="#1c1c1c",
    linewidth=5,
)

ax = p.axes
ax.set_title("\nDiamond's Clarity\n", fontsize=25)
ax.set_ylabel("Clarity", fontsize=20)
ax.set_xlabel("\nTotal", fontsize=20)
ax.set_yticklabels(p.get_yticklabels(), rotation=0)

label_box = {
    "boxstyle": "round",
    "pad": 0.2,
    "facecolor": "#e0b583",
    "edgecolor": "#1c1c1c",
    "linewidth": 3,
    "alpha": 1,
}
for container in p.containers:
    p.bar_label(container, label_type="center", padding=6, size=22,
                color="black", rotation=0, bbox=label_box)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Violin plot of price for every clarity grade, ordered by grade frequency.
print("Let's have a look on the price distribution of diamond's clarity :")
plt.subplots(figsize=(25, 10))

clarity_order = data["clarity"].value_counts().index
p = sns.violinplot(
    x=data["clarity"],
    y=data["price"],
    order=clarity_order,
    palette=["#3f4f45", "#557260", "#5e9880", "#a2c0a6", "#f5a126", "#e57716", "#ea5b17", "#6c3938"],
    saturation=1,
    linewidth=4,
    edgecolor="black",
)
ax = p.axes
ax.set_title("\nDiamond's Price On Clarity\n", fontsize=30)
ax.set_xlabel("\nClarity", fontsize=25)
ax.set_ylabel("Price", fontsize=25)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Histogram of carat (30 bins) with a KDE overlay.
print("Let's have a look on the distribution of weight in carat :")

fig, ax = plt.subplots(figsize=(20, 8))
p = sns.histplot(
    data["carat"],
    color="#3f4f45",
    kde=True,
    bins=30,
    alpha=1,
    fill=True,
    edgecolor="black",
    linewidth=3,
    ax=ax,
)
# The first line artist on the axes is recoloured (the KDE curve from kde=True).
ax.lines[0].set_color("orange")
ax.set_title("\nDiamond's Weight In Carat Distribution\n", fontsize=25)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlabel("\nWeights In Carat", fontsize=20)
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
print("Let's have a look on the price distribution of weights in carat :")

_, axes = plt.subplots(figsize=(20, 8))
# Filled bivariate density of carat (y) against price (x), drawn on `axes`.
# Only arguments kdeplot itself defines are passed: `kind` and `height` are
# not kdeplot parameters, and `shade` is the deprecated spelling of `fill`,
# so all three are dropped and fill=True alone requests the filled contours.
sns.kdeplot(
    y=data["carat"],
    x=data["price"],
    edgecolor="#1c1c1c",
    fill=True,
    color="#3f4f45",
    ax=axes,
)
axes.set_title("\nPrice Distribution Of Weights\n", fontsize=25)
axes.set_xlabel("\nPrice", fontsize=20)
axes.set_ylabel("Weights In Carat", fontsize=20)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Histogram of depth (30 bins) with a KDE overlay.
print("Let's have a look on the distribution of depth percentage :")

fig, ax = plt.subplots(figsize=(20, 8))
p = sns.histplot(
    data["depth"],
    color="#5e9880",
    kde=True,
    bins=30,
    alpha=1,
    fill=True,
    edgecolor="black",
    linewidth=3,
    ax=ax,
)
# The first line artist on the axes is recoloured (the KDE curve from kde=True).
ax.lines[0].set_color("orange")
ax.set_title("\nDiamond's Depth Percentage Distribution\n", fontsize=25)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlabel("\nDepth Percentage", fontsize=20)
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
print("Let's have a look on the price distribution of depth percentage :")

_, axes = plt.subplots(figsize=(20, 8))
# Filled bivariate density of depth (y) against price (x), drawn on `axes`.
# Only arguments kdeplot itself defines are passed: `kind` and `height` are
# not kdeplot parameters, and `shade` is the deprecated spelling of `fill`,
# so all three are dropped and fill=True alone requests the filled contours.
sns.kdeplot(
    y=data["depth"],
    x=data["price"],
    edgecolor="#1c1c1c",
    fill=True,
    color="#5e9880",
    ax=axes,
)
axes.set_title("\nPrice Distribution Of Depth Percentage\n", fontsize=25)
axes.set_xlabel("\nPrice", fontsize=20)
axes.set_ylabel("Depth Percentage", fontsize=20)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Histogram of table (30 bins) with a KDE overlay.
print("Let's have a look on the distribution of diamond's table :")

fig, ax = plt.subplots(figsize=(20, 8))
p = sns.histplot(
    data["table"],
    color="#6c3938",
    kde=True,
    bins=30,
    alpha=1,
    fill=True,
    edgecolor="black",
    linewidth=3,
    ax=ax,
)
# The first line artist on the axes is recoloured (the KDE curve from kde=True).
ax.lines[0].set_color("orange")
ax.set_title("\nDiamond's Table Distribution\n", fontsize=25)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlabel("\nDiamond's Table", fontsize=20)
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
print("Let's have a look on the price distribution of diamond's table :")

_, axes = plt.subplots(figsize=(20, 8))
# Filled bivariate density of table (y) against price (x), drawn on `axes`.
# Only arguments kdeplot itself defines are passed: `kind` and `height` are
# not kdeplot parameters, and `shade` is the deprecated spelling of `fill`,
# so all three are dropped and fill=True alone requests the filled contours.
sns.kdeplot(
    y=data["table"],
    x=data["price"],
    edgecolor="#1c1c1c",
    fill=True,
    color="#6c3938",
    ax=axes,
)
axes.set_title("\nPrice Distribution Of Diamond's Table\n", fontsize=25)
axes.set_xlabel("\nPrice", fontsize=20)
axes.set_ylabel("Diamond's Table", fontsize=20)

sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Pairwise plots of the frame's columns, coloured by cut.
print("Let's have a look on the pairwise relationships :")

cut_hue_colors = ["#ea5b17", "#6c3938", "#f5a126", "#3f4f45", "#5e9880", "#5e9869"]
sns.pairplot(
    data=data,
    hue="cut",
    palette=cut_hue_colors,
    plot_kws={"linewidth": 0},
)
sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# 2x3 grid of regression plots: price on the shared x-axis against each
# measurement column on y.
_, axs = plt.subplots(2, 3, figsize=(25, 12), sharex=True)
plt.tight_layout(pad=4.0)

# (row, col, y column, panel title, y-axis label, point colour, line colour)
panel_specs = [
    (0, 0, "x", "Price vs Length\n", "x", "#3f4f45", "orange"),
    (0, 1, "y", "Price vs Width\n", "y", "#5e9880", "orange"),
    (0, 2, "z", "Price vs Depth\n", "z", "#a2c0a6", "orange"),
    (1, 0, "depth", "Price vs Depth Percentage\n", "depth", "#f5a126", "#6c3938"),
    (1, 1, "table", "Price vs Table\n", "Table", "#ea5b17", "#6c3938"),
    (1, 2, "carat", "Price vs Weight\n", "carat", "#6c3938", "orange"),
]

for row, col, feature, title, y_label, point_color, line_color in panel_specs:
    panel = axs[row, col]
    sns.regplot(x="price", y=feature, data=data, ax=panel, color=point_color,
                fit_reg=True, line_kws={"color": line_color})
    panel.set_title(title, fontsize=25)
    # Only the bottom row gets an explicit x-axis label.
    if row == 1:
        panel.set_xlabel("\nPrice", fontsize=20)
    panel.set_ylabel(y_label, fontsize=20)

plt.suptitle("Regression Line", fontsize=30, y=1.03)
sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Keep only rows whose measurements fall strictly inside these bounds.
# All conditions are combined into one mask and the result is copied, so
# `data` becomes an independent frame rather than a slice of a slice; the
# following cell assigns columns on `data`, which on an un-copied slice is a
# chained assignment (SettingWithCopyWarning, hidden by the warnings filter).
within_bounds = (
    (data["x"] > 3) & (data["x"] < 10)
    & (data["y"] > 2) & (data["y"] < 13)
    & (data["z"] > 2) & (data["z"] < 6)
    & (data["depth"] > 53) & (data["depth"] < 73)
    & (data["table"] > 50) & (data["table"] < 71)
    & (data["carat"] < 3)
)
data = data[within_bounds].copy()

print("After dropping outliers, let's have a look on the pairwise relationships :")

sns.pairplot(data=data, hue="cut",
             palette=["#ea5b17", "#6c3938", "#f5a126", "#3f4f45", "#5e9880", "#5e9869"],
             plot_kws=dict(linewidth=0))
sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Replace the three categorical columns with integer codes in place, then
# draw an annotated correlation heatmap of the whole frame.
# NOTE(review): LabelEncoder presumably assigns codes by sorted label value
# rather than by grade quality - confirm that is acceptable for the
# correlations shown here.
catcol = ["color", "clarity", "cut"]
le = LabelEncoder()
for categorical_column in catcol:
    encoded_values = le.fit_transform(data[categorical_column])
    data[categorical_column] = encoded_values

plt.subplots(figsize=(10, 10))

correlations = data.corr()
sns.heatmap(
    correlations,
    cmap=palette,
    square=True,
    cbar_kws={"shrink": .82},
    annot=True,
    vmin=-1,
    vmax=1,
    linewidths=3,
    linecolor='#3f4f45',
    annot_kws={"fontsize": 12},
)
plt.title("Pearson Correlation Of Features\n", fontsize=25)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Pairwise plots of price and carat, coloured by cut.
sns.pairplot(data[["price", "carat", "cut"]], hue="cut", height=5)
plt.show()

# Bar of carat, then of price, for every cut. `cut` holds integer codes by
# this point (it was label-encoded in place earlier), so both variables are
# numeric and seaborn would otherwise put carat/price on the categorical axis,
# drawing one bar per distinct value. orient="h" pins cut to the categorical
# (y) axis so each bar aggregates the measure within one cut.
for measure in ("carat", "price"):
    sns.barplot(x=measure, y="cut", data=data, orient="h")
    plt.show()

In [None]:
# Pairwise plots of price and carat, coloured by color.
sns.pairplot(data[["price", "carat", "color"]], hue="color", height=5)
plt.show()

# Bar of carat, then of price, for every color. `color` holds integer codes
# by this point (it was label-encoded in place earlier), so both variables are
# numeric and seaborn would otherwise put carat/price on the categorical axis,
# drawing one bar per distinct value. orient="h" pins color to the categorical
# (y) axis so each bar aggregates the measure within one color grade.
for measure in ("carat", "price"):
    sns.barplot(x=measure, y="color", data=data, orient="h")
    plt.show()

In [None]:
# Pairwise plots of price and carat, coloured by clarity.
sns.pairplot(data[["price", "carat", "clarity"]], hue="clarity", height=5)
plt.show()

# Bar of carat, then of price, for every clarity. `clarity` holds integer
# codes by this point (it was label-encoded in place earlier), so both
# variables are numeric and seaborn would otherwise put carat/price on the
# categorical axis, drawing one bar per distinct value. orient="h" pins
# clarity to the categorical (y) axis so each bar aggregates the measure
# within one clarity grade.
for measure in ("carat", "price"):
    sns.barplot(x=measure, y="clarity", data=data, orient="h")
    plt.show()