In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/diamonds/diamonds.csv


In [3]:
#read dataset
diamond_data = pd.read_csv('/kaggle/input/diamonds/diamonds.csv')

In [4]:
diamond_data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [6]:
diamond_data = diamond_data.drop(columns= ['Unnamed: 0'])

Let's visualize the effect of carat on diamond price

In [7]:
fig = px.scatter(diamond_data, x='carat', y='price', size='depth', color= 'cut', trendline='ols')
fig.show()

We can see that diamond prices increase with increase in carat

In [8]:
#Adding a volume column to the dataset by multiplying the length(x), width(y) and depth(z) of the diamond
diamond_data['volume'] = diamond_data.x * diamond_data.y * diamond_data.z

Inspecting the relationship between the volume and price of the diamond

In [9]:
fig = px.scatter(diamond_data, x='volume', y= 'price',size='volume', color= "cut", trendline="ols")

fig.update_yaxes(range=[1,20000])
fig.update_xaxes(range=[1,1000])

Premium cut diamonds are larger than other diamonds
There’s a linear relationship between the size of all types of diamonds and their prices

Visualizing price based on diamond color to see if color will affect prices

In [10]:
fig= px.box(diamond_data, y='price', x='cut', color= 'color')
fig.show()

In [11]:
fig = px.box(diamond_data, x= 'cut', y='price',color='color')
fig.show()

In [12]:
diamond_data_num= diamond_data.select_dtypes(include=['int','float'])

In [13]:
correlation = diamond_data_num.corr()

In [14]:
print(f'{correlation["price"].sort_values(ascending = False)}')

price     1.000000
carat     0.921591
volume    0.902385
x         0.884435
y         0.865421
z         0.861249
table     0.127134
depth    -0.010647
Name: price, dtype: float64


Now, moving to predicting diamond prices using all the necessary information from the price analysis done above.

Firstly, I will convert the values of the cut column to numeric values

In [15]:
diamond_data.cut = diamond_data['cut'].map({'Ideal':1,
                                           'Premium':2,
                                           "Good":3,
                                           'Very Good':4,
                                           'Fair':5})

In [16]:
#getting features and target variables
x= np.array(diamond_data[["carat", "cut", "volume"]])
y= np.array(diamond_data[['price']])

In [17]:
X_train, X_test, y_train,y_test = train_test_split(x,y,test_size=0.2, random_state= 42)

In [18]:
model= RandomForestRegressor()
model.fit(X_train,y_train )


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [19]:
print('Diamond Price Prediction')
a = float(input("Carat Size: "))
b = int(input("Cut Type (Ideal: 1, Premium: 2, Good: 3, Very Good: 4, Fair: 5): "))
c = float(input("Volume: "))
features = np.array([[a, b, c]])
print("Predicted Diamond's Price = ", model.predict(features))

Diamond Price Prediction


Carat Size:  0.6
Cut Type (Ideal: 1, Premium: 2, Good: 3, Very Good: 4, Fair: 5):  2
Volume:  40


Predicted Diamond's Price =  [933.76666667]
