In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('data/diamonds.csv')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.describe().transpose()

In [None]:
df.head()

### Rearranging Columns

In [None]:
df2 = df.iloc[:,[0,1,2,3,4,5,7,8,9,6]]

In [None]:
df2.head()

In [None]:
df2.shape

### Checking for Outliers

In [None]:
col_name = df2.select_dtypes(include=['int','float']).columns


for i in col_name:
  mean = df2[i].mean()
  med =  df2[i].median()
  print(f'Mean for {i} is {mean}')
  print(f'Median for {i} is {med}')

In [None]:
# before removing outliers
plt.boxplot(df2.carat)
plt.show()

In [None]:
plt.boxplot(df2.depth)
plt.show()

In [None]:
plt.boxplot(df2.table)
plt.show()

In [None]:
plt.boxplot(df2.x)
plt.show()

In [None]:
plt.boxplot(df2.y)
plt.show()

In [None]:
plt.boxplot(df2.z)
plt.show()

In [None]:
# after removing outliers
plt.boxplot(df2.price)
plt.show()

In [None]:
df2.shape

### Treating Outliers

In [None]:
def outliers(col_name):
  Q1 = np.percentile(df2[col_name], 25,
                   interpolation = 'midpoint')
 
  Q3 = np.percentile(df2[col_name], 75,
                   interpolation = 'midpoint')
  IQR = Q3 - Q1

  upper = Q3+(1.5*IQR)
  lower = Q1-(1.5*IQR)

  # df2.drop(upper[0], inplace = True)
  # df2.drop(lower[0], inplace = True)

  # df2.drop(df2[df2[col_name] == upper[0]].index, inplace = True)
  # df2.drop(df2[df2[col_name] == lower[0]].index, inplace = True)
  # df2.drop[(df2[col_name] > upper) & (df2[col_name] < lower)]
  # df2[col_name] = df2[(df2[col_name] < upper) & (df2[col_name] > lower)]
  df2.drop(df2[(df2[col_name] > upper) | (df2[col_name] < lower)].index, inplace=True)


In [None]:
outliers('carat')

In [None]:
plt.boxplot(df2.carat)
plt.show()

In [None]:
df2.shape

In [None]:
outliers('depth')

In [None]:
plt.boxplot(df2.depth)
plt.show()

In [None]:
outliers('table')

In [None]:
plt.boxplot(df2.table)
plt.show()

In [None]:
outliers('x')
plt.boxplot(df2.x)
plt.show()

In [None]:
outliers('y')
plt.boxplot(df2.y)
plt.show()

In [None]:
outliers('z')
plt.boxplot(df2.z)
plt.show()

In [None]:
outliers('price')
plt.boxplot(df2.price)
plt.show()

In [None]:
df2.shape

In [None]:
df2.head()

In [None]:
plt.scatter(df2['carat'],df2['price'])
plt.xlabel('Carat')
plt.ylabel('Price')
plt.show()

In [None]:
plt.scatter(df2['cut'],df2['price'])
plt.xlabel('cut')
plt.ylabel('Price')
plt.show()

In [None]:
sns.histplot(data=df2['carat'],kde=True)

In [None]:
sns.histplot(data=df2['cut'],kde=True)

In [None]:
col_name

In [None]:
sns.histplot(data=df2['table'],kde=True)

In [None]:
sns.histplot(data=df2['x'],kde=True)

In [None]:
sns.histplot(data=df2['y'],kde=True)

In [None]:
sns.histplot(data=df2['z'],kde=True)

In [None]:
sns.histplot(data=df2['price'],kde=True)

In [None]:
sns.heatmap(df2.iloc[:,:].corr(),annot=True)

We Can see Price column have strong corelation with carat,x,y and z columns.

In [None]:
df2.head()

In [None]:
df2.price.max()

In [None]:
df2.describe()

# KNN Algorithm from Scratch

###Splitting Data into Train Test

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
df2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335
5,0.24,Very Good,J,VVS2,62.8,57.0,3.94,3.96,2.48,336


In [51]:
X = df2.iloc[:,:9]

In [52]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [53]:
y = df2['price']

In [54]:
y

0         326
1         326
3         334
4         335
5         336
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 46530, dtype: int64

In [55]:
X.shape

(46530, 9)

In [56]:
y.shape

(46530,)

### Spliting Data

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [58]:
X_train.shape ,X_test.shape, y_train.shape, y_test.shape

((34897, 9), (11633, 9), (34897,), (11633,))

In [59]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,cut,color,clarity
47832,Ideal,G,VVS2
33879,Premium,E,VS2
31586,Premium,E,VS2
6385,Premium,E,VVS2
12121,Premium,G,SI2


### Ordinal Encoder for categorical features


In [60]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

In [61]:
X_train_catg = pd.DataFrame(enc.fit_transform(X_train_cat), 
                                    columns = X_train_cat.columns, 
                                    index = X_train_cat.index)
X_train_catg

Unnamed: 0,cut,color,clarity
47832,2.0,3.0,7.0
33879,3.0,1.0,5.0
31586,3.0,1.0,5.0
6385,3.0,1.0,7.0
12121,3.0,3.0,3.0
...,...,...,...
31566,2.0,0.0,7.0
52051,1.0,5.0,4.0
36146,2.0,1.0,5.0
34892,2.0,0.0,5.0


In [62]:
X_train_catg.isnull().sum()

cut        0
color      0
clarity    0
dtype: int64

In [63]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
47832,0.52,62.2,56.0,5.13,5.16,3.2
33879,0.3,60.7,59.0,4.33,4.3,2.62
31586,0.35,59.2,58.0,4.59,4.63,2.73
6385,0.25,62.5,59.0,4.02,4.04,2.52
12121,1.35,60.5,60.0,7.16,7.11,4.32


### Standard scaler for Numerical Columns

In [64]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


X_train_no = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

### Concantenating Dataframe/Columns

In [65]:
X_train_rescaled = pd.concat([X_train_catg,X_train_no], axis=1)

X_train_rescaled.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
47832,2.0,3.0,7.0,-0.491004,0.363849,-0.616883,-0.404926,-0.381295,-0.35739
33879,3.0,1.0,5.0,-1.098495,-1.028555,0.873124,-1.234161,-1.278129,-1.328144
31586,3.0,1.0,5.0,-0.960429,-2.420958,0.376455,-0.964659,-0.933995,-1.144036
6385,3.0,1.0,7.0,-1.236561,0.64233,0.873124,-1.555489,-1.549265,-1.495516
12121,3.0,3.0,3.0,1.800892,-1.214209,1.369793,1.699255,1.652225,1.517169


### Preparing Test Data

In [66]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
12983,Ideal,G,SI2
7945,Ideal,H,SI2
39341,Good,H,SI1
47409,Good,G,VS2
48664,Ideal,G,VVS1


In [67]:
X_test_catg = pd.DataFrame(enc.transform(X_test_cat), 
                                    columns = X_test_cat.columns, 
                                    index = X_test_cat.index)

In [68]:
X_test_catg.head()

Unnamed: 0,cut,color,clarity
12983,2.0,3.0,3.0
7945,2.0,4.0,3.0
39341,1.0,4.0,2.0
47409,1.0,3.0,5.0
48664,2.0,3.0,6.0


In [69]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
12983,1.11,61.7,57.0,6.69,6.65,4.12
7945,1.02,60.7,57.0,6.51,6.54,3.96
39341,0.52,63.6,62.0,5.11,5.04,3.23
47409,0.62,61.0,62.0,5.41,5.5,3.33
48664,0.53,61.4,57.0,5.18,5.21,3.19


In [70]:
X_test_no = pd.DataFrame(scaler.transform(X_test_num), 
                                    columns = X_test_num.columns, 
                                    index = X_test_num.index)

X_test_no.head()

Unnamed: 0,carat,depth,table,x,y,z
12983,1.138175,-0.100286,-0.120214,1.21208,1.172523,1.182427
7945,0.889656,-1.028555,-0.120214,1.025502,1.057811,0.914632
39341,-0.491004,1.663426,2.363132,-0.425657,-0.506434,-0.307179
47409,-0.214872,-0.750074,2.363132,-0.114695,-0.026732,-0.139807
48664,-0.463391,-0.378766,-0.120214,-0.353099,-0.329153,-0.374127


### Concatenating Test Dataframe

In [71]:
X_test_rescaled = pd.concat([X_test_catg,X_test_no], axis=1)

X_test_rescaled.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
12983,2.0,3.0,3.0,1.138175,-0.100286,-0.120214,1.21208,1.172523,1.182427
7945,2.0,4.0,3.0,0.889656,-1.028555,-0.120214,1.025502,1.057811,0.914632
39341,1.0,4.0,2.0,-0.491004,1.663426,2.363132,-0.425657,-0.506434,-0.307179
47409,1.0,3.0,5.0,-0.214872,-0.750074,2.363132,-0.114695,-0.026732,-0.139807
48664,2.0,3.0,6.0,-0.463391,-0.378766,-0.120214,-0.353099,-0.329153,-0.374127


### Model Training

In [75]:
import mlflow

In [76]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment("Prediction of Diamond Price")

2022/09/21 19:10:05 INFO mlflow.tracking.fluent: Experiment with name 'Prediction of Diamond Price' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', creation_time=1663767605365, experiment_id='2', last_update_time=1663767605365, lifecycle_stage='active', name='Prediction of Diamond Price', tags={}>

In [77]:
from sklearn import metrics

In [78]:
from pickle import dump

In [79]:
from pickle import dump

dump(scaler, open('pickle_files/standard_scaler.pkl', 'wb'))

## Random Forest

In [80]:
from sklearn.ensemble import RandomForestRegressor

with mlflow.start_run():
    mlflow.set_tag('dev', 'Omkar')
    mlflow.set_tag('algo', 'Random Forest')
    mlflow.log_param('data-path', 'data/diamonds.csv')
    
    rf_regressor = RandomForestRegressor()
    rf_regressor.fit(X_train_rescaled, y_train)
    
    y_test_pred = rf_regressor.predict(X_test_rescaled)
    
#     acc = metrics.accuracy_score(y_test, y_test_pred)
#     mlflow.log_metric('accuracy', acc)
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    mse = metrics.mean_squared_error(y_test, y_test_pred)
    msqe = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    
    mlflow.log_metric('mean_ab_error', mae)
    mlflow.log_metric('mean_sqr_error', mse)
    mlflow.log_metric('mean_srt_error', msqe)
    
    mlflow.sklearn.log_model(rf_regressor, artifact_path='models')
    mlflow.log_artifact('pickle_files/standard_scaler.pkl')

In [None]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

## Decision Tree Regression

In [81]:
from sklearn.tree import DecisionTreeRegressor

with mlflow.start_run():
    mlflow.set_tag('dev', 'Omkar')
    mlflow.set_tag('algo', 'Decision Tree')
    mlflow.log_param('data-path', 'data/diamonds.csv')

    dt_regressor = DecisionTreeRegressor()
    dt_regressor.fit(X_train_rescaled, y_train)

    y_test_pred = dt_regressor.predict(X_test_rescaled)
    
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    mse = metrics.mean_squared_error(y_test, y_test_pred)
    msqe = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    
    mlflow.log_metric('mean_ab_error', mae)
    mlflow.log_metric('mean_sqr_error', mse)
    mlflow.log_metric('mean_srt_error', msqe)
    
    mlflow.sklearn.log_model(dt_regressor, artifact_path='models')
    mlflow.log_artifact('pickle_files/standard_scaler.pkl')

In [None]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

## Gradient Boosting Regression

In [82]:
from sklearn.ensemble import GradientBoostingRegressor

with mlflow.start_run():
    mlflow.set_tag('dev', 'Omkar')
    mlflow.set_tag('algo', 'Gradient Boosting')
    mlflow.log_param('data-path', 'data/diamonds.csv')

    gbr_regressor = GradientBoostingRegressor()
    gbr_regressor.fit(X_train_rescaled, y_train)
    
    y_test_pred = dt_regressor.predict(X_test_rescaled)
    
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    mse = metrics.mean_squared_error(y_test, y_test_pred)
    msqe = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    
    mlflow.log_metric('mean_ab_error', mae)
    mlflow.log_metric('mean_sqr_error', mse)
    mlflow.log_metric('mean_srt_error', msqe)
    
    mlflow.sklearn.log_model(gbr_regressor, artifact_path='models')
    mlflow.log_artifact('pickle_files/standard_scaler.pkl')

In [None]:
y_test_pred = gbr_regressor.predict(X_test_rescaled)

In [84]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  247.77424880747247
Mean Squared Error:  167255.81962013603
Root Mean Squared Error:  408.96921598102716


## Saving the model (Serialization)

In [181]:
from pickle import dump
dump(enc,open('models/ordinal_encoder.pkl','wb'))
dump(scaler, open('models/Standard_scaler.pkl', 'wb'))
dump(rf_regressor, open('models/rf_regressor.pkl', 'wb'))

Loading back model to check its working or not

In [182]:
from pickle import load


# Loading pretrained models from pickle file
enc=load(open('models/ordinal_encoder.pkl','rb'))
scaler = load(open('models/standard_scaler.pkl', 'rb'))
rf_regressor=load(open('models/rf_regressor.pkl','rb'))

In [194]:
print('enter diamond details')
cut=input()
color=input()
clarity=input()
carat=float(input())
depth=float(input())
table=float(input())
x = float(input())
y = float(input())
z = float(input())

enter diamond details
Ideal
E
SI2
0.23
61.5
55
3.95
3.98
2.43


In [195]:
query_num = pd.DataFrame({'carat':[carat], 'depth':[depth],'table':[table],'x':[x],'y':[y],'z':[z]})
query_cat = pd.DataFrame({'cut':[cut], 'color':[color], 'clarity':[clarity]})   

In [196]:
query_cat = enc.transform(query_cat)

In [197]:
query_num = scaler.transform(query_num)

In [198]:
query_point = pd.concat([pd.DataFrame(query_num), pd.DataFrame(query_cat)], axis=1)
price = dt_regressor.predict(query_point)
        

In [199]:
print(f"The price of Selected Diamond is $ {round(price[0],2)}")

The price of Selected Diamond is $ 2170.0
