In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow

from pickle import dump
from pickle import load
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [40]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Diamond Price Prediction")

<Experiment: artifact_location='/Users/vishalchandru/MacBook/DATA SCIENCE/Projects/MLFLOW/mlruns/1', creation_time=1687389469084, experiment_id='1', last_update_time=1687389469084, lifecycle_stage='active', name='Diamond Price Prediction', tags={}>

In [41]:
data = pd.read_csv('data/diamonds.csv')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [43]:
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [44]:
data.shape

(53940, 10)

In [45]:
data.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [46]:
data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [47]:
data = data[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z','price']]
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [48]:
#sns.set(style="ticks", color_codes=True)
#sns.pairplot(data[['carat', 'depth', 'table', 'x', 'y', 'z','price']], kind = 'reg', plot_kws={'line_kws':{'color':'red'}})

In [49]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

## Split Dataset

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [51]:
print('X_train Shape:', X_train.shape)
print('y_train Shape:', y_train.shape)
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)

X_train Shape: (40455, 9)
y_train Shape: (40455,)
X_test Shape: (13485, 9)
y_test Shape: (13485,)


In [52]:
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [53]:
X_train_num = X_train.select_dtypes('float64')
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
35965,0.25,64.9,58.0,3.95,3.97,2.57
52281,0.84,61.8,56.0,6.04,6.07,3.74
6957,1.05,61.1,58.0,6.56,6.51,3.99
9163,1.02,60.7,56.0,6.53,6.5,3.95
50598,0.61,61.8,57.0,5.43,5.47,3.37


In [54]:
X_train_cat = X_train.select_dtypes('object')
X_train_cat.head()

Unnamed: 0,cut,color,clarity
35965,Good,E,VVS2
52281,Ideal,J,SI1
6957,Premium,J,VS2
9163,Ideal,F,SI2
50598,Ideal,F,VS1


## Data Preprocessing

In [55]:
scaler = StandardScaler()
X_train_num_transform = pd.DataFrame(scaler.fit_transform(X_train_num),columns = X_train_num.columns, index = X_train_num.index)
X_train_num_transform.head()

Unnamed: 0,carat,depth,table,x,y,z
35965,-1.156665,2.207837,0.242414,-1.589985,-1.544446,-1.365816
52281,0.086917,0.038517,-0.654923,0.27356,0.291506,0.282149
6957,0.529547,-0.451329,0.242414,0.737217,0.676181,0.634279
9163,0.466314,-0.731242,-0.654923,0.710468,0.667439,0.577938
50598,-0.397869,0.038517,-0.206254,-0.270345,-0.233052,-0.239002


In [56]:
X_train_num.describe()

Unnamed: 0,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.798764,61.744959,57.459703,5.733197,5.73657,3.539683
std,0.474442,1.429037,2.228845,1.121532,1.143835,0.709975
min,0.2,43.0,43.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,4.72,4.73,2.91
50%,0.7,61.8,57.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,6.54,6.54,4.04
max,5.01,79.0,95.0,10.74,58.9,31.8


In [57]:
print('Number of Features:', scaler.n_features_in_)
print('Average of each column', scaler.mean_)
print('Variance of each column', scaler.var_)

Number of Features: 6
Average of each column [ 0.79876356 61.7449586  57.45970337  5.73319738  5.73656977  3.53968335]
Variance of each column [0.22508969 2.04209639 4.96762745 1.25780335 1.30832582 0.50405228]


In [58]:
X_train_cat['clarity'].value_counts(normalize = True)

SI1     0.244197
VS2     0.227314
SI2     0.169448
VS1     0.151477
VVS2    0.093017
VVS1    0.067853
IF      0.033197
I1      0.013496
Name: clarity, dtype: float64

In [59]:
X_train_cat['cut'].value_counts(normalize = True)

Ideal        0.400099
Premium      0.254826
Very Good    0.225287
Good         0.089902
Fair         0.029885
Name: cut, dtype: float64

In [60]:
X_train_cat['color'].value_counts(normalize = True)

G    0.207885
E    0.181906
F    0.177407
H    0.156075
D    0.125423
I    0.099246
J    0.052058
Name: color, dtype: float64

In [61]:
X_train_cat

Unnamed: 0,cut,color,clarity
35965,Good,E,VVS2
52281,Ideal,J,SI1
6957,Premium,J,VS2
9163,Ideal,F,SI2
50598,Ideal,F,VS1
...,...,...,...
11284,Very Good,I,VS2
44732,Ideal,D,VS1
38158,Very Good,F,IF
860,Premium,J,SI1


In [62]:
cut_temp = ['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']
color_temp = ['J','I','H','G','F','E','D']
clarity_temp = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

encoder = OrdinalEncoder(categories = [cut_temp, color_temp, clarity_temp], dtype = np.int64)
X_train_cat_transform = pd.DataFrame(encoder.fit_transform(X_train_cat),columns = ['cut','color','clarity'],index = X_train_cat.index)
X_train_cat_transform.head()

Unnamed: 0,cut,color,clarity
35965,1,5,5
52281,3,0,2
6957,4,0,3
9163,3,4,1
50598,3,4,4


In [63]:
X_train_transform = pd.concat([X_train_num_transform, X_train_cat_transform], axis = 1)
X_train_transform.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
35965,-1.156665,2.207837,0.242414,-1.589985,-1.544446,-1.365816,1,5,5
52281,0.086917,0.038517,-0.654923,0.27356,0.291506,0.282149,3,0,2
6957,0.529547,-0.451329,0.242414,0.737217,0.676181,0.634279,4,0,3
9163,0.466314,-0.731242,-0.654923,0.710468,0.667439,0.577938,3,4,1
50598,-0.397869,0.038517,-0.206254,-0.270345,-0.233052,-0.239002,3,4,4


In [64]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1388,0.24,Ideal,G,VVS1,62.1,56.0,3.97,4.0,2.47
50052,0.58,Very Good,F,VVS2,60.0,57.0,5.44,5.42,3.26
41645,0.4,Ideal,E,VVS2,62.1,55.0,4.76,4.74,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,4.92,4.89,2.98
17244,1.55,Ideal,E,SI2,62.3,55.0,7.44,7.37,4.61


In [65]:
X_test_num = X_test.select_dtypes('float64')
X_test_cat = X_test.select_dtypes('object')

X_test_num_transform = pd.DataFrame(scaler.transform(X_test_num), columns = X_test_num.columns, index = X_test_num.index)
X_test_cat_transform = pd.DataFrame(encoder.transform(X_test_cat), columns = X_test_cat.columns, index = X_test_cat.index)

X_test_transform = pd.concat([X_test_num_transform, X_test_cat_transform],axis = 1)
X_test_transform.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
1388,-1.177742,0.248451,-0.654923,-1.572152,-1.518219,-1.506668,3,3,6
50052,-0.461102,-1.221088,-0.206254,-0.261429,-0.276765,-0.393939,2,4,5
41645,-0.8405,0.248451,-1.103591,-0.86775,-0.871264,-0.830579,3,5,5
42377,-0.777267,-0.661264,-0.206254,-0.725086,-0.740125,-0.788324,4,5,5
17244,1.58343,0.388407,-1.103591,1.521868,1.428047,1.50756,3,5,1


In [66]:
dump(scaler, open('models/StandardScaler.pkl', 'wb'))
dump(encoder, open('models/OrdinalEncoder.pkl', 'wb'))

## Linear Regression

In [67]:
with mlflow.start_run():
    mlflow.set_tag('Dev','Vishal')
    mlflow.set_tag('Algo','LinearReg')
    mlflow.log_param('data-path', 'data/diamonds.csv')

    linear_regressor = LinearRegression()
    linear_regressor.fit(X_train_transform,y_train)

    y_test_pred = linear_regressor.predict(X_test_transform)

    mse = metrics.mean_squared_error(y_test,y_test_pred)
    mae = metrics.mean_absolute_error(y_test,y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test,y_test_pred))

    mlflow.log_metric('MSE', mse)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('RMSE', rmse)

    mlflow.sklearn.log_model(linear_regressor, artifact_path='models')
    mlflow.log_artifact('models/StandardScaler.pkl')
    mlflow.log_artifact('models/OrdinalEncoder.pkl')

    print('Mean Squared Error :', mse)
    print('Mean Absolute Error :', mae)
    print('Root Mean Squared Error :', rmse)

Mean Squared Error : 1463942.8899984383
Mean Absolute Error : 802.0798566421087
Root Mean Squared Error : 1209.9350767700053


## KNN Regression

In [68]:
with mlflow.start_run():
    mlflow.set_tag('Dev', 'Vishal')
    mlflow.set_tag('Algo', 'KNN_Reg')
    mlflow.log_param('data-path','data/diamonds.csv')

    knn_regressor = KNeighborsRegressor()
    knn_regressor.fit(X_train_transform, y_train)

    y_test_pred = knn_regressor.predict(X_test_transform)

    mse = metrics.mean_squared_error(y_test, y_test_pred)
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric('MSE', mse)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('RMSE', rmse)

    mlflow.sklearn.log_model(knn_regressor, artifact_path='models')
    mlflow.log_artifact('models/StandardScaler.pkl')
    mlflow.log_artifact('models/OrdinalEncoder.pkl')

    print('Mean Squared Error :', mse)
    print('Mean Absolute Error :', mae)
    print('Root Mean Squared Error :', rmse)


Mean Squared Error : 522184.8879110122
Mean Absolute Error : 385.149766407119
Root Mean Squared Error : 722.6236142771784


## Decision Tree Regression

In [69]:
with mlflow.start_run():
    mlflow.set_tag('Dev','Vishal')
    mlflow.set_tag('Algo','DecisionTreeReg')
    mlflow.log_param('data-path', 'data/diamonds.csv')

    DT_regressor = DecisionTreeRegressor()
    DT_regressor.fit(X_train_transform, y_train)

    y_test_pred = DT_regressor.predict(X_test_transform)

    mse = metrics.mean_squared_error(y_test, y_test_pred)
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric('MSE', mse)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('RMSE', rmse)

    mlflow.sklearn.log_model(DT_regressor,artifact_path='models')
    mlflow.log_artifact('models/StandardScaler.pkl')
    mlflow.log_artifact('models/OrdinalEncoder.pkl')
    
    print('Mean Squared Error :', mse)
    print('Mean Absolute Error :', mae)
    print('Root Mean Squared Error :', rmse)


Mean Squared Error : 533714.0007230255
Mean Absolute Error : 362.37222840192806
Root Mean Squared Error : 730.5573219967243


## Random Forest

In [70]:
with mlflow.start_run():
    mlflow.set_tag('Dev','Vishal')
    mlflow.set_tag('Algo', 'RandomForestReg')
    mlflow.log_param('data-path', 'data/diamonds.csv')

    RF_regressor = RandomForestRegressor()
    RF_regressor.fit(X_train_transform, y_train)
    y_test_pred = RF_regressor.predict(X_test_transform)

    mse = metrics.mean_squared_error(y_test, y_test_pred)
    mae = metrics.mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric('MSE', mse)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('RMSE', rmse)

    mlflow.sklearn.log_model(DT_regressor, artifact_path='models')
    mlflow.log_artifact('models/StandardScaler.pkl')
    mlflow.log_artifact('models/OrdinalEncoder.pkl')
    
    print('Mean Squared Error :', mse)
    print('Mean Absolute Error :', mae)
    print('Root Mean Squared Error :', rmse)


Mean Squared Error : 295690.9220424437
Mean Absolute Error : 269.8774173994032
Root Mean Squared Error : 543.7746978689278


## HyperParameter Tuning of KNN

In [71]:
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run():
    tuned_parameters = [{'n_neighbors':[1,2,3,4,5,6,7], 'p':[1,2]}]

    GScv_KNN = GridSearchCV(estimator = KNeighborsRegressor(),
                            param_grid = tuned_parameters,
                            cv = 5,
                            scoring = 'neg_mean_absolute_error',
                            return_train_score = True,
                            verbose = 1
                            )

    GScv_KNN.fit(X_train_transform, y_train)

    mlflow.sklearn.autolog(disable=True)

In [72]:
dump(linear_regressor, open('models/LinearRegressor.pkl', 'wb'))
dump(knn_regressor, open('models/KNNRegressor.pkl','wb'))
dump(DT_regressor, open('models/DecisionTreeRegressor.pkl', 'wb'))
dump(RF_regressor, open('models/RandomForestRegressor.pkl', 'wb'))

In [73]:
# print('Enter the details of the Diamond')
# x = float(input('Enter the length of the Diamond '))
# y = float(input('Enter the width of the Diamond '))
# z = float(input('Enter the height of the Diamond '))
# table = float(input('Enter the table of the Diamond '))
# depth = float(input('Enter the depth of the Diamond '))
# carat = float(input('Enter the carat of the Diamond '))
# cut = input('Enter the cut of the Diamond ')
# color = input('Enter the color of the Diamond ')
# clarity = input('Enter the clarity of the Diamond ')

In [74]:
# scaler = load(open('models/StandardScaler.pkl', 'rb'))
# encoder = load(open('models/OrdinalEncoder.pkl', 'rb'))
# lr_regressor = load(open('models/LinearRegressor.pkl', 'rb'))
# knn_regressor = load(open('models/KNNRegressor.pkl', 'rb'))
# dt_regressor = load(open('models/DecisionTreeRegressor.pkl', 'rb'))
# rf_regressor = load(open('models/RandomForestRegressor.pkl', 'rb'))

In [75]:
# query_point_num = np.array([carat, depth, table, x, y , z]).reshape(1,-1)
# query_point_cat = np.array([cut,color,clarity]).reshape(1,-1)
# query_point_num_transform = scaler.transform(query_point_num)
# query_point_cat_transform = encoder.transform(query_point_cat)
# query_point_transform = np.concatenate((query_point_num_transform,query_point_cat_transform), axis = 1)

In [76]:
# print(lr_regressor.predict(query_point_transform))
# print(knn_regressor.predict(query_point_transform))
# print(dt_regressor.predict(query_point_transform))
# print(rf_regressor.predict(query_point_transform))