# Experiments using MLFlow

# Loading the Data


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df=pd.read_csv(r"C:\Users\SNEGHAL\Desktop\model\diamond_price_prediction\diamonds.csv")
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [5]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [6]:
df.isnull().sum()


carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [7]:
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']]

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


# Test Train Split

In [8]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [9]:
import sklearn

print(sklearn.__version__)

1.1.1


In [10]:
import sklearn

print(sklearn.__version__)

1.1.1


In [11]:
y = df['price']

X = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [13]:
X_train.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29477,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52542,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [14]:
print(X_train.shape, y_train.shape)

print(X_test.shape, y_test.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


In [15]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29477,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52542,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [16]:
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [17]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,cut,color,clarity
29477,Ideal,F,VS1
52542,Very Good,I,VVS2
8725,Very Good,D,SI1
15577,Very Good,H,VVS2
19651,Premium,H,SI2


In [18]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
29477,0.32,61.3,56.0,4.4,4.44,2.71
52542,0.72,61.7,55.0,5.76,5.81,3.57
8725,0.38,62.0,55.0,4.67,4.72,2.91
15577,1.0,62.6,56.0,6.36,6.39,3.99
19651,1.7,59.8,61.0,7.67,7.62,4.57


# Data Preprocessing on Training Data

In [19]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
29477,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717
52542,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156
8725,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537
15577,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234
19651,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055


In [20]:
X_train_cat['cut'].value_counts(normalize=True)

Ideal        0.400339
Premium      0.254224
Very Good    0.224006
Good         0.092033
Fair         0.029398
Name: cut, dtype: float64

In [21]:
X_train_cat['color'].value_counts(normalize=True)

G    0.210419
E    0.181736
F    0.178479
H    0.153583
D    0.122888
I    0.100244
J    0.052651
Name: color, dtype: float64

In [22]:
X_train_cat['clarity'].value_counts(normalize=True)

SI1     0.242094
VS2     0.226283
SI2     0.169765
VS1     0.152153
VVS2    0.095212
VVS1    0.068515
IF      0.032602
I1      0.013375
Name: clarity, dtype: float64

# Preparing Test Data

In [23]:
from sklearn.preprocessing import OneHotEncoder

encoder_ = OneHotEncoder(drop='first',  sparse=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_cat_ohe = pd.DataFrame(encoder_.fit_transform(X_train_cat), 
                               columns=encoder_.get_feature_names_out(X_train_cat.columns), 
                               index = X_train_cat.index)

X_train_cat_ohe.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
29477,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52542,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8725,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15577,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19651,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [24]:
X_train_cat_le = pd.DataFrame(index=X_train_cat.index)

X_train_cat_le.head()

29477
52542
8725
15577
19651


In [25]:
X_train_cat.cut.unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [26]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_cat_le['cut'] = X_train_cat['cut'].apply(lambda x : cut_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut
29477,4
52542,3
8725,3
15577,3
19651,5


In [27]:
X_train_cat.color.unique()

array(['F', 'I', 'D', 'H', 'G', 'E', 'J'], dtype=object)

In [28]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_cat_le['color'] = X_train_cat['color'].apply(lambda x : color_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color
29477,4,5
52542,3,2
8725,3,7
15577,3,3
19651,5,3


In [29]:
X_train_cat.clarity.unique()

array(['VS1', 'VVS2', 'SI1', 'SI2', 'VVS1', 'VS2', 'I1', 'IF'],
      dtype=object)

In [30]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_cat_le['clarity'] = X_train_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
29477,4,5,5
52542,3,2,6
8725,3,7,3
15577,3,3,6
19651,5,3,2


In [31]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
29477,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717,4,5,5
52542,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156,3,2,6
8725,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537,3,7,3
15577,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234,3,3,6
19651,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055,5,3,2


In [32]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
52264,Ideal,E,VS2
21073,Ideal,G,VS1
42161,Ideal,G,SI1
35974,Ideal,F,VS1
7641,Premium,G,IF


In [33]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.2
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.8,62.6,58.0,5.89,5.93,3.7


In [34]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255
21073,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696
42161,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127
35974,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146
7641,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823


In [35]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

52264
21073
42161
35974
7641


In [36]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
52264,4,6,4
21073,4,4,5
42161,4,4,3
35974,4,5,5
7641,5,4,8


In [37]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_le], axis=1)

X_test_transformed.head()



Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52264,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255,4,6,4
21073,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696,4,4,5
42161,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127,4,4,3
35974,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146,4,5,5
7641,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823,5,4,8


# Linerar Regression

In [38]:
from sklearn.linear_model import LinearRegression
li_regressor = LinearRegression()
li_regressor.fit(X_train_transformed, y_train)

In [39]:
y_test_pred = li_regressor.predict(X_test_transformed)

In [40]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,2363.83474
21073,9248,7469.644228
42161,1284,643.298938
35974,921,1516.80821
7641,4268,5721.128606


In [41]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  795.4560135879765
Mean Squared Error:  1471939.1560760005
Root Mean Squared Error:  1213.2349962295023


# K Neighbors Regressor

In [42]:
from sklearn.neighbors import KNeighborsRegressor
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train_transformed, y_train)


In [43]:
y_test_pred = knn_regressor.predict(X_test_transformed)

In [44]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1779.6
21073,9248,9000.6
42161,1284,1136.6
35974,921,960.4
7641,4268,5510.6


In [45]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  384.03119515511065
Mean Squared Error:  567200.9668570017
Root Mean Squared Error:  753.127457245453


# Decision Tree Regressor

In [46]:
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X_train_transformed, y_train)

In [47]:
y_test_pred = dt_regressor.predict(X_test_transformed)

In [48]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1625.0
21073,9248,8020.0
42161,1284,1197.0
35974,921,1031.0
7641,4268,6338.0


In [49]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  364.50704486466446
Mean Squared Error:  552707.2645532073
Root Mean Squared Error:  743.4428455188787


# Random Forest Regressor

In [50]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_transformed, y_train)

In [51]:
y_test_pred = rf_regressor.predict(X_test_transformed)

In [52]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1835.66
21073,9248,8491.68
42161,1284,1240.44
35974,921,1022.59
7641,4268,4215.38


In [53]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  268.97018128216143
Mean Squared Error:  289227.1131659189
Root Mean Squared Error:  537.7983945363903


# Serialization

In [54]:
from pickle import dump

In [55]:
dump(scaler, open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/standard_scaler.pkl', 'wb'))
dump(li_regressor, open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/linearregression.pkl', 'wb'))
dump(knn_regressor, open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/knnregression.pkl', 'wb'))
dump(dt_regressor, open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/decisiontreeregression.pkl', 'wb'))
dump(rf_regressor, open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/randomforestregression.pkl', 'wb'))

# Deserialization

In [56]:
from pickle import load

In [57]:
rf_regressor = load(open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking\randomforestregression.pkl', 'rb'))
scaler = load(open(r'C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking\standard_scaler.pkl', 'rb'))

In [58]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

In [59]:
clarity_encoder.keys()

dict_keys(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'])

In [60]:
num_scal=scaler.transform([[1.16 ,61.5 ,55.0 ,6.75 ,6.81 ,4.17]])
num_scal.flatten()



array([ 0.7595468 , -0.1774599 , -1.09544817,  0.90509846,  0.92829164,
        0.90469557])

In [61]:
cat_encod=np.array([clarity_encoder["I1"],color_encoder["J"],cut_encoder["Good"]])
cat_encod

array([1, 1, 2])

In [62]:
np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)

array([[ 1.        ,  1.        ,  2.        ,  0.7595468 , -0.1774599 ,
        -1.09544817,  0.90509846,  0.92829164,  0.90469557]])

In [63]:
rf_regressor.predict(np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)).item()



2403.51

# Experiment Tracking

Step 1 - Import MLFlow

In [64]:
import mlflow

Step 2 - Set the tracker and experiment

In [65]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment("Diamond Price Prediction")

<Experiment: artifact_location='./mlruns/1', creation_time=1663934343841, experiment_id='1', last_update_time=1663934343841, lifecycle_stage='active', name='Diamond Price Prediction', tags={}>

In [66]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import metrics

In [67]:
from pickle import dump

dump(scaler, open('C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/standard_scaler.pkl', 'wb'))

# Experiment 1 - Training KNN Classifier

Step 3 - Start a experiment run

Step 4 - Logging the metadata

Step 5 - Logging the model and other files (2 ways)

Way 1 - mlflow.<FRAMEWORK>.log_model(MODEL_OBJECT, artifact_path="PATH")

Way 2 - mlflow.log_artifact(LOCAL_PATH, artifact_path="PATH")

In [68]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Snehal")
    mlflow.set_tag("algo", "KNN")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path", r"C:\Users\SNEGHAL\Desktop\model\diamond_price_prediction\diamonds.csv")
    k = 53
    mlflow.log_param("n_neighbors", k)
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train_num_rescaled, y_train)
    y_test_pred = knn_classifier.predict(X_test_num_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(knn_classifier, artifact_path="models")
    mlflow.log_artifact("C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/standard_scaler.pkl")

# Experiment 2 -Training Decision Tree


In [69]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Kanav")
    mlflow.set_tag("algo", "DecisionTree")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path", "C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/standard_scaler.pkl")
    depth = 3
    mlflow.log_param("max_depth", depth)
    dt_classifier = DecisionTreeClassifier(max_depth = depth)
    dt_classifier.fit(X_train_num_rescaled, y_train)
    y_test_pred = dt_classifier.predict(X_test_num_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)    
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(dt_classifier, artifact_path="models")
    mlflow.log_artifact("C:/Users/SNEGHAL/AppData/Local/Programs/Python/Experiment_Tracking/standard_scaler.pkl")