# Wine prediction 
**Author**: Pierre Mulliez \
**Created**: 04/06/2021 \
**Description**: Run a machine learning model using Mlflow \
**Contact**: pierremulliez1@gmail.cim

In [19]:
#Load and import libraries 
import mlflow
import mlflow.sklearn
import pandas as pd 
import numpy as np 
import shutil  
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [20]:
#change directory 
!cd C:/Users/Pierre Computer/Documents/IE_classes/MLops/Assignment1_Pierre_Mulliez

In [21]:
#Delete last runs directory
try:
    shutil.rmtree('./mlruns')
except FileNotFoundError:
    print("WARNING: Can't find folder mlruns")

In [22]:
pathdirectory = "./winequality-red.csv"
df = pd.DataFrame(pd.read_csv(pathdirectory))

In [23]:
print("The different quality grades in the dataset are: {}".format(np.sort(df.quality.unique())))
print("Do we have any null values ? ")
print({col: np.sum(np.isnan(df.loc[:,col])) for col in df.columns})
df.head()

The different quality grades in the dataset are: [3 4 5 6 7 8]
Do we have any null values ? 
{'fixed acidity': 0, 'volatile acidity': 0, 'citric acid': 0, 'residual sugar': 0, 'chlorides': 0, 'free sulfur dioxide': 0, 'total sulfur dioxide': 0, 'density': 0, 'pH': 0, 'sulphates': 0, 'alcohol': 0, 'quality': 0}


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


**Train test split**

In [24]:
y = df.quality 
X = df.loc[:,df.columns != "quality"]

In [25]:
#Normelize the data
X = preprocessing.normalize(X)

In [26]:
#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=0)

## Run the KNN 

In [27]:
#small baseline predict with the mean
y_pred_b = []
for el in range(0,len(y_test)):
    y_pred_b.append(y_train.mean())

In [28]:
#Evaluate
print( "Mean abs error: {}".format(mean_absolute_error(y_pred_b, y_test)))
print( "Mean squared error: {}".format(mean_squared_error(y_pred_b, y_test)))
print( "r2 score: {}".format(r2_score(y_pred_b, y_test)))

Mean abs error: 0.6390050820953871
Mean squared error: 0.5752539443014328
r2 score: 0.0


**First model**

In [29]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [30]:
np.unique(y_pred)

array([3, 4, 5, 6, 7, 8], dtype=int64)

In [31]:
#Evaluate
mae = mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)
mape = np.mean(np.abs((y_test - y_pred) / y)) * 100
print( "Mean abs error: {}".format(mae))
print( "Mean squared error: {}".format(mse))
print( "r2 score: {}".format(r2))

Mean abs error: 0.578125
Mean squared error: 0.815625
r2 score: -0.4853017019082


### Optimization of the model with mlflow

#### Parameters to optimize
- P -> euclidian or mannahatan distance 
- n_neighbors 
- leaf size 

In [32]:
#error function
def calculate_errors(y,ypred):
    mae = mean_absolute_error(y,ypred)
    mse = mean_squared_error(y, ypred)
    R2 = r2_score(y,ypred)
    y,ypred = np.array(y), np.array(ypred)
    mape = np.mean(np.abs((y - ypred) / y)) * 100
    return mae,mse,R2,mape

In [33]:
#Mlflow function
def train_knn(nb,leaf,dist,exp = None):
    with mlflow.start_run(experiment_id=exp): #start mlflow run
        neigh = KNeighborsClassifier(n_neighbors=nb, leaf_size= leaf, p = dist)
        neigh.fit(X_train, y_train)
        y_pred = neigh.predict(X_test)
        
        #calculate errors
        mae,mse,R2,mape = calculate_errors(y_test,y_pred)
        errors = mae,mse,R2,mape
        print("MAE:{0:.3f}, MSE:{1:.2f}, R2:{2:.2f}".format(mae, mse, R2))
        
        #log metris and parmeters
        mlflow.log_metrics({"MAE":mae,"MSE":mse, "R2":R2, "MAPE":mape})
        mlflow.log_params({"Numbers of neighbors": nb,"Leaf size":leaf,"p":dist})
        
        #register model
        mlflow.sklearn.log_model(neigh, "model")
        
        #save error plot
        plt.figure()
        plt.bar(['mae','mse','R2','mape'],errors,color=['blue','red','green','orange']);
        plt.title("Errors")
        plt.savefig("errors.png")
        plt.close()
        mlflow.log_artifact("errors.png")

In [34]:
exp = mlflow.create_experiment(name="knn normelized")

In [35]:
p = [1,2]
nei = [3,4,5,6]
lea = [30,40]
for pdist in p:
    for nb in nei:
        for l in lea:
            train_knn(nb,l,pdist,exp)

MAE:0.588, MSE:0.76, R2:-0.33
MAE:0.588, MSE:0.76, R2:-0.33
MAE:0.544, MSE:0.69, R2:-0.21
MAE:0.544, MSE:0.69, R2:-0.21
MAE:0.550, MSE:0.67, R2:-0.17
MAE:0.550, MSE:0.67, R2:-0.17
MAE:0.553, MSE:0.73, R2:-0.28
MAE:0.553, MSE:0.73, R2:-0.28
MAE:0.578, MSE:0.82, R2:-0.42
MAE:0.578, MSE:0.82, R2:-0.42
MAE:0.597, MSE:0.83, R2:-0.46
MAE:0.597, MSE:0.83, R2:-0.46
MAE:0.609, MSE:0.80, R2:-0.40
MAE:0.609, MSE:0.80, R2:-0.40
MAE:0.637, MSE:0.89, R2:-0.55
MAE:0.637, MSE:0.89, R2:-0.55


In [36]:
#log baseline 
mlflow.log_metrics({"MAE":mae,"MSE":mse, "R2":r2, "MAPE":mape})
mlflow.log_params({"Numbers of neighbors": 0,"Leaf size":0,"p":0})
#register model
mlflow.sklearn.log_model("baseline", "model")  
mlflow.end_run()

## Best model selector 

In [41]:
df = mlflow.search_runs(experiment_ids="1")
run_id = df.loc[df['metrics.MAE'].idxmin()]['run_id']
print("Minimum error run_id: ",run_id)

Minimum error run_id:  8ac50649c1954d87b67e68177f38eff3


In [63]:
print('Best number of neighbor: ')
df.loc[df.run_id == "8ac50649c1954d87b67e68177f38eff3",'params.Numbers of neighbors']

Best number of neighbor: 


12    4
Name: params.Numbers of neighbors, dtype: object