# Model for wine quality prediction

## 1. Import data

You can download the source data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/186/wine+quality). This notebook uses the red-wine file, `winequality-red.csv`.

In [1]:
import pandas as pd

In [2]:
# The red-wine CSV is semicolon-delimited; preview the first five rows.
df = pd.read_csv('winequality-red.csv', delimiter=';')
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## 2. Exploratory data analysis (EDA)

In [3]:
# Summary statistics for every column except the last one ('quality'),
# which is the prediction target rather than a feature.
feature_columns = df.columns[:-1]
df[feature_columns].describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9


In [4]:
# Distribution of the raw quality scores, most frequent first.
quality_scores = df['quality']
quality_scores.value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [5]:
# Number of missing values per column.
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

## 3. Preprocessing data

In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
# Collapse the numeric quality score into a binary label:
#   (2, 6.5] -> 'bad'   (scores 3-6)
#   (6.5, 8] -> 'good'  (scores 7-8)
# NOTE: df['quality'] is overwritten in place, so this cell is not idempotent;
# reload the data before running it a second time.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
quality_binned = pd.cut(df['quality'], bins=bins, labels=group_names)
df['quality'] = quality_binned

In [8]:
# Label distribution after binning (absolute counts); 'good' is the minority class.
df['quality'].value_counts(normalize=False)

bad     1382
good     217
Name: quality, dtype: int64

In [9]:
# Turn the 'bad'/'good' labels into integer codes ('bad' -> 0, 'good' -> 1).
# The fitted encoder is kept so the codes can be mapped back to labels later.
encoder = LabelEncoder().fit(df['quality'])
df['quality'] = encoder.transform(df['quality'])

In [10]:
# Same distribution as before encoding, now keyed by the integer codes.
df['quality'].value_counts(sort=True)

0    1382
1     217
Name: quality, dtype: int64

In [11]:
from imblearn.over_sampling import RandomOverSampler

In [12]:
# Balance the classes by randomly duplicating rows of the minority class
# ('good' = 1) until both classes have the same number of samples.
# random_state is fixed so the resampled dataset -- and every metric computed
# from it -- is reproducible across kernel restarts.
# NOTE(review): the resampling is applied to the full dataset before the
# train/test split, so copies of the same minority row can end up in both
# sets and inflate the test scores. Consider splitting first and oversampling
# only the training portion.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)

# Features are every column except the target.
X, y = df.drop('quality', axis=1), df['quality']
X, y = oversample.fit_resample(X, y)

In [14]:
# Confirm that both classes now have the same number of rows.
y.value_counts(sort=True)

0    1382
1    1382
Name: quality, dtype: int64

In [13]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
# NOTE(review): X and y were oversampled before this split, so duplicated
# minority rows may appear on both sides of it -- verify before trusting the
# test metrics.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape

((1934, 11), (830, 11))

## 4. Build the model

In [15]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [16]:
# Standardise the features, then classify with a support vector machine.
# Keeping the scaler inside the pipeline means it is fitted on the training
# data only and re-applied automatically at prediction time.
clf_pipeline = [
    ('scaling', StandardScaler()),
    ('clf', SVC(random_state=42)),
]
pipeline = Pipeline(steps=clf_pipeline)

In [17]:
# Fit scaler + classifier on the training split, then predict the held-out rows.
# fit() returns the pipeline itself, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

## 5. The performance of the model

In [20]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [21]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[347,  79],
       [ 39, 365]], dtype=int64)

In [22]:
# Fraction of held-out rows classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8578313253012049

In [23]:
# The report is a pre-formatted string, so it has to be printed to render
# its table layout.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.81      0.85       426
           1       0.82      0.90      0.86       404

    accuracy                           0.86       830
   macro avg       0.86      0.86      0.86       830
weighted avg       0.86      0.86      0.86       830



## 6. Save the model

In [29]:
from pathlib import Path
import os
import joblib

In [30]:
# Directory and base name under which the fitted pipeline is stored.
PATH_SAVE = 'models'
MODEL_NAME = 'svc'

# Check the configured directory rather than a hard-coded copy of its name,
# so changing PATH_SAVE is enough to relocate the saved model.
if not os.path.isdir(PATH_SAVE):
    print('create path ' + PATH_SAVE)
    # makedirs also creates missing parent directories and does not fail if
    # the directory appears between the check above and this call.
    os.makedirs(PATH_SAVE, exist_ok=True)
else:
    print(PATH_SAVE + ' path is exist.')

# Full path of the serialized model file, e.g. models/svc.joblib.
PATH_FILE = Path(PATH_SAVE) / (MODEL_NAME+'.joblib')
PATH_FILE

create path models


WindowsPath('models/svc.joblib')

In [34]:
# Persist the fitted pipeline together with the feature names it was trained
# on, so a consumer of the file knows which columns to supply.
model_artifact = (pipeline, X_test.columns.tolist())
joblib.dump(model_artifact, PATH_FILE)

['models\\svc.joblib']

## 7. Testing the loaded model

In [36]:
from joblib import load

In [38]:
# Load from the same PATH_FILE the model was written to, so the save and load
# cells cannot drift apart if PATH_SAVE or MODEL_NAME changes.
# NOTE: joblib files are pickle-based -- only load artifacts you trust.
loaded_model, columns = load(PATH_FILE)

In [39]:
# Feature names that were stored alongside the pipeline when it was saved.
columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [40]:
# Select the features through the saved column list so the input matches the
# columns the pipeline was fitted on.
loaded_model.predict(X_test[columns])

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,