# Wine Quality Prediction

Vaibhavi Powar

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [5]:
# Read the wine quality CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='Winequality.csv')

In [6]:
# Display the loaded DataFrame; pandas abbreviates long frames, showing only
# the first and last rows.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [7]:
# Count missing values per column (isna performs the same check as isnull)
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
# Column names, non-null counts, dtypes and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [10]:
# (rows, columns) of the dataset
df.shape

(1599, 12)

In [11]:
# include='all' asks describe() to summarise every column regardless of dtype.
# NOTE(review): all 12 columns are numeric (see the df.info() output), so this
# shows the same table as the plain df.describe() call earlier.
df.describe(include='all')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [12]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    test_size=0.25,
    random_state=40,
)

In [13]:
# Min-max scale the features: learn each column's min/max from the training
# split only, then apply that same mapping to both splits
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Keep the full target column (wine quality score) for inspection.
# NOTE(review): `y` is not referenced by any later cell visible here; the
# model is trained on y_train from the train/test split.
y =df['quality']
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [15]:
# Echo the scaler object to show the parameters it was created with
scaler

MinMaxScaler()

In [16]:
# Inspect the scaled training features (a NumPy array after scaling)
X_train

array([[0.0733945 , 0.51239669, 0.        , ..., 1.        , 0.15568862,
        0.73214286],
       [0.26605505, 0.21487603, 0.57      , ..., 0.48818898, 0.13173653,
        0.53571429],
       [0.16513761, 0.4214876 , 0.21      , ..., 0.66141732, 0.19760479,
        0.25      ],
       ...,
       [0.24770642, 0.43801653, 0.        , ..., 0.51181102, 0.08383234,
        0.28571429],
       [0.29357798, 0.33884298, 0.33      , ..., 0.44094488, 0.16167665,
        0.19642857],
       [0.40366972, 0.39669421, 0.29      , ..., 0.47244094, 0.14371257,
        0.28571429]])

In [17]:
# Inspect the test features, scaled with the scaler fitted on the training split
X_test

array([[0.50458716, 0.20661157, 0.34      , ..., 0.33858268, 0.19161677,
        0.39285714],
       [0.09174312, 0.15702479, 0.37      , ..., 0.45669291, 0.1497006 ,
        0.14285714],
       [0.44036697, 0.31404959, 0.34      , ..., 0.43307087, 0.11377246,
        0.41071429],
       ...,
       [0.39449541, 0.23140496, 0.32      , ..., 0.50393701, 0.26347305,
        0.375     ],
       [0.28440367, 0.30578512, 0.26      , ..., 0.51181102, 0.18562874,
        0.21428571],
       [0.28440367, 0.34710744, 0.26      , ..., 0.40944882, 0.15568862,
        0.23214286]])

In [18]:
# Sanity-check the scaling instead of repeating it.
# The features were already min-max scaled in the earlier "Scale the features"
# cell, with the scaler fitted on the training split only. Creating and fitting
# a second MinMaxScaler here would learn its min/max from already-scaled data
# and overwrite `scaler`, so it could no longer be used to transform raw
# feature values. Only verify the expected [0, 1] range of the training
# features; the small tolerance absorbs floating-point rounding. X_test is not
# checked because unseen rows may legitimately fall outside the training range.
_train_values = np.asarray(X_train)
assert _train_values.min() >= -1e-9 and _train_values.max() <= 1 + 1e-9, (
    "X_train is expected to be min-max scaled to [0, 1] at this point"
)

In [19]:
# Create the model
# LogisticRegression is instantiated with no arguments, i.e. with
# scikit-learn's default hyperparameters; the target holds several quality
# classes, so this is a multiclass problem.
model = LogisticRegression()

In [20]:
# Train the model
# Fit on the scaled training features and their quality labels
model.fit(X_train, y_train)

LogisticRegression()

In [21]:
# Make predictions
# Predict a quality class for every row of the scaled test set, then display
# the predicted labels
y_pred = model.predict(X_test)
y_pred

array([6, 5, 6, 6, 5, 5, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 6, 5, 5, 5, 5, 5,
       5, 5, 7, 5, 6, 5, 6, 5, 5, 6, 5, 5, 5, 6, 7, 5, 6, 5, 5, 6, 5, 6,
       6, 5, 6, 6, 6, 5, 6, 5, 5, 6, 6, 5, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5,
       5, 6, 6, 6, 5, 5, 5, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 5,
       6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 6, 5, 5, 6, 5, 6, 7, 6, 6, 5, 5, 6,
       7, 5, 6, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 5, 6, 6, 5, 6,
       7, 5, 5, 6, 5, 6, 5, 5, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6,
       6, 5, 5, 7, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5, 5, 5,
       6, 5, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6, 6, 7, 6, 5, 5, 5, 5, 5, 5, 5,
       6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 6, 7, 6, 5, 6, 6, 5, 6, 7, 5, 6, 6,
       6, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 5, 7, 6,
       6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 7, 6, 5, 6, 6, 5, 5, 6, 5,
       5, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6,

In [22]:
# Evaluate on the held-out test set; for a classifier, score() returns the
# mean accuracy (fraction of correctly predicted labels)
model.score(X_test, y_test)

0.59

In [23]:
# Confusion table: rows are the true quality labels (y_test), columns are the
# predicted labels (y_pred); cells where the row and column labels match count
# the correct predictions
pd.crosstab(y_test, y_pred)

col_0,5,6,7
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,4,0,0
4,10,1,0
5,126,33,1
6,57,102,5
7,3,43,8
8,0,4,3
