In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [29]:
df=pd.read_csv('winequality-red.csv')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [30]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [32]:
df.isnull().sum().sum()

0

In [33]:
print(df.nunique())

fixed acidity            96
volatile acidity        143
citric acid              80
residual sugar           91
chlorides               153
free sulfur dioxide      60
total sulfur dioxide    144
density                 436
pH                       89
sulphates                96
alcohol                  65
quality                   6
dtype: int64


In [75]:
df["quality"].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [34]:
#Binart class conversion
df_B=df.copy()
df_B["quality"] = df_B["quality"].apply(lambda value : 1 if value >= 7 else 0)
df_B

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,0
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,0


In [35]:
df_B["quality"].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [36]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [71]:
#splitting for multiclass classification
y=df['quality'].copy()
X=df.drop('quality',axis=1).copy()
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [38]:
#splitting for binary classification
Y=df_B["quality"].copy()
Y

0       0
1       0
2       0
3       0
4       0
       ..
1594    0
1595    0
1596    0
1597    0
1598    0
Name: quality, Length: 1599, dtype: int64

In [39]:
#scaler
scaler=StandardScaler()
X=pd.DataFrame(scaler.fit_transform(X),columns=X.columns)

In [40]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,shuffle=True,random_state=123)
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
374,3.263541,-0.658202,1.843607,0.894790,0.032592,-0.944346,0.016184,2.466324,-1.950996,0.896120,0.353895
800,-0.643266,0.459094,-0.980669,1.036685,-0.116184,0.968269,1.871131,-0.178445,-0.395969,-0.874272,-0.960246
1441,-0.528360,1.436727,-0.415813,1.888059,0.138860,0.298854,1.567041,0.203165,-0.979104,-0.815259,-0.803801
1269,-1.619967,-0.211283,-1.237421,-0.524166,-0.923826,1.159531,1.232543,-3.151821,1.223850,0.955133,3.357647
691,0.505795,2.190902,-0.159061,0.043416,-0.009916,-0.370562,1.414996,1.618302,1.094265,-0.697233,-0.584777
...,...,...,...,...,...,...,...,...,...,...,...
1122,-1.160343,-0.323013,-1.391472,-0.807957,-0.690035,1.063900,-0.409542,-2.409802,0.899886,-1.051311,1.761904
1346,-1.275249,0.347364,-1.340122,-0.311323,-0.668781,-1.039977,-1.017721,-1.074168,1.353436,-0.579207,0.917099
1406,-0.068735,-1.607903,0.354443,1.817111,-0.541259,-0.753085,-0.744040,0.346269,-0.590348,1.663290,0.447763
1389,-0.930531,-0.267148,-1.288771,-0.240375,-0.158692,1.924577,1.962358,-0.798561,-1.367861,-0.756246,-0.678644


In [41]:
y_test

912     6
772     5
1037    5
1106    6
263     5
       ..
1466    7
580     5
1082    6
1279    7
1155    5
Name: quality, Length: 480, dtype: int64

In [42]:
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , train_size = 0.70 , random_state = 123)
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
374,3.263541,-0.658202,1.843607,0.894790,0.032592,-0.944346,0.016184,2.466324,-1.950996,0.896120,0.353895
800,-0.643266,0.459094,-0.980669,1.036685,-0.116184,0.968269,1.871131,-0.178445,-0.395969,-0.874272,-0.960246
1441,-0.528360,1.436727,-0.415813,1.888059,0.138860,0.298854,1.567041,0.203165,-0.979104,-0.815259,-0.803801
1269,-1.619967,-0.211283,-1.237421,-0.524166,-0.923826,1.159531,1.232543,-3.151821,1.223850,0.955133,3.357647
691,0.505795,2.190902,-0.159061,0.043416,-0.009916,-0.370562,1.414996,1.618302,1.094265,-0.697233,-0.584777
...,...,...,...,...,...,...,...,...,...,...,...
1122,-1.160343,-0.323013,-1.391472,-0.807957,-0.690035,1.063900,-0.409542,-2.409802,0.899886,-1.051311,1.761904
1346,-1.275249,0.347364,-1.340122,-0.311323,-0.668781,-1.039977,-1.017721,-1.074168,1.353436,-0.579207,0.917099
1406,-0.068735,-1.607903,0.354443,1.817111,-0.541259,-0.753085,-0.744040,0.346269,-0.590348,1.663290,0.447763
1389,-0.930531,-0.267148,-1.288771,-0.240375,-0.158692,1.924577,1.962358,-0.798561,-1.367861,-0.756246,-0.678644


In [43]:
Y_train

374     0
800     0
1441    0
1269    1
691     0
       ..
1122    0
1346    0
1406    0
1389    0
1534    1
Name: quality, Length: 1119, dtype: int64

In [44]:
print("Multiclass label :", y_train.unique())
print("Binary label     :",Y_train.unique())

Multiclass label : [6 5 8 3 7 4]
Binary label     : [0 1]


In [92]:
#Training Multiclass
model_1=LogisticRegression()
model_1.fit(X_train,y_train)
model_1.score(X_test,y_test)*100

56.666666666666664

In [93]:
model_2=DecisionTreeClassifier()
model_2.fit(X_train,y_train)
model_2.score(X_test,y_test)*100

58.54166666666667

In [97]:
model_3=RandomForestClassifier()
model_3.fit(X_train,y_train)
model_3.score(X_test,y_test)*100

65.0

In [82]:
model_4=GradientBoostingClassifier()
model_4.fit(X_train,y_train)
model_4.score(X_test,y_test)*100

63.33333333333333

In [83]:
print(model_1.score(X_test,y_test)*100)
print(model_2.score(X_test,y_test)*100)
print(model_3.score(X_test,y_test)*100)
print(model_4.score(X_test,y_test)*100)

56.666666666666664
56.458333333333336
65.0
63.33333333333333


In [103]:
#Training Binary
model_1=LogisticRegression()
model_1.fit(X_train,Y_train)
model_1.score(X_test,Y_test)*100

86.25

In [104]:
model_2=DecisionTreeClassifier()
model_2.fit(X_train,Y_train)
model_2.score(X_test,Y_test)*100

86.25

In [105]:
model_3=RandomForestClassifier()
model_3.fit(X_train,Y_train)
model_3.score(X_test,Y_test)*100

91.66666666666666

In [106]:
model_4=GradientBoostingClassifier()
model_4.fit(X_train,Y_train)
model_4.score(X_test,Y_test)*100

90.20833333333333

In [107]:
print(model_1.score(X_test,Y_test)*100)
print(model_2.score(X_test,Y_test)*100)
print(model_3.score(X_test,Y_test)*100)
print(model_4.score(X_test,Y_test)*100)

86.25
86.25
91.66666666666666
90.20833333333333
