* Red Wine Quality Using Classification Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('/content/winequality-red.csv')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,-1].values

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20)

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
sc = StandardScaler()

In [9]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [11]:
l_cla = LogisticRegression()
k_cla = KNeighborsClassifier()
d_cla = DecisionTreeClassifier()
r_cla = RandomForestClassifier()
s_cla = SVC(kernel='linear')
ks_cla= SVC(kernel='rbf')

In [12]:
l_cla.fit(X_train, y_train)
k_cla.fit(X_train, y_train)
d_cla.fit(X_train, y_train)
r_cla.fit(X_train, y_train)
s_cla.fit(X_train, y_train)
ks_cla.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
l_pred = l_cla.predict(X_test)
k_pred = k_cla.predict(X_test)
d_pred = d_cla.predict(X_test)
r_pred = r_cla.predict(X_test)
s_pred = s_cla.predict(X_test)
ks_pred = ks_cla.predict(X_test)

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
l_c = confusion_matrix(y_test,l_pred)
k_c = confusion_matrix(y_test, k_pred)
d_c = confusion_matrix(y_test, d_pred)
r_c = confusion_matrix(y_test, r_pred)
s_c = confusion_matrix(y_test, s_pred)
ks_c = confusion_matrix(y_test, ks_pred)


In [16]:
l_c

array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  0,  4,  3,  0,  0],
       [ 1,  0, 92, 43,  2,  0],
       [ 0,  0, 45, 76, 13,  0],
       [ 0,  0,  6, 19, 12,  0],
       [ 0,  0,  0,  1,  2,  0]])

In [17]:
k_c

array([[ 0,  0,  0,  1,  0,  0],
       [ 0,  0,  6,  1,  0,  0],
       [ 1,  3, 88, 41,  5,  0],
       [ 0,  0, 35, 80, 19,  0],
       [ 0,  0,  9, 12, 16,  0],
       [ 0,  0,  0,  1,  2,  0]])

In [18]:
d_c

array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  1,  4,  2,  0,  0],
       [ 0,  4, 96, 33,  5,  0],
       [ 0,  3, 32, 81, 17,  1],
       [ 0,  0,  2, 12, 22,  1],
       [ 0,  0,  0,  2,  0,  1]])

In [19]:
r_c

array([[  0,   0,   1,   0,   0,   0],
       [  0,   0,   5,   2,   0,   0],
       [  0,   1, 102,  33,   2,   0],
       [  0,   0,  35,  88,  11,   0],
       [  0,   0,   3,  12,  22,   0],
       [  0,   0,   0,   2,   1,   0]])

In [20]:
s_c

array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  0,  5,  2,  0,  0],
       [ 0,  0, 94, 44,  0,  0],
       [ 0,  0, 46, 88,  0,  0],
       [ 0,  0,  5, 32,  0,  0],
       [ 0,  0,  0,  3,  0,  0]])

In [21]:
ks_c

array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  0,  6,  1,  0,  0],
       [ 0,  0, 95, 42,  1,  0],
       [ 0,  0, 45, 77, 12,  0],
       [ 0,  0,  5, 21, 11,  0],
       [ 0,  0,  0,  3,  0,  0]])

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
l_a = accuracy_score(y_test, l_pred)
k_a = accuracy_score(y_test, k_pred)
d_a = accuracy_score(y_test, d_pred)
r_a = accuracy_score(y_test, r_pred)
s_a = accuracy_score(y_test, s_pred)
ks_a = accuracy_score(y_test, ks_pred)

In [24]:
print('Logistic Regression: ' + str(l_a) + '\\nKNN: ' + str(k_a) + '\\nDecision Tree: ' + str(d_a) + '\\nRandom Forest: ' + str(r_a) + '\\nLinear SVC: ' + str(s_a) + '\\nKernel SVC: ' + str(l_a))

Logistic Regression: 0.5625\nKNN: 0.575\nDecision Tree: 0.628125\nRandom Forest: 0.6625\nLinear SVC: 0.56875\nKernel SVC: 0.5625


* Red Wine Quality Using Regression Model

In [25]:
import pandas as pd
import numpy as np

In [26]:
dataset = pd.read_csv('/content/winequality-red.csv')

In [27]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [28]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [32]:
m_reg = LinearRegression()
p_reg = LinearRegression()
d_reg = DecisionTreeRegressor()
r_reg = RandomForestRegressor()

In [33]:
X_poly = PolynomialFeatures(degree = 5)
X_poly = X_poly.fit_transform(X_train)

In [34]:
m_reg.fit(X_train, y_train)
p_reg.fit(X_train, y_train)
d_reg.fit(X_train, y_train)
r_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [35]:
temp = PolynomialFeatures(degree = 5)
temp = temp.fit_transform(X_test)

In [36]:
m_pred = m_reg.predict(X_test)
p_pred = p_reg.predict(X_test)
d_pred = d_reg.predict(X_test)
r_pred = r_reg.predict(X_test)

In [37]:
from sklearn.metrics import r2_score

In [38]:
m = r2_score(y_test, m_pred)
p = r2_score(y_test, p_pred)
d = r2_score(y_test, d_pred)
r = r2_score(y_test, r_pred)

In [39]:
print(m, p, d, r)

0.34024214913684203 0.34024214913684203 0.033864541832669404 0.5045328685258963


By comparing both models the Regression model is less compared to theClassification model. 

So from the above results, We can conclude that Classification model is best fit for our dataset.

Thank you

CHEERALA RACHANA