#**MACHINE LEARNING PROJECT ON RED WINE DATASET**

# Classification

**Data preprocessing**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
dataset = pd.read_csv('/content/RED WINE.csv')

In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
dataset['quality'].unique()

array([5, 6, 7, 4, 8, 3])

**Separate features and target**  


In [None]:
X = dataset.iloc[: , 1:-1].values
y = dataset.iloc[:, -1].values

**Splitting data into train and test sets**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.20)

**Standardization**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()

In [None]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

**Train on different algorithms**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
l_cla = LogisticRegression()
k_cla = KNeighborsClassifier()
d_cla = DecisionTreeClassifier()
r_cla = RandomForestClassifier()
s_cla = SVC(kernel='linear')
ks_cla = SVC(kernel= 'rbf')

In [None]:
l_cla.fit(X_train, y_train)
k_cla.fit(X_train, y_train)
d_cla.fit(X_train, y_train)
r_cla.fit(X_train, y_train)
s_cla.fit(X_train, y_train)
ks_cla.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
l_pred = l_cla.predict(X_test)
k_pred = k_cla.predict(X_test)
d_pred = d_cla.predict(X_test)
r_pred = r_cla.predict(X_test)
s_pred = s_cla.predict(X_test)
ks_pred = ks_cla.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
l_c = confusion_matrix(y_test, l_pred)
k_c = confusion_matrix(y_test, k_pred)
d_c = confusion_matrix(y_test, d_pred)
r_c = confusion_matrix(y_test, r_pred)
s_c = confusion_matrix(y_test, s_pred)
ks_c = confusion_matrix(y_test, ks_pred)

In [None]:
l_c

array([[ 0,  0,  3,  0,  0,  0],
       [ 0,  0, 11,  1,  1,  0],
       [ 0,  0, 97, 36,  3,  0],
       [ 0,  0, 44, 70, 15,  2],
       [ 0,  0,  4, 21, 10,  0],
       [ 0,  0,  0,  0,  2,  0]])

In [None]:
k_c

array([[ 0,  1,  2,  0,  0,  0],
       [ 0,  0,  7,  6,  0,  0],
       [ 0,  1, 99, 33,  3,  0],
       [ 0,  0, 39, 82, 10,  0],
       [ 0,  0,  8, 17, 10,  0],
       [ 0,  0,  0,  0,  2,  0]])

In [None]:
d_c

array([[ 1,  0,  1,  1,  0,  0],
       [ 1,  2,  6,  4,  0,  0],
       [ 0,  9, 97, 25,  4,  1],
       [ 0,  6, 37, 72, 11,  5],
       [ 0,  0,  3, 17, 15,  0],
       [ 0,  0,  0,  0,  2,  0]])

In [None]:
r_c

array([[  0,   0,   3,   0,   0,   0],
       [  0,   0,   9,   4,   0,   0],
       [  0,   1, 106,  24,   5,   0],
       [  0,   1,  33,  85,  12,   0],
       [  0,   0,   2,  17,  16,   0],
       [  0,   0,   0,   0,   2,   0]])

In [None]:
s_c

array([[  0,   0,   3,   0,   0,   0],
       [  0,   0,  11,   2,   0,   0],
       [  0,   0, 101,  34,   1,   0],
       [  0,   0,  48,  82,   1,   0],
       [  0,   0,   4,  31,   0,   0],
       [  0,   0,   0,   1,   1,   0]])

In [None]:
ks_c

array([[  0,   0,   2,   1,   0,   0],
       [  0,   0,  10,   3,   0,   0],
       [  0,   0, 105,  30,   1,   0],
       [  0,   0,  43,  81,   7,   0],
       [  0,   0,   1,  24,  10,   0],
       [  0,   0,   0,   0,   2,   0]])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
l_a = accuracy_score(y_test, l_pred)
k_a = accuracy_score(y_test, k_pred)
d_a = accuracy_score(y_test, d_pred)
r_a = accuracy_score(y_test, r_pred)
s_a = accuracy_score(y_test, s_pred)
ks_a = accuracy_score(y_test, ks_pred)

**Result**

In [None]:
print('Logistic Regression: ' + str(l_a) + '\nKNN: ' + str(k_a) + '\nDecision Tree: ' + str(d_a) + '\nRandom Forest: ' + str(r_a) + '\nLinear SVC: ' + str(s_a) + '\nKernel SVC: ' + str(l_a))

Logistic Regression: 0.553125
KNN: 0.596875
Decision Tree: 0.584375
Random Forest: 0.646875
Linear SVC: 0.571875
Kernel SVC: 0.553125


#Regression

**Data preprocessing**

In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
data=pd.read_csv("/content/RED WINE.csv")

In [None]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [None]:
data. describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [None]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
data. info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


**Separate features and target**

In [None]:
X = data.iloc[: , :-1].values
Y = data.iloc[: , -1].values

In [None]:
X

array([[ 7.4  ,  0.7  ,  0.   , ...,  3.51 ,  0.56 ,  9.4  ],
       [ 7.8  ,  0.88 ,  0.   , ...,  3.2  ,  0.68 ,  9.8  ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  3.26 ,  0.65 ,  9.8  ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  3.42 ,  0.75 , 11.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  3.57 ,  0.71 , 10.2  ],
       [ 6.   ,  0.31 ,  0.47 , ...,  3.39 ,  0.66 , 11.   ]])

In [None]:
Y

array([5, 5, 5, ..., 6, 5, 6])

**Splitting Data into Train and Test sets**

In [None]:
 from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [None]:
X_train

array([[7.60e+00, 6.30e-01, 3.00e-02, ..., 3.44e+00, 6.40e-01, 1.09e+01],
       [1.15e+01, 1.80e-01, 5.10e-01, ..., 3.28e+00, 9.70e-01, 1.01e+01],
       [8.00e+00, 3.10e-01, 4.50e-01, ..., 3.15e+00, 8.10e-01, 1.25e+01],
       ...,
       [1.19e+01, 3.70e-01, 6.90e-01, ..., 3.00e+00, 6.50e-01, 1.28e+01],
       [7.10e+00, 7.50e-01, 1.00e-02, ..., 3.39e+00, 4.00e-01, 1.28e+01],
       [8.50e+00, 6.60e-01, 2.00e-01, ..., 3.13e+00, 4.80e-01, 9.20e+00]])

**Train on different algorithms**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
m_reg = LinearRegression()
p_reg = LinearRegression()
d_reg = DecisionTreeRegressor()
r_reg = RandomForestRegressor(n_estimators=500)

In [None]:
X_poly = PolynomialFeatures(degree =2)
X_poly = X_poly.fit_transform(X_train)

In [None]:
m_reg.fit(X_train, Y_train)
p_reg.fit(X_poly,Y_train)
d_reg.fit(X_train, Y_train)
r_reg.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
temp = PolynomialFeatures(degree =2)
temp = temp.fit_transform(X_test)

In [None]:
m_pred = m_reg.predict(X_test)
p_pred = p_reg.predict(temp)
d_pred = d_reg.predict(X_test)
r_pred = r_reg.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
m = r2_score(Y_test, m_pred)
p = r2_score(Y_test, p_pred)
d = r2_score(Y_test, d_pred)
r = r2_score(Y_test, r_pred)

**Result**

In [None]:
print('Logistic Regression: ' + str(m) + '\npolyfeat: ' + str(p) + '\nDecision Tree: ' + str(d) + '\nRandom Forest: ' + str(r))

Logistic Regression: 0.39192582636682416
polyfeat: 0.3444783096804007
Decision Tree: 0.17002012072434614
Random Forest: 0.5189738430583501


#**REPORT OF RED WINE DATASET**

##**1.Contents of data file**

Data file contains datasets of red_wine_quality. what are the elements needed to prepare the wine,relation between them.



##**2.Libraries imported**

i.Pandas  
ii.numpy  
iii.matplotlib

##**3.Model used**

i.Classification   
ii.Regression 

##**4.Algorithms**

1.Logistic regression  
2.KNN  
3.Decision tree  
4.Random forest  
5.Linear SVC  
6.Kernel SVC  
7.Polynomial features  


##**5.Data preprocessing**

1.describing data based on mean,median,std_dev,percentile   
2.finding null values if present  
3.checking datatypes  
4.scaling data  
5.encoding mechanisms  
6.checking quality of wine

##**6.Results**

two algorithms which have been used has given atmost equivalent results  
1.logistic regression 0.553125  
2.kernel SVC 0.553125 

Random forest gives best accuracy 0.646875

##**7.Conclusion**


according to given dataset,wine quality will be good if it cross the scale above 75%.

after testing on them the purity value obtained are less than given reference value.hence, the given the quality of wine is not good..

Since Random forest is giving more efficiency than other algorithm we can consider random forest as the best algorithm for this dataset