# Selection of important features and predicting wine quality using machine learning techniques

## Imports

In [57]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing  import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn import svm

## Data preparation

### Reading dataset

In [58]:
# Load the dataset from the CSV file in the working directory; the bare name on
# the last line renders the frame as the cell output.
data_df = pd.read_csv(filepath_or_buffer="winequality-white.csv")
data_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [59]:
# Show the column labels of the loaded frame.
data_df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [60]:
# Preview the first rows of the loaded frame.
data_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [61]:
# Show the dtype of every column.
data_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

## Cleaning dataset

In [62]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data_df.describe(include="all")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


## Split dataset


In [63]:
# Independent copy of the full frame; the less significant columns are removed
# from this copy later, while data_df keeps every feature.
data_sel = data_df.copy(deep=True)

In [64]:
# Target vectors: the "quality" column of each frame, taken before that column
# is dropped from the feature frames.
y, y_sel = data_df["quality"], data_sel["quality"]

In [65]:
# Remove the target column so that only predictors remain in the feature frames.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
data_df = data_df.drop(columns=["quality"], errors="ignore")
data_sel = data_sel.drop(columns=["quality"], errors="ignore")

In [66]:
# New dataset containing only the more significant attributes: the less
# significant attributes are removed from the copy of the complete dataset.
# A single non-inplace drop with errors="ignore" makes the cell idempotent
# (re-running it no longer raises a KeyError for already-removed columns).
data_sel = data_sel.drop(
    columns=["citric acid", "chlorides", "total sulfur dioxide"],
    errors="ignore",
)

In [67]:
# Feature matrices: X holds the full feature set, X_sel the reduced one.
X, X_sel = data_df, data_sel

In [68]:
# Hold out 20% of the rows for testing. Both calls use random_state=42, and the
# displayed splits below carry the same row indices for the full and reduced
# feature sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_sel_train, X_sel_test, y_sel_train, y_sel_test = train_test_split(
    X_sel, y_sel, test_size=0.20, random_state=42
)

In [69]:
# Inspect the training split of the full feature set.
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
4665,7.3,0.17,0.36,8.20,0.028,44.0,111.0,0.99272,3.14,0.41,12.4
1943,6.3,0.25,0.44,11.60,0.041,48.0,195.0,0.99680,3.18,0.52,9.5
3399,5.6,0.32,0.33,7.40,0.037,25.0,95.0,0.99268,3.25,0.49,11.1
843,6.9,0.19,0.35,1.70,0.036,33.0,101.0,0.99315,3.21,0.54,10.8
2580,7.7,0.30,0.26,18.95,0.053,36.0,174.0,0.99976,3.20,0.50,10.4
...,...,...,...,...,...,...,...,...,...,...,...
4426,6.2,0.21,0.52,6.50,0.047,28.0,123.0,0.99418,3.22,0.49,9.9
466,7.0,0.14,0.32,9.00,0.039,54.0,141.0,0.99560,3.22,0.43,9.4
3092,7.6,0.27,0.52,3.20,0.043,28.0,152.0,0.99129,3.02,0.53,11.4
3772,6.3,0.24,0.29,13.70,0.035,53.0,134.0,0.99567,3.17,0.38,10.6


In [71]:
# Inspect the training split of the reduced (selected) feature set.
X_sel_train

Unnamed: 0,fixed acidity,volatile acidity,residual sugar,free sulfur dioxide,density,pH,sulphates,alcohol
4665,7.3,0.17,8.20,44.0,0.99272,3.14,0.41,12.4
1943,6.3,0.25,11.60,48.0,0.99680,3.18,0.52,9.5
3399,5.6,0.32,7.40,25.0,0.99268,3.25,0.49,11.1
843,6.9,0.19,1.70,33.0,0.99315,3.21,0.54,10.8
2580,7.7,0.30,18.95,36.0,0.99976,3.20,0.50,10.4
...,...,...,...,...,...,...,...,...
4426,6.2,0.21,6.50,28.0,0.99418,3.22,0.49,9.9
466,7.0,0.14,9.00,54.0,0.99560,3.22,0.43,9.4
3092,7.6,0.27,3.20,28.0,0.99129,3.02,0.53,11.4
3772,6.3,0.24,13.70,53.0,0.99567,3.17,0.38,10.6


In [72]:
# Inspect the training targets for the full feature set.
y_train

4665    6
1943    5
3399    6
843     7
2580    5
       ..
4426    6
466     6
3092    6
3772    6
860     8
Name: quality, Length: 3918, dtype: int64

In [73]:
# Inspect the training targets for the reduced feature set.
y_sel_train

4665    6
1943    5
3399    6
843     7
2580    5
       ..
4426    6
466     6
3092    6
3772    6
860     8
Name: quality, Length: 3918, dtype: int64

## Machine Learning Algorithms

In [74]:
# Column dtypes of the full feature frame (the target column was dropped earlier).
data_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
dtype: object

In [75]:
# Column dtypes of the reduced feature frame.
data_sel.dtypes

fixed acidity          float64
volatile acidity       float64
residual sugar         float64
free sulfur dioxide    float64
density                float64
pH                     float64
sulphates              float64
alcohol                float64
dtype: object

In [77]:
# Column names of the full feature set, in the order they appear in the frame.
all_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

In [78]:
# Column names of the reduced feature set (the full set without "citric acid",
# "chlorides" and "total sulfur dioxide").
sel_features = [
    "fixed acidity",
    "volatile acidity",
    "residual sugar",
    "free sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

In [79]:
# Standardise every column listed in all_features; any column not listed would
# be discarded because remainder="drop".
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[("scaler", StandardScaler())]), all_features),
    ],
    remainder="drop",
)

In [80]:
# Same standardisation step, restricted to the columns listed in sel_features;
# any column not listed would be discarded because remainder="drop".
preprocessor_sel = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[("scaler", StandardScaler())]), sel_features),
    ],
    remainder="drop",
)

In [81]:
# Learn the scaling parameters from the training split and apply them to it.
# NOTE: X_train is rebound to the transformed output, so re-run the split cell
# before re-running this one.
X_train = preprocessor.fit_transform(X_train)

In [82]:
# Scale the test split with the preprocessor fitted on the training split
# (transform only, no refit on test data).
X_test=preprocessor.transform(X_test)

In [83]:
# Learn the scaling parameters from the reduced training split and apply them.
# NOTE: X_sel_train is rebound to the transformed output, so re-run the split
# cell before re-running this one.
X_sel_train = preprocessor_sel.fit_transform(X_sel_train)

In [84]:
# Scale the reduced test split with the preprocessor fitted on the reduced
# training split (transform only, no refit on test data).
X_sel_test=preprocessor_sel.transform(X_sel_test)

### Linear Regression

In [85]:
# Ordinary least-squares baseline trained on the scaled full feature set.
# The bare name on the last line shows the fitted estimator as the cell output.
linreg = LinearRegression().fit(X_train, y_train)
linreg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [86]:
# 5-fold cross-validated R^2 of the linear model on the training split;
# the mean over the folds is the cell output.
scores = cross_val_score(estimator=linreg, X=X_train, y=y_train, cv=5, scoring="r2")
scores.mean()

0.27712563278727187

In [87]:
# Predict quality for the held-out test split with the linear model.
# y_pred is reassigned for every model in this notebook, so it always holds
# only the most recent model's predictions.
y_pred=linreg.predict(X_test)

In [88]:
# R^2 of the linear model on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.2652750042179145

### Multi-Layer Perceptron


Results obtained for the complete dataset (all features)

In [89]:
# Multi-layer perceptron with a single hidden layer of 5 units, trained on the
# scaled full feature set. The trailing comma matters: `(5)` is just the
# integer 5, while `(5,)` is the one-element tuple of layer sizes that the
# parameter name implies (and that extends naturally to e.g. `(5, 3)`).
# random_state is fixed so the weight initialisation is reproducible.
mlp = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes=(5,))
mlp.fit(X_train, y_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=5, learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [90]:
# 5-fold cross-validated R^2 of the MLP on the full-feature training split;
# the mean over the folds is the cell output.
scores = cross_val_score(estimator=mlp, X=X_train, y=y_train, cv=5, scoring="r2")
scores.mean()

0.33506904518324804

In [104]:
# Predict quality for the held-out test split with the full-feature MLP.
# y_pred is reassigned for every model in this notebook, so it always holds
# only the most recent model's predictions.
y_pred=mlp.predict(X_test)

In [105]:
# R^2 of the full-feature MLP on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.3548502391066687

Results obtained for the dataset with the selected features

In [92]:
# Multi-layer perceptron with a single hidden layer of 5 units, trained on the
# scaled reduced feature set. The trailing comma matters: `(5)` is just the
# integer 5, while `(5,)` is the one-element tuple of layer sizes that the
# parameter name implies (and that extends naturally to e.g. `(5, 3)`).
# random_state is fixed so the weight initialisation is reproducible.
mlp_sel = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes=(5,))
mlp_sel.fit(X_sel_train, y_sel_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=5, learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=1, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [93]:
# 5-fold cross-validated R^2 of the MLP on the reduced-feature training split;
# the mean over the folds is the cell output.
scores = cross_val_score(
    estimator=mlp_sel, X=X_sel_train, y=y_sel_train, cv=5, scoring="r2"
)
scores.mean()

0.30928512702907845

In [94]:
# Predict quality for the held-out reduced-feature test split with the MLP.
# y_pred is reassigned for every model in this notebook, so it always holds
# only the most recent model's predictions.
y_pred=mlp_sel.predict(X_sel_test)

In [95]:
# R^2 of the reduced-feature MLP on the held-out test split.
r2_score(y_true=y_sel_test, y_pred=y_pred)

0.334476931839381

### Support Vector Machine

Results obtained for the complete dataset (all features)


In [96]:
# Linear-kernel support vector regressor (C=100) trained on the scaled full
# feature set; the value returned by fit is shown as the cell output.
regr = svm.SVR(C=100, kernel="linear")
regr.fit(X=X_train, y=y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [97]:
# 5-fold cross-validated R^2 of the SVR on the full-feature training split;
# the mean over the folds is the cell output.
scores = cross_val_score(estimator=regr, X=X_train, y=y_train, cv=5, scoring="r2")
scores.mean()

0.27381149850385245

In [98]:
# Predict quality for the held-out test split with the full-feature SVR.
# y_pred is reassigned for every model in this notebook, so it always holds
# only the most recent model's predictions.
y_pred=regr.predict(X_test)

In [99]:
# R^2 of the full-feature SVR on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.25955747891827285

Results obtained for the dataset with the selected features

In [100]:
# Linear-kernel support vector regressor (C=100) trained on the scaled reduced
# feature set; the value returned by fit is shown as the cell output.
regr_sel = svm.SVR(C=100, kernel="linear")
regr_sel.fit(X=X_sel_train, y=y_sel_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [101]:
# 5-fold cross-validated R^2 of the SVR on the reduced-feature training split;
# the mean over the folds is the cell output.
scores = cross_val_score(
    estimator=regr_sel, X=X_sel_train, y=y_sel_train, cv=5, scoring="r2"
)
scores.mean()

0.27463978509861914

In [102]:
# Predict quality for the held-out reduced-feature test split with the SVR.
# y_pred is reassigned for every model in this notebook, so it always holds
# only the most recent model's predictions.
y_pred=regr_sel.predict(X_sel_test)

In [103]:
# R^2 of the reduced-feature SVR on the held-out test split.
r2_score(y_true=y_sel_test, y_pred=y_pred)

0.26087538583034087