# Machine Learning with Wine Composition Data

## Data source

- Kaggle https://www.kaggle.com/dell4010/wine-dataset

### Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.ensemble import ExtraTreesClassifier

### Loading csv file

In [2]:
# Read the wine-composition CSV (path is relative to the notebook's working
# directory) and print the per-column dtype / non-null summary.
csv_path = 'datasets/wine_dataset.csv'
wine = pd.read_csv(filepath_or_buffer=csv_path)
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         6497 non-null   float64
 1   volatile_acidity      6497 non-null   float64
 2   citric_acid           6497 non-null   float64
 3   residual_sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free_sulfur_dioxide   6497 non-null   float64
 6   total_sulfur_dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  style                 6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


### Exploratory Analysis

In [3]:
# Dimensions of the loaded frame as (rows, columns)
wine.shape

(6497, 13)

In [4]:
# Column labels of the loaded frame
wine.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'style'],
      dtype='object')

In [5]:
# Preview the first five rows
wine.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [6]:
# Preview the last five rows
wine.tail(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white
6496,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,white


In [7]:
# Storage dtype of the style column
wine['style'].dtype

dtype('O')

In [8]:
# Distinct labels present in the style column
wine.loc[:, 'style'].unique()

array(['red', 'white'], dtype=object)

In [9]:
# Relative frequency of each wine style (fractions of all rows)
style_column = wine['style']
style_column.value_counts(normalize=True)

white    0.753886
red      0.246114
Name: style, dtype: float64

### Preparing columns for ML

In [10]:
# Encode the wine style as an integer target: white -> 1, red -> 0.
# The dtype guard keeps this cell idempotent: a plain `.map` applied a second
# time would find no matching keys in the already-encoded column and turn
# every value into NaN.
style_codes = {'white': 1, 'red': 0}
if not pd.api.types.is_numeric_dtype(wine['style']):
    encoded_style = wine['style'].map(style_codes)
    # Fail loudly on labels missing from the mapping instead of passing NaN on.
    if encoded_style.isna().any():
        raise ValueError('Unexpected value in style column; expected white or red')
    wine['style'] = encoded_style.astype('int64')

In [11]:
# Absolute row count per encoded style (1 = white, 0 = red)
wine['style'].value_counts()

1    4898
0    1599
Name: style, dtype: int64

### Separating features and target

In [12]:
# Split the frame into the feature matrix (every column except the label)
# and the label vector (the wine style column).
x = wine.drop(columns='style')
y = wine['style']

### Machine Learning: Tree Classifier

In [13]:
# Hold out 20% of the rows as a test set (test_size=0.2).
# random_state pins the shuffle so Restart & Run All reproduces the same split;
# stratify=y keeps the white/red proportions equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
# Shapes of the training features and the training target
x_train.shape, y_train.shape

((5197, 12), (5197,))

In [15]:
# Fit an extra-trees ensemble on the training partition. random_state fixes the
# randomized tree construction so the reported score is reproducible.
model = ExtraTreesClassifier(random_state=42)
model.fit(x_train, y_train)

# Mean accuracy on the held-out test partition
result = model.score(x_test, y_test)
print('Accuracy:', result)

Accuracy: 0.9961538461538462


### Viewing held-out test data

In [16]:
# Shapes of the held-out test features and the test target
x_test.shape, y_test.shape

((1300, 12), (1300,))

In [17]:
# Look at three rows of the test features (positions 300-302), i.e. rows
# that were not part of the training partition.
x_test.iloc[300:303]

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
3381,8.5,0.21,0.26,9.25,0.034,73.0,142.0,0.9945,3.05,0.37,11.4,6
3148,8.5,0.15,0.49,1.5,0.031,17.0,122.0,0.9932,3.03,0.4,10.3,6
4315,7.8,0.25,0.34,13.7,0.044,66.0,184.0,0.99976,3.22,0.75,8.9,5


In [18]:
# True labels at positions 300-302 of the test target
y_test.iloc[300:303]

3381    1
3148    1
4315    1
Name: style, dtype: int64

### Testing the model prediction

In [19]:
# Keep the three inspected test rows and their true labels for a manual
# prediction check; one slice object guarantees both use the same positions.
sample_rows = slice(300, 303)
test = x_test.iloc[sample_rows]
answer = y_test.iloc[sample_rows]

In [20]:
# Predict the encoded style for the rows held in `test`
predicted = model.predict(test)

In [21]:
# Show the predicted labels; compare them with the true values kept in `answer`
predicted

array([1, 1, 1], dtype=int64)