In [1]:
# Import the necessary libraries (numpy, pandas, scikit-learn packages metrics to start)
import numpy as np
import pandas as pd
from sklearn import metrics

In [2]:
# Import the wine dataset available at https://archive.ics.uci.edu/ml/datasets/Wine
wine = pd.read_csv("wine.data", header=None) # in the folder datasets located in the same folder as the python .ipynb
# header = none is used as the dataset doesn't have a header in the wine.data
# of course more information about the columns are available online at https://archive.ics.uci.edu/ml/datasets/Wine

In [3]:
#wine vs. wine.values
wine
#wine.values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [4]:
# Perform an exploratory data analysis (EDA) on the dataset to better understand its underlying structure and key features. 
# Typical steps in EDA include examining the data’s shape and structure, checking for missing values, 
# visualizing distributions, and identifying relationships between variables through basic statistical summaries and 
# visualizations like histograms, box plots, and scatter plots.

In [5]:
# Segment the class of wine (first column) and remaining data (attributes) so we can use the attributes in our classifiers
# Segment the outcome (first column) and remaining data (attributes) so we can use the attributes for clustering
wineData = wine.values[:, 1:14] # remember that the last column 14 is actually excluded so only 1 to 13 will be selected
wineTarget = wine.values[:,0] # we know that the true target is in the first column for this dataset

In [6]:
# once segmented, now we can print the data to check it
print(wineData)

# we can also print the target or the true values if needed
print(wineTarget)

# we can also print the unique values of the target
# print(np.unique(wineTarget))

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]


In [7]:
# split the dataset into two subsets: training and test set
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(wineData, wineTarget, test_size = 0.3)

In [8]:
# X_train, X_test, Y_train, Y_test
print(X_train)
#print(X_test)
print(Y_train)
#print(Y_test)

[[1.296e+01 3.450e+00 2.350e+00 ... 6.800e-01 1.750e+00 6.750e+02]
 [1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.237e+01 1.170e+00 1.920e+00 ... 1.120e+00 3.480e+00 5.100e+02]
 ...
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 [1.277e+01 2.390e+00 2.280e+00 ... 5.700e-01 1.630e+00 4.700e+02]
 [1.305e+01 1.770e+00 2.100e+00 ... 8.800e-01 3.350e+00 8.850e+02]]
[3. 1. 2. 1. 3. 3. 1. 3. 3. 3. 1. 1. 1. 2. 2. 1. 1. 1. 1. 3. 2. 2. 1. 1.
 3. 1. 2. 2. 1. 1. 3. 3. 2. 2. 2. 3. 3. 2. 1. 2. 2. 1. 1. 2. 1. 1. 1. 3.
 3. 3. 2. 2. 1. 2. 1. 3. 2. 1. 1. 2. 3. 3. 1. 2. 1. 2. 2. 2. 2. 2. 1. 2.
 3. 1. 3. 1. 2. 1. 3. 2. 1. 1. 2. 2. 1. 2. 3. 2. 1. 2. 2. 3. 2. 2. 1. 2.
 2. 2. 2. 3. 3. 2. 3. 2. 3. 2. 1. 3. 2. 3. 2. 1. 2. 2. 2. 3. 1. 2. 1. 1.
 2. 1. 3. 1.]


In [2]:
# Create the following classifiers to predict the class of wine
# i. Logistic regression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# precision (also called positive predictive value) is the fraction of relevant instances among the retrieved instances
# recall (also known as sensitivity) is the fraction of the total amount of relevant instances that were actually retrieved
print("LOGISTIC REGRESSION")
print("**************************************")
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression()
lm.fit(X_train, Y_train)
predicted = lm.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))

LOGISTIC REGRESSION
**************************************


NameError: name 'X_train' is not defined

In [10]:
# Create the following classifiers to predict the class of wine
# K nearest neighbours
print("\n\n K nearest neighbours")
print("**************************************")
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))



 K nearest neighbours
**************************************
KNeighborsClassifier()
              precision    recall  f1-score   support

         1.0       0.88      0.88      0.88        16
         2.0       0.57      0.76      0.65        21
         3.0       0.60      0.35      0.44        17

    accuracy                           0.67        54
   macro avg       0.68      0.66      0.66        54
weighted avg       0.67      0.67      0.65        54

[[14  1  1]
 [ 2 16  3]
 [ 0 11  6]]


In [11]:
# Create the following classifiers to predict the class of wine
# K nearest neighbours, with different parameters if needed
print("\n\n K nearest neighbours - weights: distance")
print("**************************************")
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(weights='distance')
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))



 K nearest neighbours - weights: distance
**************************************
KNeighborsClassifier(weights='distance')
              precision    recall  f1-score   support

         1.0       0.88      0.88      0.88        16
         2.0       0.60      0.71      0.65        21
         3.0       0.62      0.47      0.53        17

    accuracy                           0.69        54
   macro avg       0.70      0.69      0.69        54
weighted avg       0.69      0.69      0.68        54

[[14  1  1]
 [ 2 15  4]
 [ 0  9  8]]


In [12]:
# Create the following classifiers to predict the class of wine
# Decision tree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
print("\nDecision Tree")
print("**************************************")
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
#model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))


Decision Tree
**************************************
DecisionTreeClassifier()
              precision    recall  f1-score   support

         1.0       1.00      0.81      0.90        16
         2.0       0.78      1.00      0.88        21
         3.0       1.00      0.82      0.90        17

    accuracy                           0.89        54
   macro avg       0.93      0.88      0.89        54
weighted avg       0.91      0.89      0.89        54

[[13  3  0]
 [ 0 21  0]
 [ 0  3 14]]


In [13]:
# 5. Create the following classifiers to predict the class of wine
# iv. Naïve Bayes
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
print("\nNaive Bayes")
print("**************************************")
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))


Naive Bayes
**************************************
GaussianNB()
              precision    recall  f1-score   support

         1.0       1.00      0.94      0.97        16
         2.0       0.95      0.90      0.93        21
         3.0       0.89      1.00      0.94        17

    accuracy                           0.94        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54

[[15  1  0]
 [ 0 19  2]
 [ 0  0 17]]


In [14]:
# once we have created the models, we can try to predict new cases using the models created
newCases = [[14,2,2,15,110,2,2,0.5,1.5,4.8,1.2,4,1015],[13,1,1.5,14,120,3,1,0.3,1.3,4.1,1.7,5,1030]]
predictedNewCases = model.predict(newCases)
print(predictedNewCases)

[1. 2.]


In [15]:
newCases = [[14,2,2,15,110,2,2,0.5,1.5,4.8,1.2,4,1015],[13,1,1.5,14,120,3,1,0.3,1.3,4.1,1.7,5,1030]]
Y_newCases = [1,1]
predictedNewCases = model.predict(newCases)
print(metrics.classification_report(Y_newCases, predictedNewCases))
print(metrics.confusion_matrix(Y_newCases, predictedNewCases))

              precision    recall  f1-score   support

         1.0       1.00      0.50      0.67         2
         2.0       0.00      0.00      0.00         0

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2

[[1 1]
 [0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# The following is an example on how we can use KFold
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4],[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 1, 2, 1, 2, 1, 2])
kf = KFold(n_splits=4)
kf.get_n_splits(X)
print(kf)
#KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

KFold(n_splits=4, random_state=None, shuffle=False)
TRAIN: [2 3 4 5 6 7] TEST: [0 1]
TRAIN: [0 1 4 5 6 7] TEST: [2 3]
TRAIN: [0 1 2 3 6 7] TEST: [4 5]
TRAIN: [0 1 2 3 4 5] TEST: [6 7]


In [17]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    print(X_train)
    y_train, y_test = y[train_index], y[test_index]
    print(y_train)

[[1 2]
 [3 4]
 [1 2]
 [3 4]
 [1 2]
 [3 4]]
[1 2 1 2 1 2]
[[1 2]
 [3 4]
 [1 2]
 [3 4]
 [1 2]
 [3 4]]
[1 2 1 2 1 2]
[[1 2]
 [3 4]
 [1 2]
 [3 4]
 [1 2]
 [3 4]]
[1 2 1 2 1 2]
[[1 2]
 [3 4]
 [1 2]
 [3 4]
 [1 2]
 [3 4]]
[1 2 1 2 1 2]


In [18]:
# 4. Segment the data in a training and test set with a random KFold split
from sklearn import model_selection
#X_train, X_test, Y_train, Y_test = model_selection.train_test_split(wineData, wineTarget, test_size = 0.30)
kf = KFold(n_splits=4)
kf.get_n_splits(wineData)
print("\n\n Naive Bayes")
print("**************************************")
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
uniqueTargets = np.unique(wineTarget)
for train_index, test_index in kf.split(wineData):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = wineData[train_index], wineData[test_index]
    Y_train, Y_test = wineTarget[train_index], wineTarget[test_index]
    model.partial_fit(X_train, Y_train, uniqueTargets)
predicted = model.predict(wineData)
print(metrics.classification_report(wineTarget, predicted))
print(metrics.confusion_matrix(wineTarget, predicted))



 Naive Bayes
**************************************
              precision    recall  f1-score   support

         1.0       1.00      0.98      0.99        59
         2.0       0.99      0.99      0.99        71
         3.0       0.98      1.00      0.99        48

    accuracy                           0.99       178
   macro avg       0.99      0.99      0.99       178
weighted avg       0.99      0.99      0.99       178

[[58  1  0]
 [ 0 70  1]
 [ 0  0 48]]
