In [None]:
# Import the necessary libraries (numpy, pandas, scikit-learn packages metrics to start)
import numpy as np
import pandas as pd
from sklearn import metrics

In [None]:
# Load the UCI Wine dataset (https://archive.ics.uci.edu/ml/datasets/Wine).
# The file is expected in a "Datasets" folder located next to this notebook.
# A forward slash is used as the path separator because it resolves on Windows,
# macOS and Linux, whereas a hard-coded backslash only resolves on Windows.
# header=None: wine.data has no header row, so pandas numbers the columns from 0
# instead of consuming the first data row as column names. Descriptions of the
# columns are available on the dataset page linked above.
wine = pd.read_csv("Datasets/wine.data", header=None)

In [None]:
# Display the loaded DataFrame (a bare last expression is rendered by the notebook).
# To compare with the underlying NumPy array, comment out `wine` and uncomment
# the `wine.values` line below.
wine
#wine.values

In [None]:
# Perform an exploratory data analysis (EDA) on the dataset to better understand its underlying structure and key features. 
# Typical steps in EDA include examining the data’s shape and structure, checking for missing values, 
# visualizing distributions, and identifying relationships between variables through basic statistical summaries and 
# visualizations like histograms, box plots, and scatter plots.

In [None]:
# Separate the true label from the attributes the classifiers will learn from.
# Column 0 holds the wine class; columns 1..13 hold the attributes. The slice
# end is exclusive, so 1:14 stops at column 13.
wineArray = wine.values  # the whole table as a single NumPy array
wineTarget, wineData = wineArray[:, 0], wineArray[:, 1:14]

In [None]:
# Sanity-check the segmentation: print the attribute matrix first, then the
# true class labels.
for segment in (wineData, wineTarget):
    print(segment)

# To list only the distinct class labels instead:
# print(np.unique(wineTarget))

In [None]:
# Split the dataset into a training set (70%) and a test set (30%).
# random_state fixes the shuffle so that every "Restart & Run All" yields the
# same split, and therefore the same metrics in the cells that follow.
# stratify=wineTarget keeps the class proportions the same in both subsets,
# which matters for a small multi-class dataset.
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    wineData, wineTarget, test_size=0.3, random_state=42, stratify=wineTarget
)

In [None]:
# Inspect the training portion of the split: attributes first, then labels.
# The test portion can be inspected the same way with X_test / Y_test.
for part in (X_train, Y_train):
    print(part)

In [None]:
# Classifier i: logistic regression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# Reading the classification report:
#   precision (positive predictive value): of the samples predicted as a class,
#       the fraction that truly belong to it
#   recall (sensitivity): of the samples that truly belong to a class,
#       the fraction that were predicted as it
# The attributes span very different ranges (some values are below 1 while
# others are around 1000), and on such raw inputs the solver can reach its
# iteration limit before converging. Standardising the attributes inside a
# pipeline addresses this; the scaler is fitted on the training data only and
# re-applied automatically whenever lm.predict() is called.
print("LOGISTIC REGRESSION")
print("**************************************")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
lm = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lm.fit(X_train, Y_train)
predicted = lm.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))

In [None]:
# Classifier ii: K nearest neighbours (library default parameters)
# KNN classifies by distance between samples, so an attribute with a large
# numeric range would dominate the distance and drown out the others.
# Standardising the attributes inside a pipeline gives every attribute a
# comparable influence; the scaler is fitted on the training data only and
# re-applied automatically whenever model.predict() is called.
print("\n\n K nearest neighbours")
print("**************************************")
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model = make_pipeline(StandardScaler(), KNeighborsClassifier())
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))

In [None]:
# Classifier ii (variant): K nearest neighbours with distance-weighted votes
# weights='distance' makes closer neighbours count more than distant ones.
# Because both the neighbour search and the vote weights depend on distances,
# the attributes are standardised inside a pipeline so that no single
# large-range attribute dominates. The scaler is fitted on the training data
# only and re-applied automatically whenever model.predict() is called.
print("\n\n K nearest neighbours - weights: distance")
print("**************************************")
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model = make_pipeline(StandardScaler(), KNeighborsClassifier(weights='distance'))
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))

In [None]:
# Classifier iii: decision tree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# random_state is fixed because tree construction involves randomness (see the
# random_state parameter in the documentation linked above); without it the
# fitted tree, and the metrics printed below, can change from run to run.
print("\nDecision Tree")
print("**************************************")
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
# Alternative split criterion to experiment with:
#model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, Y_train)
print(model)
predicted = model.predict(X_test)
print(metrics.classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))

In [None]:
# Classifier iv: Gaussian Naive Bayes
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
# Fit on the training split, then report per-class metrics and the confusion
# matrix for the held-out test split.
from sklearn.naive_bayes import GaussianNB
print("\nNaive Bayes")
print("**************************************")
model = GaussianNB().fit(X_train, Y_train)  # fit() returns the fitted estimator
print(model)
predicted = model.predict(X_test)
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(Y_test, predicted))

In [None]:
# Classify two unseen wines with the classifier currently stored in `model`
# (i.e. whichever classifier cell was executed last). Each row supplies 13
# attribute values, one per attribute column used for training.
firstWine = [14, 2, 2, 15, 110, 2, 2, 0.5, 1.5, 4.8, 1.2, 4, 1015]
secondWine = [13, 1, 1.5, 14, 120, 3, 1, 0.3, 1.3, 4.1, 1.7, 5, 1030]
newCases = [firstWine, secondWine]
predictedNewCases = model.predict(newCases)
print(predictedNewCases)

In [None]:
# Score the predictions for the two hand-made wines against the labels we
# expect for them (both are taken to be class 1 here).
newCases = [
    [14, 2, 2, 15, 110, 2, 2, 0.5, 1.5, 4.8, 1.2, 4, 1015],
    [13, 1, 1.5, 14, 120, 3, 1, 0.3, 1.3, 4.1, 1.7, 5, 1030],
]
Y_newCases = [1, 1]
predictedNewCases = model.predict(newCases)
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(Y_newCases, predictedNewCases))

In [None]:
# Toy example showing which row indices KFold assigns to training and testing
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
import numpy as np
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4],[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 1, 2, 1, 2, 1, 2])
kf = KFold(n_splits=4)
kf.get_n_splits(X)  # returns the number of folds; the value is not used here
print(kf)
# prints: KFold(n_splits=4, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Toy-specific names are used on purpose: reusing X_train / X_test here
    # would overwrite the wine split with 2-column toy data and break any
    # classifier cell that is re-run afterwards.
    X_toy_train, X_toy_test = X[train_index], X[test_index]
    y_toy_train, y_toy_test = y[train_index], y[test_index]

In [None]:
# Show the training rows and training labels that each fold of the toy example
# selects. Toy-specific names are used so that the wine X_train / X_test are not
# overwritten with 2-column toy data.
for train_index, test_index in kf.split(X):
    X_toy_train, X_toy_test = X[train_index], X[test_index]
    print(X_toy_train)
    y_toy_train, y_toy_test = y[train_index], y[test_index]
    print(y_toy_train)

In [None]:
# 4. Segment the data in a training and test set with a random KFold split
from sklearn import model_selection
#X_train, X_test, Y_train, Y_test = model_selection.train_test_split(wineData, wineTarget, test_size = 0.30)
kf = KFold(n_splits=4)
kf.get_n_splits(wineData)
print("\n\n Naive Bayes")
print("**************************************")
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
uniqueTargets = np.unique(wineTarget)
for train_index, test_index in kf.split(wineData):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = wineData[train_index], wineData[test_index]
    Y_train, Y_test = wineTarget[train_index], wineTarget[test_index]
    model.partial_fit(X_train, Y_train, uniqueTargets)
predicted = model.predict(wineData)
print(metrics.classification_report(wineTarget, predicted))
print(metrics.confusion_matrix(wineTarget, predicted))