In [None]:
"""
We are going to use the Wine dataset to illustrate different
performance metrics for our algorithms.

You can find more info about it here: https://archive.ics.uci.edu/ml/datasets/wine
"""
from sklearn.datasets import load_wine

# To randomly split the data into train/test 
from sklearn.model_selection import train_test_split

"""
We will compare the performance of all of the classifiers we have seen
so far in previous weeks, so we import them all
"""

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB  # I will just use one of the Naive Bayes we know about

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
"""
We are not going to do any optimisation or tuning on them, so let's just use them
with their default options. We will save the constructor functions for all of them 
in a list (array) called "classifiers", and then call those default constructor
functions and save their instantiation (objects) in a list called "models"
Note: You have seen many different parameters
for all of these classifiers, so feel free to tune them by passing some of those
parameters to these constructors.
"""
# These are the classes themselves (constructors), not instances: calling each
# one with no arguments below yields a model with scikit-learn's default options.
classifiers = [
    LogisticRegression,
    KNeighborsClassifier,
    MultinomialNB,
    DecisionTreeClassifier,
    RandomForestClassifier,
]

# One untrained model per constructor, in the same order as `classifiers`.
models = [build_default_model() for build_default_model in classifiers]

In [None]:
"""
We will download the dataset, we will note the input features and the target
that we are going to classify (in this case we have 3 classes)
"""
# With as_frame=True the loaded bundle exposes a pandas DataFrame as `.frame`;
# that DataFrame is the only part we keep.
wine_data = load_wine(return_X_y=False, as_frame=True).frame

# Every column except the last one is an input feature; the last column is the target.
input_features = wine_data.columns[:-1].tolist()

wine_data  # last expression of the cell, so the notebook displays the table

In [None]:
"""
We will split the data into a train dataset (which 
we will use to train/fit our models) and a test dataset (which we will use to test 
them -> AND to evaluate their performance a little bit better than with just the accuracy)
"""
# Despite the name this is a fraction: 0.4 means 40% of the rows are held out for testing.
PERCENTAGE_SAMPLES_USED_FOR_TESTING = 0.4

# A fixed seed gives the same train/test rows after every "Restart & Run All", so the
# numbers discussed in this notebook can be reproduced. Set it to None if you want a
# different random split on every run.
RANDOM_SEED = 42

train, test = train_test_split(
    wine_data,
    test_size=PERCENTAGE_SAMPLES_USED_FOR_TESTING,
    random_state=RANDOM_SEED,
)

# Sanity check: every row ended up in exactly one of the two splits.
assert len(train) + len(test) == len(wine_data)

print('We are using', len(train),'samples for training the', len(models),'models and',len(test),'samples for testing them later.')

In [None]:
"""
Now we can train all of our models in a loop
"""
# Every model is trained on exactly the same training rows.
train_inputs, train_target = train[input_features], train['target']
for model in models:
    model.fit(train_inputs, train_target)

"""
With the models fitted (trained), we can ask them for predictions/estimations on the
test samples and compare those with the "real" target values in the test data.
The score function does both steps and returns the accuracy, from 0 to 1
(0 = no test sample guessed correctly, 1 = all of them guessed correctly).

Note: Some algorithms may show a warning, because we are not tuning them and the
default parameters do not always work well - feel free to tune them, or just go
with whatever accuracy you get.
"""
test_inputs, test_target = test[input_features], test['target']
for model in models:
    accuracy = model.score(test_inputs, test_target)
    print(str(model), ": ", accuracy)

In [None]:
"""
Let's look at their balanced accuracies, to understand what they are doing well and wrong...
"""
from sklearn.metrics import balanced_accuracy_score

# One balanced accuracy (a single number) per model, computed on the test set.
true_labels = test['target']
for model in models:
    predicted_labels = model.predict(test[input_features])
    print()
    print("Balanced Accuracy Scores for", str(model), ":")
    balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
    print(balanced_accuracy)



In [None]:
"""
Those accuracies don't tell us much about what classes perform better than others...
So let's look at the Confusion Matrices
"""
from sklearn.metrics import confusion_matrix

# plot_confusion_matrix was removed in scikit-learn 1.2, so importing it unconditionally
# makes this whole cell fail with an ImportError on current versions. Import it only
# where it still exists, for any code that still calls the legacy helper.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

for m in models:
    predictions = m.predict(test[input_features])
    # cm is a 2-D NumPy array: rows are the true classes and columns the predicted
    # classes, both in the order given by m.classes_
    cm = confusion_matrix(test['target'], predictions, labels=m.classes_)
    print()
    print("Confusion matrix for", str(m), ":")
    print(cm)

In [None]:

# ... however it's better to just plot it out to visualise it.
# In one call we make the predictions for the test set and plot the resulting matrix.
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

# We are doing them one by one in different cells:
print("Confusion Matrix for ", str(models[0]))
ConfusionMatrixDisplay.from_estimator(
    models[0],
    test[input_features],
    test['target'],
    labels=models[0].classes_,
    cmap='Blues',  # a colormap less psychedelic than the default
);  # the trailing ';' hides the object repr so only the figure is shown

In [None]:
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

print("Confusion Matrix for ", str(models[1]))
ConfusionMatrixDisplay.from_estimator(
    models[1],
    test[input_features],
    test['target'],
    labels=models[1].classes_,
    cmap='Blues',  # a colormap less psychedelic than the default
);  # the trailing ';' hides the object repr so only the figure is shown

In [None]:
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

print("Confusion Matrix for ", str(models[2]))
ConfusionMatrixDisplay.from_estimator(
    models[2],
    test[input_features],
    test['target'],
    labels=models[2].classes_,
    cmap='Blues',  # a colormap less psychedelic than the default
);  # the trailing ';' hides the object repr so only the figure is shown

In [None]:
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

print("Confusion Matrix for ", str(models[3]))
ConfusionMatrixDisplay.from_estimator(
    models[3],
    test[input_features],
    test['target'],
    labels=models[3].classes_,
    cmap='Blues',  # a colormap less psychedelic than the default
);  # the trailing ';' hides the object repr so only the figure is shown

In [None]:
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

print("Confusion Matrix for ", str(models[4]))
ConfusionMatrixDisplay.from_estimator(
    models[4],
    test[input_features],
    test['target'],
    labels=models[4].classes_,
    cmap='Blues',  # a colormap less psychedelic than the default
);  # the trailing ';' hides the object repr so only the figure is shown

Note: The darker the diagonal of a Confusion Matrix above, the better the model, because the diagonal holds the correctly classified samples. You can also see in each matrix which classes your models are predicting better than others.

The data is randomised so maybe this is not the case for you but in some of the runs it seems like KNN is struggling with Wine Type 2 (class "2"). And this is what mainly causes its low performance. RandomForest on the other hand is always predicting class "2" correctly, and its mistakes come from confusing classes "0" and "1"

In [None]:
"""
Let's calculate again the confusion matrices, but this time normalised:

Then, the diagonal of the confusion matrix holds the accuracies for each class!!
So we can calculate the Balanced Accuracy in this way, which is relatively simple:
  * First calculate each class' individual accuracy
  * Then average those individual class accuracies

"""
for m in models:
    predictions = m.predict(test[input_features])
    # normalize='true' divides every row by the number of true samples of that class,
    # so each diagonal entry is the fraction of that class that was predicted correctly.
    cm = confusion_matrix(test['target'], predictions, labels=m.classes_, normalize='true')
    # Convert to plain Python floats so the list below prints as 0.97 rather than
    # np.float64(0.97) on NumPy >= 2.
    per_class_accuracy = [float(x) for x in cm.diagonal()]
    # Build the labels from the classes the model was fitted on instead of hard-coding
    # three names, so the output stays aligned with the rows/columns of cm.
    class_names = ['Class ' + str(c) for c in m.classes_]
    print('\n\n')
    print("Confusion matrix for", str(m), ":")
    print(cm)
    print("  * Accuracies per class: ")
    print('    ', list(zip(class_names, [round(x, 2) for x in per_class_accuracy])))
    print("  * Balanced Accuracy for ", str(m), ":")
    # Balanced accuracy = plain average of the per-class accuracies
    print('    ', round(sum(per_class_accuracy) / len(per_class_accuracy), 4))
    print('(this should be the same balanced score value that we calculated above, using the balanced_accuracy_score function)')

Try to remember the slides of this session: many classification performance metrics such as precision, recall and f1-score are derived from the Confusion Matrix by counting right and wrong predictions in different ways (false positives, true negatives, etc.).

A very interesting function in sklearn that you can use to get a glance at these metrics, per class, is `classification_report`. Let's check it out:

In [None]:
from sklearn.metrics import classification_report

# One text report per model, built from its predictions on the test set.
true_labels = test['target']
for model in models:
    predicted_labels = model.predict(test[input_features])
    print('***** Classification report for ', str(model), '***** ')
    report = classification_report(true_labels, predicted_labels)
    print(report)

We can see there how poorly KNN performs in Class 2, across all of the metrics

If you remember from the slides, the precision, recall and f1-score metrics were designed for binary classifiers, so... why does it work in this case with 3 classes?

Well, what sklearn is doing when calculating them per class is "transforming the problem into a binary classifier" by calculating the metrics as: the current class vs all other classes. In this way the problem becomes a binary classifier, whatever the number of classes we have.

We could have also calculated each of these metrics separately. In fact, we could have calculated many other metrics, all of them available in the sklearn.metrics package (check docs here: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics). I will leave that to you...

### Try to do the following:
Can you calculate the precision, recall and f1-scores separately using their corresponding function in the metrics package?


In [None]:
from sklearn.metrics import roc_auc_score

true_labels = test['target'].tolist()
for model in models:
    # We need the class probabilities (not the predicted classes) so that the ROC
    # curve can be calculated for different thresholds, remember?
    class_probabilities = model.predict_proba(test[input_features])
    print('***** ROC Area Under the Curve for', str(model), '***** ')
    # 'ovr' = one-vs-rest: each class is scored against all of the others
    ovr_auc = roc_auc_score(y_true=true_labels, y_score=class_probabilities, multi_class='ovr')
    print(ovr_auc)

So RandomForest is not just the classifier with the highest accuracy, but also the one that better differentiates the classes between them!
And remember, ROC works for binary classifiers, so in this multi-class problem we are specifying the OVR strategy: one-vs-rest, that is one class vs the rest of classes, and then average out the AUC scores for all 3 classes.

We could plot those ROC curves so we can visualise the five performances if our classification problem was binary, just by using the code given by sklearn here:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

In short: We'd have to apply the one-vs-rest strategy to the classifier (there is a special type of meta-classifier that applies this strategy on top of any other classifier), and then we'd manually plot the data returned by the `roc_curve` function in the metrics package (the `auc` function then gives the area under each of those curves).

If you ever have a binary classification problem, just do:

```
from sklearn.metrics import RocCurveDisplay  # replaces plot_roc_curve, removed in scikit-learn 1.2
RocCurveDisplay.from_estimator(models[0], test[input_features], test['target'])
```

However if you run that code here it'll complain that your classifiers are not binary... :(

### Learning exercise:

Finally, note that we also have all of the [Regression success metrics in the `sklearn.metrics` package](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics)

You can experiment further, if you want, by following these steps:
1. load the [diabetes data, which is a regression problem](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes),
2. fit and predict a couple of regression models to that data, for example [linear regressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linear%20regression#sklearn.linear_model.LinearRegression) and [ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge) and 
3. apply some of the popular metrics such as MAE, MSE, RMSE and R^2, available in `sklearn.metrics`. You just need to apply them the same way we did with the more complex classification metrics.