In [5]:
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=a279f8550631b1219bb7f26a3787f7358781b36940c55355fdd7387f23922f87
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [6]:
import numpy as np
import os
import ssl
import wget
import zipfile

import numpy as np
import pandas as pd

In [7]:
def download_and_prepare(name, path):
    """Download (if necessary) and load a ratings dataset as a user x movie matrix.

    Only the "movielens-small" dataset is supported. Unless the directory
    `path/ml-latest-small` already exists, the archive is downloaded into `path`
    and extracted there.

    Parameters:
        name: Dataset identifier; must be "movielens-small".
        path: Directory used to store and extract the archive. It is created when
              it does not exist yet.
    Returns:
        A 2-D array with one row per `userId` and one column per `movieId` holding
        the ratings; user/movie pairs without a rating are filled with 0.
    Raises:
        ValueError: If `name` is not a supported dataset.
    """
    if name != "movielens-small":
        raise ValueError(f"Unknown dataset {name!r}; supported datasets: 'movielens-small'")

    print(f"Preparing dataset {name}...")
    # Check if the data has been extracted already; if not, download and extract it.
    if os.path.exists(os.path.join(path, "ml-latest-small")):
        print(f"Dataset {name} already extracted.")
    else:
        print(f"Downloading dataset {name}...")
        if path:
            # Without an existing directory the download would be written to a file
            # literally named `path` instead of into that directory.
            os.makedirs(path, exist_ok=True)
        url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
        # NOTE(review): certificate verification is disabled for this download, which
        # allows a man-in-the-middle to tamper with the archive. The override is kept
        # for compatibility but is now limited to this one request instead of being
        # left installed for the whole process. Consider dropping it entirely.
        default_context_factory = ssl._create_default_https_context
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            # Use the file name reported by wget: when a (possibly partial) archive
            # from an earlier run exists, the new file gets a " (1)" style suffix.
            zip_path = wget.download(url, path)
        finally:
            ssl._create_default_https_context = default_context_factory
        print(f"Extracting dataset {name}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(path)

    # Read dataset with pandas
    ratings = pd.read_csv(os.path.join(path, 'ml-latest-small', 'ratings.csv'))
    print(f"{len(ratings)} entries read.")
    r_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    # The full matrix is returned (no subsampling).
    return np.array(r_matrix)

In [28]:
def confusion_matrix(y_true, y_pred, normalize=None):
    """Computes the confusion matrix from predictions and labels.
    The matrix columns represent the real labels and the rows represent the
    prediction labels. The confusion matrix is always a 2-D array of shape `[n_labels, n_labels]`,
    where `n_labels` is one more than the largest label found in either input. Both
    prediction and labels must be 1-D arrays of the same shape in order for this
    function to work.
    Parameters:
        y_true: 1-D array of real labels (non-negative integers) for the classification task.
        y_pred: 1-D array of predictions (non-negative integers) for a given classification.
        normalize: One of ['true', 'pred', 'all', None], corresponding to column sum, row sum, matrix sum, or no
                   normalization.
    Returns:
        A 2-D float array with shape `[n_labels, n_labels]` representing the confusion
        matrix. When normalizing, rows/columns whose sum is zero (labels that never
        occur) are returned as zeros instead of NaN.
    Raises:
        ValueError: If `normalize` is not a valid option, or if the inputs are empty,
                    not 1-D, of different shapes, or not non-negative integers.
    """

    if normalize not in ['true', 'pred', 'all', None]:
        raise ValueError("normalize must be one of {'true', 'pred', 'all', None}")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Validate explicitly: broadcasting and zip() would otherwise silently truncate
    # inputs of different length, and negative labels would wrap around as indices.
    if y_true.ndim != 1 or y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must be 1-D arrays of the same shape")
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if not (np.issubdtype(y_true.dtype, np.integer) and np.issubdtype(y_pred.dtype, np.integer)):
        raise ValueError("y_true and y_pred must contain integer labels")
    if min(y_true.min(), y_pred.min()) < 0:
        raise ValueError("y_true and y_pred must contain non-negative labels")

    n_labels = int(max(y_true.max(), y_pred.max())) + 1

    # Rows are indexed by the prediction, columns by the real label. `np.add.at`
    # accumulates repeated (row, column) pairs, unlike plain fancy-index assignment.
    cm = np.zeros((n_labels, n_labels))
    np.add.at(cm, (y_pred, y_true), 1)

    if normalize == 'true':
        totals = cm.sum(axis=0, keepdims=True)
    elif normalize == 'pred':
        totals = cm.sum(axis=1, keepdims=True)
    elif normalize == 'all':
        totals = cm.sum()
    else:
        return cm

    # Divide only where the total is non-zero; the remaining entries keep the 0 from `out`.
    return np.divide(cm, totals, out=np.zeros_like(cm), where=totals != 0)

In [30]:
def precision(y_true, y_pred):
    """Precision for label 1: share of the predictions of label 1 whose real label is 1.

    Read from the row-normalized confusion matrix (rows are predictions, columns
    are real labels), so entry [1, 1] is already divided by the number of
    predictions of label 1.
    """
    row_normalized = confusion_matrix(y_true, y_pred, normalize='pred')
    return row_normalized[1, 1]

In [31]:
def recall(y_true, y_pred):
    """Recall for label 1: share of the samples with real label 1 that were predicted as 1.

    Read from the column-normalized confusion matrix (rows are predictions, columns
    are real labels), so entry [1, 1] is already divided by the number of samples
    whose real label is 1.
    """
    column_normalized = confusion_matrix(y_true, y_pred, normalize='true')
    return column_normalized[1, 1]

In [33]:
def false_alarm_rate(y_true, y_pred):
    """False alarm rate: share of the samples with real label 0 that were predicted as 1.

    Read from the column-normalized confusion matrix (rows are predictions, columns
    are real labels): entry [1, 0] is the count of "predicted 1, real 0" divided by
    the number of samples whose real label is 0.

    Parameters:
        y_true: 1-D array of real labels.
        y_pred: 1-D array of predicted labels.
    Returns:
        The false alarm rate; 0.0 when label 1 occurs in neither input.
    """
    column_normalized = confusion_matrix(y_true, y_pred, normalize='true')
    if column_normalized.shape[0] < 2:
        # Only label 0 occurs in both inputs, so the matrix is 1x1 and has no row for
        # "predicted 1": nothing was flagged, hence there are no false alarms.
        return 0.0
    return column_normalized[1, 0]

In [34]:
import numpy as np

# Seed the global RNG so the random labels below are the same on every run, and
# print arrays with two fixed decimals.
np.random.seed(42)
np.set_printoptions(precision=2, floatmode='fixed')

# Part I
banner = "------------------------------------------------"
print(banner)
print("Part I - Confusion matrix")
print(banner)

# Random binary labels: 20 real labels first, then 20 predictions (draw order matters
# for reproducing the same arrays from the seed).
y_true = np.random.randint(0, 2, 20)
y_pred = np.random.randint(0, 2, 20)

print(y_true)
print(y_pred)

# Number of labels inferred from the data (largest label + 1).
print(np.max(np.maximum(y_true,y_pred))+1)

# Show the confusion matrix once per normalization mode.
for heading, mode in (
    ("Unnormalized confusion matrix:", None),
    ("Matrix sum normalization:", 'all'),
    ("Row sum normalization:", 'pred'),
    ("Column sum normalization:", 'true'),
):
    print(heading)
    cm = confusion_matrix(y_true, y_pred, normalize=mode)
    print(cm)


------------------------------------------------
Part I - Confusion matrix
------------------------------------------------
[0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0]
[1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 0]
2
Unnormalized confusion matrix:
[[4.00 3.00]
 [9.00 4.00]]
Matrix sum normalization:
[[0.20 0.15]
 [0.45 0.20]]
Row sum normalization:
[[0.57 0.43]
 [0.69 0.31]]
Column sum normalization:
[[0.31 0.43]
 [0.69 0.57]]


In [35]:
# Evaluate the three binary metrics on the random labels from Part I and report them
# with two decimals.
precision_value = precision(y_true, y_pred)
recall_value = recall(y_true, y_pred)
false_alarm_value = false_alarm_rate(y_true, y_pred)
print(f"Precision: {precision_value:.2f}, recall: {recall_value:.2f}"
      f", false alarm rate: {false_alarm_value:.2f}")

Precision: 0.31, recall: 0.57, false alarm rate: 0.69
