<a href="https://colab.research.google.com/github/neworldemancer/DSF5/blob/master/Course_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to machine learning & Data Analysis

Basic introduction on how to perform typical machine learning tasks with Python.

Prepared by Mykhailo Vladymyrov & Aris Marcolongo,
Data Science Lab, University Of Bern, 2023

This work is licensed under <a href="https://creativecommons.org/share-your-work/public-domain/cc0/">CC0</a>.

# Part 2: Unsupervised Machine Learning

# Load libraries

In [0]:
from sklearn import tree
from sklearn import ensemble

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from matplotlib import  pyplot as plt
import seaborn as sns
#sns.set()

from time import time as timer
from imageio import imread
import pandas as pd
import numpy as np
import os

from sklearn.manifold import TSNE

import tensorflow as tf
import tarfile

%matplotlib inline
from matplotlib import animation
from IPython.display import HTML

In [0]:
pip install umap-learn

In [0]:
import umap

In [0]:
import os
import tensorflow as tf
import tarfile
import os
from packaging import version
import shutil
import sys 

def download_and_extract_data(
    url="https://github.com/neworldemancer/DSF5/raw/master/colab_material.tgz",
    target_dir="data",
    fname="colab_material.tgz",
    update_folder=False
):
    """Download a gzip-compressed tar archive and unpack it into ``target_dir``.

    Nothing is downloaded when ``target_dir`` already exists, unless
    ``update_folder`` is true, in which case the directory is removed first.

    Args:
        url: Location of the ``.tgz`` archive.
        target_dir: Directory the archive is extracted into.
        fname: File name given to the downloaded archive.
        update_folder: If true, delete an existing ``target_dir`` and fetch again.

    Returns:
        Absolute path of ``target_dir``.
    """
    if update_folder and os.path.exists(target_dir):
        shutil.rmtree(target_dir)

    if os.path.exists(target_dir):
        print('Data already present. Use update_folder = True to overwrite/update if desired.')
        return os.path.abspath(target_dir)

    cache_dir = os.path.abspath(".")
    if version.parse(tf.__version__) >= version.parse("2.13.0"):
        # new behavior: fname must be only a filename
        path = tf.keras.utils.get_file(fname=fname, origin=url, cache_dir=cache_dir)
    else:
        # old behavior: can pass full path
        path = tf.keras.utils.get_file(fname=os.path.join(cache_dir, fname), origin=url)

    # Remember whether extraction is what creates target_dir, so that a failed
    # extraction can be rolled back; otherwise a half-filled directory would
    # make the next call report "Data already present".
    created_here = not os.path.exists(target_dir)
    try:
        with tarfile.open(path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse absolute paths,
                # ".." components and links pointing outside target_dir.
                tar.extractall(target_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(target_dir)
    except Exception:
        if created_here:
            shutil.rmtree(target_dir, ignore_errors=True)
        raise

    return os.path.abspath(target_dir)

# Fetch the course material (skipped when the folder already exists) and add
# the extracted folder to the import path -- presumably so that the `utils`
# package imported in the next cell can be found there.
data_path = download_and_extract_data(update_folder=False)
sys.path.append(data_path)
print("Data available at:", data_path)

In [0]:
from utils.routines import *

# Datasets

In this course we will use several synthetic and real-world datasets to illustrate the behavior of the models and exercise our skills.

## 1. Synthetic linear

In [0]:
def get_linear(n_d=1, n_points=10, w=None, b=None, sigma=5):
  """Generate a noisy linear dataset ``y = x . w + b + noise``.

  Args:
    n_d: Number of input features.
    n_points: Number of samples.
    w: Slopes, one per feature; drawn from U(0.1, 10) when ``None``.
    b: Intercept; drawn from U(-10, 10) when ``None``.
    sigma: Standard deviation of the Gaussian noise added to ``y``.

  Returns:
    Tuple ``(x, y)``: ``x`` has shape ``(n_points, n_d)`` with entries drawn
    from U(0, 10); ``y`` has shape ``(n_points,)``.
  """
  x = np.random.uniform(0, 10, size=(n_points, n_d))

  # Compare against None explicitly: `w or ...` raises for arrays with more
  # than one element, and `b or ...` would replace a legitimate b=0 with a
  # random intercept.
  if w is None:
    w = np.random.uniform(0.1, 10, n_d)
  if b is None:
    b = np.random.uniform(-10, 10)
  y = np.dot(x, w) + b + np.random.normal(0, sigma, size=n_points)

  print('true slopes: w =', w, ';  b =', b)

  return x, y

In [0]:
x, y = get_linear(n_d=1, sigma=0)
plt.plot(x[:, 0], y, '*')
plt.show()

In [0]:
n_d = 2
x, y = get_linear(n_d=n_d, n_points=100)

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:,0], x[:,1], y, marker='x', color='b',s=40)
plt.show()

## 2. House prices

Subset of the Ames Houses dataset: http://jse.amstat.org/v19n3/decock.pdf

In [0]:
def house_prices_dataset(return_df=False, return_df_xy=False, price_max=400000, area_max=40000,
                         path='data/AmesHousing.csv'):
  """Load a numeric subset of the Ames housing data.

  Column names are stripped of spaces and slashes, rows with a missing value
  in any selected feature or in the target are dropped, and ordered
  categorical columns are mapped to integer codes.

  Args:
    return_df: If true, additionally return the cleaned DataFrame. Note that
      this DataFrame is NOT restricted by ``price_max`` / ``area_max``, so its
      rows only line up with ``x`` and ``y`` when both filters are disabled.
    return_df_xy: Accepted for backward compatibility; currently unused.
    price_max: Keep only samples with ``SalePrice < price_max``; a value
      ``<= 0`` disables the filter.
    area_max: Keep only samples with ``LotArea < area_max``; a value ``<= 0``
      disables the filter.
    path: Location of the CSV file (defaults to the course data folder).

  Returns:
    ``(x, y)`` or ``(x, y, df)``: ``x`` is a float32 array with one column per
    entry of ``useful_fields`` (in that order) and ``y`` is the float32
    ``SalePrice`` vector.
  """
  df = pd.read_csv(path, na_values=('NaN', ''), keep_default_na=False)

  # e.g. 'Year Remod/Add' -> 'YearRemodAdd'
  rename_dict = {k:k.replace(' ', '').replace('/', '') for k in df.keys()}
  df.rename(columns=rename_dict, inplace=True)

  useful_fields = ['LotArea',
                  'Utilities', 'OverallQual', 'OverallCond',
                  'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond',
                  'HeatingQC', 'CentralAir', 'Electrical',
                  '1stFlrSF', '2ndFlrSF','GrLivArea',
                  'FullBath', 'HalfBath',
                  'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
                  'Functional','PoolArea',
                  'YrSold', 'MoSold'
                  ]
  target_field = 'SalePrice'

  df.dropna(axis=0, subset=useful_fields+[target_field], inplace=True)

  # Integer codes for ordered categorical columns. Entries for columns that
  # are not in `useful_fields` have no effect on the returned arrays.
  cleanup_nums = {'Street':      {'Grvl': 0, 'Pave': 1},
                  'LotFrontage': {'NA':0},
                  'Alley':       {'NA':0, 'Grvl': 1, 'Pave': 2},
                  'LotShape':    {'IR3':0, 'IR2': 1, 'IR1': 2, 'Reg':3},
                  'Utilities':   {'ELO':0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3},
                  'LandSlope':   {'Sev':0, 'Mod': 1, 'Gtl': 3},
                  'ExterQual':   {'Po':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex':4},
                  'ExterCond':   {'Po':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex':4},
                  'BsmtQual':    {'NA':0, 'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex':5},
                  'BsmtCond':    {'NA':0, 'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex':5},
                  'BsmtExposure':{'NA':0, 'No':1, 'Mn': 2, 'Av': 3, 'Gd': 4},
                  'BsmtFinType1':{'NA':0, 'Unf':1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ':5, 'GLQ':6},
                  'BsmtFinType2':{'NA':0, 'Unf':1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ':5, 'GLQ':6},
                  'HeatingQC':   {'Po':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex':4},
                  'CentralAir':  {'N':0, 'Y': 1},
                  'Electrical':  {'':0, 'NA':0, 'Mix':1, 'FuseP':2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5},
                  'KitchenQual': {'Po':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex':4},
                  'Functional':  {'Sal':0, 'Sev':1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2':5, 'Min1':6, 'Typ':7},
                  'FireplaceQu': {'NA':0, 'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex':5},
                  'PoolQC':      {'NA':0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex':4},
                  'Fence':       {'NA':0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv':4},
                  }

  # convert ordered categorical variables to numerical codes
  df_X = df[useful_fields].replace(cleanup_nums)
  df_Y = df[target_field]

  x = df_X.to_numpy().astype(np.float32)
  y = df_Y.to_numpy().astype(np.float32)

  # Build one row mask for both optional filters; column 0 of x is 'LotArea'.
  keep = np.ones(len(y), dtype=bool)
  if price_max>0:
    keep &= y<price_max
  if area_max>0:
    keep &= x[:,0]<area_max
  x = x[keep]
  y = y[keep]

  return (x, y, df) if return_df else (x,y)

In [0]:
def house_prices_dataset_normed():
    """Return the standardized feature matrix of the house-price dataset.

    The features are loaded with both the price and the area filter disabled
    and then passed through ``StandardScaler.fit_transform``; the target
    values are discarded.
    """
    features, _ = house_prices_dataset(return_df=False, price_max=-1, area_max=-1)
    return StandardScaler().fit_transform(features)

In [0]:
x, y, df = house_prices_dataset(return_df=True)
print(x.shape, y.shape)
df.head()

In [0]:
plt.plot(x[:, 0], y, '.')
plt.xlabel('area, sq.ft')
plt.ylabel('price, $');
plt.show()

## 3. Blobs

In [0]:
# 1000 points generated around three centres: (0, 0), (5, 5) and (10, 0);
# y holds the integer label (0, 1 or 2) of each point.
x, y = make_blobs(n_samples=1000, centers=[[0,0], [5,5], [10, 0]])
# one matplotlib colour code per label: yellow, green, red
colors = "ygr"
for i, color in enumerate(colors):
    # boolean mask selecting the points with label i
    idx = y == i
    plt.scatter(x[idx, 0], x[idx, 1], c=color, edgecolor='gray', s=25)
plt.show()

## 4. MNIST

The MNIST database of handwritten digits has a training set of 60,000 examples, and a test set of 10,000 examples. The digits have been size-normalized and centered in a fixed-size image.
It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting (taken from http://yann.lecun.com/exdb/mnist/). Each example is a 28x28 grayscale image and the dataset can be readily downloaded from Tensorflow.

In [0]:
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

Let's check a few samples:

In [0]:
# Show an n x n grid of randomly chosen training images with their labels.
n = 3
fig, ax = plt.subplots(n, n, figsize=(2*n, 2*n))
# flatten the n x n array of Axes into a plain list
ax = [ax_xy for ax_y in ax for ax_xy in ax_y]
# np.random.choice samples with replacement by default, so an image may repeat
for axi, im_idx in zip(ax, np.random.choice(len(train_images), n**2)):
  im = train_images[im_idx]
  im_class = train_labels[im_idx]
  axi.imshow(im, cmap='gray')
  # write the class label in red near the top-left corner of the image
  axi.text(1, 4, f'{im_class}', color='r', size=16)
  axi.grid(False)
plt.tight_layout()
plt.show()

## 5. Fashion MNIST

`Fashion-MNIST` is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. (from https://github.com/zalandoresearch/fashion-mnist)

In [0]:
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

Let's check a few samples:

In [0]:
# Show an n x n grid of randomly chosen training images with their labels.
n = 3
fig, ax = plt.subplots(n, n, figsize=(2*n, 2*n))
# flatten the n x n array of Axes into a plain list
ax = [ax_xy for ax_y in ax for ax_xy in ax_y]
# np.random.choice samples with replacement by default, so an image may repeat
for axi, im_idx in zip(ax, np.random.choice(len(train_images), n**2)):
  im = train_images[im_idx]
  im_class = train_labels[im_idx]
  axi.imshow(im, cmap='gray')
  # write the numeric class label in red near the top-left corner of the image
  axi.text(1, 4, f'{im_class}', color='r', size=16)
  axi.grid(False)
plt.tight_layout()
plt.show()

Each of the training and test examples is assigned to one of the following labels:

| Label | Description |
| --- | --- |
| 0 | T-shirt/top |
| 1 | Trouser |
| 2 | Pullover |
| 3 | Dress |
| 4 | Coat |
| 5 | Sandal |
| 6 | Shirt |
| 7 | Sneaker |
| 8 | Bag |
| 9 | Ankle boot |

In this course we will use several synthetic and real-world datasets to illustrate the behavior of the models and exercise our skills.

## 1. Data visualization and low-dimensional embeddings

**Unsupervised learning techniques** differ from supervised ones in that the data are not labelled (no supervision).

We do not aim at fitting a mapping from $X$ to $Y$, but to understand patterns in the data cloud $X$.

Many datasets have a large number of features $D$ (design matrix of shape $N \times D$). Dimensionality reduction aims at reducing the number of features to $M$ (new design matrix of shape $N \times M$), with $M \ll D$, while losing as little information as possible.

### Applications:

- **EDA**: When $M=2$ or $M=3$ the dataset can also be visualized in a 2D or 3D plot, usable to perform exploratory data analysis, build visualization Apps and supporting feature engineering for complex data structures.

- **DATA UNDERSTANDING**: UMAP visualization may highlight dataset structures and outliers, i.e. data with poor quality or measurement errors.

- **NOISE REDUCTION AND DATA COMPRESSION**: by performing dimensionality reduction, noise may be removed. This may simplify downstream tasks (e.g. clustering) or the fitting of downstream models, which use the reduced features instead of the original ones.

### Methods:

Embedding techniques start from a local description of the environment of each sample point in the original space:

- `t-SNE` uses a `statistical description` of the environment of a sample point ;
- `UMAP` describes the `topology` of the environment through a generalized "triangulation" (simplex decomposition) ;

The projection on the low-dimensional space is optimized in order to match as much as possible the description of the local environment.

It is not the goal of this introduction to discuss the derivation of such approaches, which can be found in the references:

https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf

https://arxiv.org/pdf/1802.03426.pdf



### Utilization in Python and examples

To begin with, we create a t-SNE object that we are going to use.

In [0]:
import inspect

# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`
# (and later dropped the old name). Use whichever keyword the installed
# version accepts so the cell runs on both old and new installations.
_tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# 2-D t-SNE embedding, PCA-initialised, fixed seed for reproducibility.
tsne_model = TSNE(perplexity=30, n_components=2, learning_rate=200, early_exaggeration=4.0, init='pca',
                  random_state=2233212, metric='euclidean', verbose=100,
                  **{_tsne_iter_kwarg: 2000})

# 2-D UMAP embedding using 30 neighbours, fixed seed for reproducibility.
umap_model = umap.UMAP(n_neighbors=30, n_components=2, random_state=1711)

### Example 1: the Heart dataset

In [0]:
data= load_ex2_data_pca(seed=13423, n_add=20)

In [0]:
data.shape

In [0]:
plt.plot(data[:,0], data[:,1],'o')

In [0]:
import numpy as np
from sklearn.manifold import TSNE
from utils.routines import *
import matplotlib.pyplot as plt

data= load_ex2_data_pca(seed=1235, n_add=20)

tsne_model = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3, verbose=1)

tsne_heart = tsne_model.fit_transform(data)

In [0]:
plt.scatter(tsne_heart[:,0],tsne_heart[:,1])
plt.show()

In [0]:
data= load_ex2_data_pca(seed=1235, n_add=20)

tsne_model = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=10, verbose=1)

tsne_heart = tsne_model.fit_transform(data)

In [0]:
plt.scatter(tsne_heart[:,0],tsne_heart[:,1])
plt.show()

In [0]:
umap_model = umap.UMAP(n_neighbors=30, n_components=2, random_state=1711)

umap_hart = umap_model.fit_transform(data)
plt.scatter(umap_hart[:, 0], umap_hart[:, 1])
plt.show()

### Example 2: Mnist dataset

In [0]:
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Work on the first n_examples training images only to keep the embedding fast.
n_examples = 5000
# flatten every image into a single feature vector (one row per image)
data=train_images[:n_examples,:].reshape(n_examples,-1)
# rescale pixel values; presumably they range from 0 to 255, giving [0, 1]
data=data/255

# labels are used only to colour the embedding, not to compute it
labels=train_labels[:n_examples]

In [0]:
# not to run on COLAB

# tsne_model = TSNE(perplexity=10, n_components=2, learning_rate=200,
#                   early_exaggeration=4.0,init='pca',
#                   n_iter=2000, random_state=2233212,
#                   metric='euclidean', verbose=100, n_jobs=1)

# tsne_mnist = tsne_model.fit_transform(data)

# plt.scatter(tsne_mnist[:,0],tsne_mnist[:,1],c=labels,s=10)
# plt.show()

In [0]:
umap_model = umap.UMAP(n_neighbors=10, n_components=2, random_state=1711)
umap_mnist = umap_model.fit_transform(data)
plt.scatter(umap_mnist[:, 0], umap_mnist[:, 1], c=labels, s=10)
plt.show()

### Example 3: Fashion_Mnist dataset

In [0]:
fmnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fmnist.load_data()

# Work on the first n_examples training images only to keep the embedding fast.
n_examples = 5000
# flatten every image into a single feature vector (one row per image)
data=train_images[:n_examples,:].reshape(n_examples,-1)
# rescale pixel values; presumably they range from 0 to 255, giving [0, 1]
data=data/255

# labels are used only to colour the embedding, not to compute it
labels=train_labels[:n_examples]

In [0]:
# not to run on COLAB

# tsne_model = TSNE(perplexity=50, n_components=2, learning_rate=200, early_exaggeration=4.0,init='pca',
#                      n_iter=1000, random_state=2233212, metric='euclidean', verbose=100 )

# tsne_fmnist = tsne_model.fit_transform(data)

# plt.scatter(tsne_fmnist[:,0],tsne_fmnist[:,1],c=labels,s=10)
# plt.show()

In [0]:
umap_model = umap.UMAP(n_neighbors=50, n_components=2, random_state=1711)
umap_fmnist = umap_model.fit_transform(data)
plt.scatter(umap_fmnist[:, 0], umap_fmnist[:, 1], c=labels, s=10)
plt.show()

### Example 4: AMES dataset

In the case of the AMES dataset we must somehow normalize the dataset. Why?

In [0]:
data = house_prices_dataset_normed()
x, y, df = house_prices_dataset(return_df=True, price_max=-1, area_max=-1)

In [0]:
umap_model = umap.UMAP(n_neighbors=30, n_components=2, random_state=1711)
umap_houses = umap_model.fit_transform(data)
plt.scatter(umap_houses[:, 0], umap_houses[:, 1], s=20)
plt.show()

In such a situation we do not have 'labels' to superimpose. Understanding which features characterize each cluster is time-consuming, and there is no general-purpose method.

The following techniques can be used:

1. **Visualization of individual points:** Build an interactive app (e.g. using Dash) that lets you inspect individual points quickly (e.g. if the dataset is made of images or texts)

2. **Colour plots:** Color the points according to a selected variable and check if points get mixed.

Using clustering techniques (you will hear more about them in the next session), one can attach a cluster label to each point. After this step, another range of exploration methodologies can be applied:

3. **Visualization of centroids:** Compute the mean of each feature across all members of one cluster. Compare between clusters.

4. **Analyze distributions:** Make a descriptive analysis of the populations in the different clusters.

5. **Use cluster indexes as labels and apply supervised machine learning methods.** 

As an example we start an analysis using approach 5:

In [0]:
import hdbscan

# Cluster the 2-D UMAP embedding (not the original feature space).
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, gen_min_span_tree=True)
clusterer.fit(umap_houses)

# colour every embedded point by its cluster label
scatter = plt.scatter(umap_houses[:, 0], umap_houses[:, 1], c=clusterer.labels_, cmap='viridis', s=2)
plt.colorbar(scatter)

labels=clusterer.labels_
# label -1 marks points that were not assigned to any cluster; exclude it
classes=set(labels)-set([-1])

In [0]:
# Column names for the feature matrix; same list and order as the one used
# inside house_prices_dataset.
useful_fields = ['LotArea',
                'Utilities', 'OverallQual', 'OverallCond',
                'YearBuilt', 'YearRemodAdd', 'ExterQual', 'ExterCond',
                'HeatingQC', 'CentralAir', 'Electrical',
                '1stFlrSF', '2ndFlrSF','GrLivArea',
                'FullBath', 'HalfBath',
                'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
                'Functional','PoolArea',
                'YrSold', 'MoSold'
                ]
# NOTE: assumes `data` is still the standardized AMES matrix computed above
# (later cells reuse the name `data` for other datasets).
df = pd.DataFrame(data, columns=useful_fields)
df['cluster']=clusterer.labels_

# Select only the points that have been assigned to a cluster (NB: hdbscan may fail to assign a point to a cluster)
df_2=df[df['cluster']!=-1]
df_2=df_2.sort_values(by='cluster')

In [0]:
# Number of distinct clusters (unassigned points were removed from df_2).
n_clusters=len(set(df_2['cluster']))
print(n_clusters)
# NOTE: the loop assumes the cluster labels are the integers 0..n_clusters-1.
for i in range(n_clusters):
    pop=df_2[df_2['cluster']==i].shape[0]
    print(f'Cluster {i} has population: {pop}')

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# One-vs-rest analysis: for every cluster, train a small random forest that
# separates its members from all other clustered points, then plot which
# features the forest relies on.
for cluster in range(n_clusters):
    # features: every column except the cluster label
    X = df_2.drop(columns='cluster')
    # binary target: 1 for members of the current cluster, 0 for the rest
    y = [int(label == cluster) for label in df_2['cluster']]

    # hold out 20% of the points for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf_model = RandomForestClassifier(n_estimators=10, random_state=42, max_depth=5)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    # evaluation metrics -- note that this is an imbalanced dataset, so the
    # confusion matrix is more informative than the accuracy alone
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(conf_matrix)

    # one horizontal bar per feature, in a separate figure per cluster
    plt.figure()
    plt.barh(useful_fields, rf_model.feature_importances_)
    plt.title(f'cluster: {cluster} - {conf_matrix}')

NB: a similar approach can be pursued using the SHAP method.

## 2. Principal Component Analysis (PCA)

PCA can be considered a dimensionality reduction technique with the constraint that the reduction must be a linear projection. As a consequence:

1- only few datasets can be projected to $M=2$ dimensions.

2- the amount of information that can be extracted is higher, and easier to interpret, when all variables are numerical and share the same units.

Let's start with the example that we will use to make the theory more concrete. We will take a dataset from Kaggle https://www.kaggle.com/datasets/miroslavsabo/young-people-survey?resource=download (already downloaded for you in the folder `data`)

The dataset consists of the results of a survey about the music preferences of several students:

In [0]:
data = pd.read_csv("https://raw.githubusercontent.com/Tao-Pi/CAS-Applied-Data-Science/refs/heads/main/DSF5%20-%20copy/data/responses.csv")

In [0]:
data

In [0]:
# Keep the first 19 columns of the survey -- presumably the music-preference
# questions (check the printed names) -- and drop respondents with missing answers.
music_columns=data.columns[:19]
print(music_columns)
music_data=data[music_columns].dropna()

The answers are of course correlated and we expect to have typical patterns recurring, that we define as people liking similar types of songs.

The patterns may also mix: e.g. one class of people may like classic `Pop` and `Reggae`, but not `Latino`. Another class may like `Latino` and `Reggae`, but not `Pop`.

PCA will help to find these typical patterns and their number in a data driven fashion. As we will see these patterns will naturally appear when trying to compress data in a lower dimensional space.

## 2.1 Theory overview.

We will look at PCA from the point of view of `dimensionality reduction`.

**Objective:** PCA is used for dimensionality reduction when we have a large number $D$ of features with non-trivial intercorrelation (data redundancy), and to isolate relevant features. The number of features $D$ defines the original dimension of the dataset. Each sample defines a vector of dimensionality $D$.


    QUESTION: what are the starting vectors in our survey dataset? How many do we have?

PCA provides a new set of $M$ uncorrelated features for every data point, with $M \le D$. The new features are:

- a linear combination of the original ones ;
- uncorrelated between each other ;

If $M \ll D$ we get an effective dimensionality reduction.

    QUESTION: Does the number of data points change after applying PCA?

---

## **PCA as a Linear Dimensionality Reduction**

Each data point, indexed by $p=1..N$, starts from an original $D$-dimensional space:

$$
\mathbf{x}^p = (x^p_{1},\ldots,x^p_{D})
$$

where $D$ is the **number of features**.

We want to find a more economical representation with $M$ components, e.g. $M=2$ (two coordinates):

$$
\mathbf{s}^p = (s^p_{1},\ldots,s^p_{M})
$$

How are the new components related to the previous ones? We have a reconstruction property:

$$
\mathbf{x}^p \approx \mathbf{x}^p_M \equiv \mathbf{m} + \sum_{j=1}^M s^p_{j} \ \mathbf{v_{j}}
$$

where $\mathbf{m}$ represents the mean of each feature for the entire dataset. Note that:

- PCA optimizes $\mathbf{v_{j}},\ j=1..M$, called the **principal components** or **principal vectors**, so that the reconstruction property holds optimally for every fixed $M$, minimizing the loss of information.

---

### **More on the Principal Vectors: Explained Variance**

**Orthonormality:**

$$
\mathbf{v}_1 = (v_{1,1},\ldots,v_{1,D}), ..., \mathbf{v}_M = (v_{M,1},\ldots,v_{M,D})
$$

$$
\mathbf{v}_{i} \cdot \mathbf{v}_{j} = \delta_{i,j}
$$

The variance of the points in the dataset can be approximated by the variance of the reconstructed points. In particular, we have the decomposition of the total variance:

$$
S^2 = \frac{1}{N}\sum_{p=1}^{N}  |\mathbf{x}^p - \mathbf{m}|^2 = \epsilon_1 + \epsilon_2 + \ldots + \epsilon_D
$$

where:
- $\epsilon_1$: variance of the reconstructed points $\mathbf{x}^p_{M=1}$, also called the variance explained by the first principal component
- $\epsilon_1 + \epsilon_2$: variance of the reconstructed points $\mathbf{x}^p_{M=2}$, also called the variance explained by the first two principal components
- $\ldots$
- $S^2 = \epsilon_1 + \epsilon_2 + \ldots + \epsilon_D$

It turns out that **minimizing the reconstruction error is equivalent to maximizing the variance**. The $\epsilon$ values are called the **explained variance**.

The ratios:

$$
\rho_1 = \frac{\epsilon_1}{S^2},\ \rho_2 = \frac{\epsilon_2}{S^2},\ldots
$$

are called the **explained variance ratio**, and their cumulative sums:

$$
r_1 = \frac{\epsilon_1}{S^2},\ r_2 = \frac{\epsilon_1 + \epsilon_2}{S^2},\ldots,\ r_D = 1
$$

are the **cumulative explained variance ratio**.

The plot of the explained variance ratio as a function of the number of components is called a *scree plot* and serves to select an optimal value of $M$ (if it exists). If $M$ is small, we have obtained a new, more compact data representation.

---

### **Terminology to Remember**

- **Principal components:** A sequence of orthonormal vectors $\mathbf{v}_1,\ldots,\mathbf{v}_M$. These vectors represent the typical patterns found in the data, ordered from most to least important.
- **Scores:** For every sample point $p$, the new features (scores) are denoted $s^p_{i}$. They can be computed from the mean-centered original representation and the principal vectors with a scalar product: $s^p_i = (\mathbf{x}^p - \mathbf{m}) \cdot \mathbf{v}_i$
- **Explained variance:** For every $M$, the ratio between the variance of the reconstructed vectors and the total variance. The number of components is chosen by selecting an optimal $M$. The plot of the explained variance as a function of $M$ is called a *scree plot*.

### 2.2 PCA - Example 1: music dataset


https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA

In [0]:
pca=PCA()
pca.fit(music_data)

In [0]:
plt.plot(pca.explained_variance_ratio_,'-o')

In [0]:
# Table of principal vectors: after the transpose each column V_i is one
# principal component and each row is one of the original music columns.
df = pd.DataFrame(pca.components_.transpose(),
                  columns = [f'V_{i+1}' for i in range(len(music_columns))],
                  index=music_columns)
df

In [0]:
# Plot the entries of the first four principal vectors, one figure each,
# with the original music columns on the x axis.
for vector in ['V_1','V_2','V_3','V_4']:
    plt.figure()
    plt.title(vector)
    plt.plot(np.arange(len(music_columns)),list(df[vector]),'-o')
    # assign to `_` so the notebook does not echo the return value of xticks
    _=plt.xticks(np.arange(len(music_columns)),music_columns, rotation=90)
    plt.grid()

### 2.2 PCA - Example 2: Using the scores for visualization

<img src="https://github.com/neworldemancer/DSF5/raw/master/figures/pca-ames.png" width="60%"/>

<img src="https://github.com/neworldemancer/DSF5/raw/master/figures/pca-wine.png" width="60%"/>

<img src="https://github.com/neworldemancer/DSF5/raw/master/figures/pca-breast-cancer.png" width="60%"/>

### Note 2: Interpretation of PCA plot when M=2

When $M=2$ the scatterplot of $s_1$ vs. $s_2$ is an effective two-dimensional visualization of the dataset. Whether such a scatterplot shows actual patterns in the data depends on the dataset.

 See https://www.fbbva.es/wp-content/uploads/2017/05/dat/DE_2010_biplots_in_practice.pdf for a deeper introduction on visualization using PCA.

### 2.3 PCA Example 3 - Denoise

In [0]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from skimage.util import random_noise

# Example adapted from https://scikit-learn.org/stable/auto_examples/applications/plot_digits_denoising.html: 

import matplotlib.pyplot as plt
def plot_digits(X, title):
    """Draw the leading images of ``X`` on a 10 x 10 grid.

    Each row of ``X`` is reshaped to a 16 x 16 image; at most 100 images are
    shown, and ``title`` is used as the figure title.
    """
    figure, axes_grid = plt.subplots(nrows=10, ncols=10, figsize=(8, 8))
    panels = axes_grid.ravel()
    for flat_image, panel in zip(X, panels):
        picture = flat_image.reshape((16, 16))
        panel.imshow(picture, cmap="Greys")
        panel.axis("off")
    figure.suptitle(title, fontsize=24)
# Download OpenML dataset 41082 as NumPy arrays (presumably the digits data
# used in the scikit-learn example linked above; plot_digits treats every
# row as a 16 x 16 image).
X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True)
# corrupted copy of the images: additive Gaussian noise with variance 0.1
X_noise = random_noise(X, mode='gaussian', var=0.1)

In [0]:
plot_digits(X, "Uncorrupted test images")
plot_digits(X_noise, f"Noisy test images")

In [0]:
# Denoising with PCA: learn the 32 leading principal components from the
# noisy images and rebuild every noisy image from those components only.
# The components that are dropped are expected to carry mostly noise.
# (The previous version fitted and reconstructed the clean images `X`, so
# the noisy data was never actually denoised.)
pca = PCA(n_components=32)
pca.fit(X_noise)
X_reconstructed = pca.inverse_transform(pca.transform(X_noise))
plot_digits(X_reconstructed,'reconstructed')

## Final Notes:

### Note 1: Uncorrelation of PCA scores:


PCA is able therefore to make this mapping:

$(x_1,...,x_D) \rightarrow (y_1,..,y_M)$

Here we focused on data compression, but it is also very important that $y_1,...,y_M$ are uncorrelated. Being uncorrelated means (roughly) that in our dataset we can change one variable without affecting the others. The dimensions $1,...,M$ are therefore often more interpretable and more informative.

Always remember that the results of UMAP and PCA depend on the normalization of the variables!