In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Apply the "seaborn-v0_8" matplotlib style sheet once, up front, so that every
# figure created later in the notebook shares the same look.
plt.style.use("seaborn-v0_8")

## Curse of dimensionality 
It refers to the problems associated with multivariate data analysis: for a given sample size, there is a maximum number of features above which the performance of our classifier will degrade rather than improve. In most cases, the extra information that is lost by discarding certain features is compensated by a more accurate mapping into the smaller dimensional space
- Solutions: 
    - Add a priori knowledge to weigh more some variables instead of others; 
    - Reduce dimensionality with feature-extraction algorithms, either unsupervised (e.g. PCA) or supervised (e.g. LDA)

---


In [70]:
# Load the iris table: every column except the last is a feature, the last one
# is the class label.
# NOTE(review): read_csv treats the first line as a header by default — confirm
# the file really has one, otherwise the first sample is consumed as column names.
data = pd.read_csv("datasets/iris.data.txt")
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

### Principal Component Analysis (PCA) (|||)
- Unsupervised method: it does not consider the class labels of the samples ("class separability")
- A new dataset is built that keeps only the attributes which capture most of the data variation: 1) find the eigenvectors of the covariance matrix; 2) the eigenvectors define the new space.
- Excluding the "non-significant" principal components may filter out the noise that is present in the data.
- The maximum number of PCs that can be calculated is min(n_samples, n_features)
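- A common rule of thumb is to keep the smallest number of PCs that preserves at least 95% of the total variance (the code below lets `n_components="mle"` pick the number and then checks that the retained variance is at least 95%)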

In [71]:
# Fit PCA on the training split only, so the projection is learned without
# looking at the test rows. n_components controls how many PCs are kept;
# "mle" asks scikit-learn to estimate that number from the data.
pca = PCA(n_components="mle", random_state=42).fit(X_train)

# Project both splits onto the retained principal components.
X_train_t, X_test_t = pca.transform(X_train), pca.transform(X_test)

# Fraction of the total variance explained by each retained component.
# The sum should be at least 95%, otherwise too much information is lost.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print("Sum:", variance_ratios.sum())

[0.91939858 0.05522612 0.02020546]
Sum: 0.9948301607584098


In [72]:
# Compare the same logistic-regression setup on three views of the split:
# the original features, all retained PCs, and the first PC alone.
# X_train_t / X_test_t are expected to hold the PCA projections computed above.

# Column 0 is the first PCA component (the "most valuable" one).
col = 0
# reshape(-1, 1) keeps the single column 2-D, as scikit-learn estimators require.
X_train_t_head = X_train_t[:, col].reshape(-1, 1)
X_test_t_head = X_test_t[:, col].reshape(-1, 1)

# (label printed in front of the score, training features, test features)
experiments = [
    ("Classification Accuracy: original:", X_train, X_test),
    ("Classification Accuracy: transformed by PCA:", X_train_t, X_test_t),
    (
        "Classification Accuracy: transformed by PCA, selecting only most valuable variable:",
        X_train_t_head,
        X_test_t_head,
    ),
]

# NOTE(review): newer scikit-learn releases warn that the liblinear solver is
# deprecated for multiclass targets — confirm against the installed version.
for label, train_features, test_features in experiments:
    # A fresh estimator per view so no fitted state carries over between runs.
    clf = LogisticRegression(solver="liblinear", random_state=42, max_iter=100)
    clf.fit(train_features, y_train)
    print(label, clf.score(test_features, y_test))

Classification Accuracy: original: 0.9777777777777777
Classification Accuracy: transformed by PCA: 0.9111111111111111
Classification Accuracy: transformed by PCA, selecting only most valuable variable: 0.8444444444444444


---
### Linear Discriminant Analysis
- Supervised method: considers class separability
- It works by projecting the input data onto a linear subspace consisting of the directions which maximize the separation between classes
- The maximum number of discriminant components that can be calculated is min(n_classes - 1, n_features)

In [73]:
# Fit LDA on the training split; unlike PCA, the fit also receives the class labels.
lda = LinearDiscriminantAnalysis(solver="eigen").fit(X_train, y_train)

# Project both splits onto the discriminant directions.
# NOTE: this rebinds X_train_t / X_test_t, replacing the PCA projections that
# were stored under the same names.
X_train_t, X_test_t = lda.transform(X_train), lda.transform(X_test)

In [74]:
# Compare the same logistic-regression setup on three views of the split:
# the original features, all LDA components, and the first LDA component alone.
# X_train_t / X_test_t must hold the LDA projections here, so the LDA transform
# cell has to be the most recently executed transform cell.

# Column 0 is the first LDA component (the "most valuable" one).
col = 0
# reshape(-1, 1) keeps the single column 2-D, as scikit-learn estimators require.
X_train_t_head = X_train_t[:, col].reshape(-1, 1)
X_test_t_head = X_test_t[:, col].reshape(-1, 1)

# (label printed in front of the score, training features, test features)
experiments = [
    ("Classification Accuracy: original:", X_train, X_test),
    ("Classification Accuracy: transformed by LDA:", X_train_t, X_test_t),
    (
        "Classification Accuracy: transformed by LDA, selecting only most valuable variable:",
        X_train_t_head,
        X_test_t_head,
    ),
]

# NOTE(review): newer scikit-learn releases warn that the liblinear solver is
# deprecated for multiclass targets — confirm against the installed version.
for label, train_features, test_features in experiments:
    # A fresh estimator per view so no fitted state carries over between runs.
    clf = LogisticRegression(solver="liblinear", random_state=42, max_iter=200)
    clf.fit(train_features, y_train)
    print(label, clf.score(test_features, y_test))

Classification Accuracy: original: 0.9777777777777777
Classification Accuracy: transformed by LDA: 0.9777777777777777
Classification Accuracy: transformed by LDA, selecting only most valuable variable: 0.9777777777777777


---
### t-distributed Stochastic Neighbor Embedding (t-SNE)
- It's a technique for dimensionality reduction that is particularly well-suited for the visualization of high-dimensional datasets
- More info:
    - https://lvdmaaten.github.io/tsne/
    - https://distill.pub/2016/misread-tsne