## Обучение без учителя (Unsupervised)

* https://setosa.io/ev/principal-component-analysis/
* https://pair-code.github.io/understanding-umap/

In [2]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import plotly.express as px
import warnings
from sklearn import metrics
from scipy.cluster.hierarchy import dendrogram, linkage
warnings.filterwarnings("ignore")

In [3]:
iris = datasets.load_iris()
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
print(iris.data[:3,:])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


In [5]:
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
              color='species')
fig.show()

NameError: name 'df' is not defined

## Decomposition

### PCA
- [Пошаговое описание на английском](https://www.youtube.com/watch?v=FgakZw6K1QQ)
- [Статья на Хабре на математичном](https://habr.com/ru/post/304214/)

![](https://github.com/Yorko/mlcourse.ai/raw/05aae6e7e582c2f17eab7f3355f97d14c2dc9f19/img/pca_good_bad_direction.png)

![](https://neerc.ifmo.ru/wiki/images/5/5a/800px-Pca_3d_to_2d_example_v2.png)

In [None]:
X, y = iris.data, iris.target
pca = PCA (n_components=2)
X_reduced = pca.fit_transform(X)


In [None]:
y

In [None]:
X[0]

In [None]:
X_reduced[:5]

In [None]:
pca.components_

In [None]:
for component in pca.components_:
    print(' + '.join('%.3f x %s' % (value,name) for value,name in zip(component, iris.feature_names)))


plt.figure(figsize = (10,7))
plt.scatter(X_reduced[:,0],X_reduced[:,1], c=y)
plt.show()

In [None]:
def plot_decomposition_3D(model, X, target):
    X_reduced = model.fit_transform(X)
    x,y,z = [X_reduced[:,i] for i in range(3)]
    fig = px.scatter_3d(x=x, y=y, z=z,
              color=iris.target, width=800, height=800)
    fig.show()

In [None]:
pca = PCA(n_components=3)
plot_decomposition_3D(pca, X, iris.target)

### t-SNE

In [None]:
tsne = TSNE(n_components=3, random_state=10)
plot_decomposition_3D(tsne, X, iris.target)


### UMAP

In [None]:
umap = UMAP(n_components=3, random_state=10)
plot_decomposition_3D(umap, X, iris.target)

## Clustering

![](https://www.machinelearningmastery.ru/img/0-898077-26802.png)

### KMeans
- [Пошаговое описание на английском](https://www.youtube.com/watch?v=4b5d3muPQmA)
- https://www.naftaliharris.com/blog/visualizing-k-means-clustering/

![](https://www.projectrhea.org/rhea/images/e/ef/RunyanKmeans.gif)

In [None]:
def plot_cluster_3D(model, X, true):
    pred = model.fit_predict(X)
    X_reduced = PCA(n_components=3).fit_transform(X)
    x,y,z = [X_reduced[:,i] for i in range(3)]
    score = metrics.rand_score(true, pred)
    title = 'Score for %s: %.2f' % (model.__class__.__name__, score)
    fig = px.scatter_3d(x=x, y=y, z=z,
              color=pred, title=title, width=800, height=800)
    fig.show()

km = KMeans (n_clusters=3)
plot_cluster_3D(km, X, iris.target)

In [None]:
km = KMeans (n_clusters=3)
plot_cluster_3D(km, X, iris.target)

### Agglomerative Clustering

In [None]:
ac = AgglomerativeClustering(n_clusters=3)
plot_cluster_3D(ac, X, iris.target)

### DBSCAN

In [None]:
DBSCAN?

- https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/

In [None]:
dbscan = DBSCAN(eps=0.4, min_samples=3)
plot_cluster_3D(dbscan, X, iris.target)

## Metrics

### Rand Index

https://en.wikipedia.org/wiki/Rand_index

$$ RI = \frac{a+b}{a+b+c+d} = \frac{a+b}{{n \choose 2 }} $$

$$ {\displaystyle {n \choose 2}} = {\displaystyle n(n-1)/2} $$

$$ \text{RI} = \frac{2(a + b)}{n(n-1)} $$

In [None]:
#     А,Б,В,Г,Д,Е
true=[1,1,1,2,2,3]
pred=[1,1,2,2,3,3]
# a: AБ
# b: АГ, АД, АЕ, БГ, БД, БЕ, ВД, ВЕ, ГЕ

def pairs_by_n(n):
    return n * (n - 1) / 2

def calc_rand(a,b,n):
    return (a+b)/pairs_by_n(n)

a=1
b=9
n=len(true)
calc_rand(a,b,n)

In [None]:
metrics.rand_score(true, pred)

In [None]:
true=[1,1,1,2,2,2,3,3,3,1]
pred=[0,0,1,1,2,2,3,3,4,4]
# a=
# b=
# n=
# calc_rand(a,b,n)

In [None]:
# metrics.rand_score(true, pred)

### Adjusted Rand index

$$ ARI={\frac {\left.\sum _{ij}{\binom {n_{ij}}{2}}-\left[\sum _{i}{\binom {a_{i}}{2}}\sum _{j}{\binom {b_{j}}{2}}\right]\right/{\binom {n}{2}}}{\left.{\frac {1}{2}}\left[\sum _{i}{\binom {a_{i}}{2}}+\sum _{j}{\binom {b_{j}}{2}}\right]-\left[\sum _{i}{\binom {a_{i}}{2}}\sum _{j}{\binom {b_{j}}{2}}\right]\right/{\binom {n}{2}}}} $$

In [None]:
true=[1,1,1,2,2,2,3,3,3]
pred=[1,1,2,2,3,3,1,1,2]

In [None]:
metrics.confusion_matrix(true,pred)

$$ \sum _{ij}{\binom {n_{ij}}{2}} = {\binom {2}{2}} + {\binom {1}{2}} +{\binom {0}{2}} +{\binom {0}{2}} +{\binom {1}{2}} +{\binom {2}{2}} +{\binom {2}{2}} +{\binom {1}{2}} +{\binom {0}{2}} $$

In [None]:
sum_nij = pairs_by_n(2) + pairs_by_n(1) + pairs_by_n(0) + pairs_by_n(0) + pairs_by_n(1) + pairs_by_n(2) + pairs_by_n(2) + pairs_by_n(1) + pairs_by_n(0)
sum_nij

$$ \sum _{i}{\binom {a_{i}}{2}} ={\binom {3}{2}} +{\binom {3}{2}} +{\binom {3}{2}}  $$

In [None]:
sum_ai = pairs_by_n(3) + pairs_by_n(3) + pairs_by_n(3)
sum_ai

$$ \sum _{j}{\binom {b_{j}}{2}} ={\binom {4}{2}} +{\binom {3}{2}} +{\binom {2}{2}}  $$

In [None]:
sum_bj = pairs_by_n(4) + pairs_by_n(3) + pairs_by_n(2)
sum_bj

$$ ARI={\frac {\left.\sum _{ij}{\binom {n_{ij}}{2}}-\left[\sum _{i}{\binom {a_{i}}{2}}\sum _{j}{\binom {b_{j}}{2}}\right]\right/{\binom {n}{2}}}{\left.{\frac {1}{2}}\left[\sum _{i}{\binom {a_{i}}{2}}+\sum _{j}{\binom {b_{j}}{2}}\right]-\left[\sum _{i}{\binom {a_{i}}{2}}\sum _{j}{\binom {b_{j}}{2}}\right]\right/{\binom {n}{2}}}} = \frac {3 - [9 \times 10] / {\binom {n}{2}}}{\frac {1}{2}[9 + 10]- [9 \times 10] / {\binom {n}{2}}} $$

In [None]:
ARI = (sum_nij - (sum_ai * sum_bj) / pairs_by_n(len(pred))) / ((1/2)*(sum_ai+sum_bj) - (sum_ai * sum_bj)/pairs_by_n(len(pred)))
ARI

In [None]:
metrics.adjusted_rand_score(true,pred)

### Adjusted Mutual Information
https://en.wikipedia.org/wiki/Adjusted_mutual_information

$$ AMI(U,V)={\frac  {MI(U,V)-E\{MI(U,V)\}}{\max {\{H(U),H(V)\}}-E\{MI(U,V)\}}} $$

In [None]:
metrics.adjusted_mutual_info_score(true,pred)

### Homogeneity, completeness, V-measure

$$ h = 1 - \frac{H(C\mid K)}{H(C)}, c = 1 - \frac{H(K\mid C)}{H(K)}, $$

In [None]:
metrics.homogeneity_score(true,pred)

In [None]:
metrics.completeness_score(true,pred)

$$ v = 2\frac{hc}{h+c} $$ 

In [None]:
metrics.v_measure_score(true,pred)

### Силуэт

$$ s = \frac{b - a}{\max(a, b)} $$

In [None]:
pca=PCA(n_components=3)
X_reduced = pca.fit_transform(X)
model = KMeans(n_clusters=3)
model.fit(X_reduced)
metrics.silhouette_score(X_reduced, labels=model.labels_)

In [None]:
pca=PCA(n_components=3)
X_reduced = pca.fit_transform(X)
model = KMeans(n_clusters=3)
model.fit(X_reduced)
metrics.silhouette_score(X_reduced, labels=model.labels_)

## MNIST Example

In [None]:
digits = datasets.load_digits()
X, y = digits.data, digits.target

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

### EDA

In [None]:
fig, axes = plt.subplots (2, 5, figsize = (10,7))
for i, ax in enumerate(axes.flat):
    im = X[i].reshape(1,-1)
    ax.imshow(im.reshape((8,8)), cmap='binary')
    ax.text(0.95,0.95, y[i])
    ax.set_xticks([])
    ax.set_yticks([])


### Decomposition

In [None]:
def plot_digits(X,y):
    plt.figure(figsize = (10,7))
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.get_cmap('nipy_spectral', 10))
    plt.colorbar()
    plt.show()

decompositions = [
    PCA(n_components=2),
    TSNE(n_components=2),
    UMAP(n_components=2)
]

for decomposition in decompositions:
    X_reduced = decomposition.fit_transform(X)
    print(decomposition.__class__.__name__)
    plot_digits(X_reduced, y)

In [None]:
X_reduced = TSNE(n_components=2).fit_transform(X)

In [None]:
number = 7
plt.figure(figsize=(4,2))
plt.imshow(X[number].reshape(8,8))
plt.xticks([])
plt.yticks([])
plt.show

fig, axes = plt.subplots (8, 8, figsize = (10,7))
fig.subplots_adjust(hspace=0.1,wspace=0.1)
for i, ax in enumerate(axes.flat):
    pca = PCA(i+1).fit(X)
    im = pca.inverse_transform(pca.transform(X[number].reshape(1,-1)))
    ax.imshow(im.reshape(8,8), cmap='binary')
    ax.text(0.95,0.95, i+1)
    ax.set_xticks([])
    ax.set_yticks([])

In [None]:
pca = PCA().fit(X)

plt.figure(figsize=(10,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.show()

In [None]:
pca=PCA(0.9)
X_pca = pca.fit_transform(X)
pca.n_components_

In [None]:
sum(pca.explained_variance_ratio_)

### Clustering

#### K-means

In [None]:
inertia = []
silhouette = []
rng = range(2,20)
for k in rng:
    kmeans = KMeans(n_clusters=k, random_state=17)
    kmeans.fit(X)
    inertia.append(np.sqrt(kmeans.inertia_))
    score = metrics.silhouette_score(X, kmeans.labels_)
    silhouette.append(score)

In [None]:
plt.plot(rng, inertia, marker='s')
plt.title('Elbow')

In [None]:
plt.plot(rng, silhouette, marker='s')
plt.title('Silhouette')

In [None]:
pred = KMeans(n_clusters=10).fit_predict(X_reduced)
print(metrics.adjusted_rand_score(y,pred))
plot_digits(X_reduced, pred)

#### Agglomerative Clustering

In [None]:
pred = AgglomerativeClustering(n_clusters=10).fit_predict(X_reduced)
print(metrics.adjusted_rand_score(y,pred))
plot_digits(X_reduced, pred)

In [None]:
plt.figure(figsize=(20,10))
linkage_ = linkage(X, method='ward')
dendrogram_ = dendrogram(linkage_)

#### DBSCAN

In [None]:
silhouette = []
rng = range(2,20)

X_scaled=scaler.fit_transform(X)
best_epsilon=0
for eps in rng:
    dbscan = DBSCAN(eps=eps)
    dbscan.fit(X_reduced)
    labels=dbscan.labels_
    uniq_labels = np.unique(labels)
    n_clusters = len(uniq_labels[uniq_labels != -1])
    if n_clusters > 1:
        score = metrics.silhouette_score(X_reduced, labels)
    else:
        score = 0
    silhouette.append(score)
    if score>=np.max(silhouette):
        best_epsilon=eps

print('Best epsilon = %.2f, silhouette = %.2f' % (best_epsilon, np.max(silhouette)))
plt.plot(rng, silhouette, marker='s')
plt.title('Silhouette')
plt.show()

In [None]:
dbscan = DBSCAN(eps=best_epsilon)
pred = dbscan.fit_predict(X_reduced)
print(metrics.adjusted_rand_score(y,pred))
plot_digits(X_reduced, pred)

🛠 Сократить число признаков данных клиентов банка до 3 признаков (предварительно нормализовав данные) с помощью PCA или t-SNE, сделать кластеризацию без учёта целевого признака Geography на 3 кластера. Визуально отобразить данные.

In [None]:
import pandas as pd
df = pd.read_csv('../data/bank.csv')
X,y= df.drop(columns = ['Geography']), df['Geography']
df.head()
# Ваш код здесь

🛠 Найти аномальных клиентов среди всех данных с помощью DBSCAN.

In [None]:
# Ваш код здесь