## PCA(主成分分析)
可将多变量数据以主成分简洁的表现出来

In [4]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

data = load_iris()
n_components = 2
model = PCA(n_components=n_components)
model.fit(data.data)
print(data.data[:5])  # 变换前的前5条数据
print(model.transform(data.data)[:5]) # 变换后的前5条数据

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[[-2.68412563  0.31939725]
 [-2.71414169 -0.17700123]
 [-2.88899057 -0.14494943]
 [-2.74534286 -0.31829898]
 [-2.72871654  0.32675451]]


## NMF(非负矩阵分解)
降维，特点是输入数据和输出数据的值都是非负的

In [9]:
from sklearn.decomposition import NMF
from sklearn.datasets._samples_generator import make_blobs

centers = [[5,10,5],[10,4,10],[6,8,8]]
V, _ = make_blobs(centers=centers)  # 以centers为中心生成数据
n_components = 2  # 降低后的维数
model = NMF(n_components=n_components, max_iter=300)
model.fit(V)
W = model.transform(V)  # 分解后的矩阵
H = model.components_
print(V.shape)
print(W.shape)
print(H)

(100, 3)
(100, 2)
[[7.3987459  1.77558008 7.45835991]
 [0.         8.90966188 0.81262671]]




## k-means
聚类

In [12]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

data = load_iris()
cluster = 3
model = KMeans(n_clusters=cluster)
model.fit(data.data)
print(model.labels_)  # 各数据点所属的簇
print(model.cluster_centers_) # 通过fit()计算得到的簇重心

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
[[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]]


## Gaussian Mixture
多个高斯分布的线性组合实现聚类

In [17]:
from sklearn.datasets import load_iris
from sklearn.mixture import GaussianMixture

data = load_iris()
n_components = 3  # 高斯分布的数量
model = GaussianMixture(n_components=n_components)
model.fit(data.data[:4])
print(model.predict(data.data[:4])) # 预测类别
print()
print(model.means_) # 各高斯分布的均值
print()
print(model.covariances_) # 各高斯分布的方差

[1 0 2 2]

[[4.9  3.   1.4  0.2 ]
 [5.1  3.5  1.4  0.2 ]
 [4.65 3.15 1.4  0.2 ]]

[[[ 1.00000000e-06  7.09974815e-29  3.31321580e-29  4.73316543e-30]
  [ 7.09974815e-29  1.00000000e-06  2.07075988e-29  2.95822839e-30]
  [ 3.31321580e-29  2.07075988e-29  1.00000000e-06  1.38050658e-30]
  [ 4.73316543e-30  2.95822839e-30  1.38050658e-30  1.00000000e-06]]

 [[ 1.00000000e-06  8.71691300e-29  3.58931712e-29  5.12759588e-30]
  [ 8.71691300e-29  1.00000000e-06  2.34686119e-29  3.35265885e-30]
  [ 3.58931712e-29  2.34686119e-29  1.00000000e-06  1.38050658e-30]
  [ 5.12759588e-30  3.35265885e-30  1.38050658e-30  1.00000000e-06]]

 [[ 2.50100000e-03  2.50000000e-03 -5.00000000e-03  1.08468374e-30]
  [ 2.50000000e-03  2.50100000e-03 -5.00000000e-03  7.39557099e-31]
  [-5.00000000e-03 -5.00000000e-03  1.00010000e-02  3.69778549e-31]
  [ 1.08468374e-30  7.39557099e-31  3.69778549e-31  1.00000000e-06]]]


## LLE(局部线性嵌入)
可以将弯曲或扭曲的状态埋藏在高维空间中的结构简单地表示在低维空间中

In [26]:
from sklearn.datasets import _samples_generator
from sklearn.manifold import LocallyLinearEmbedding

data, color = _samples_generator.make_swiss_roll(n_samples=1500)
n_neighbor = 12
n_components = 2
model = LocallyLinearEmbedding(n_neighbors=n_neighbor, n_components=n_components)
model.fit(data)
print(data.shape)
print(data[:5])
print(model.transform(data).shape)
print(model.transform(data)[:5])

(1500, 3)
[[-4.14015679 10.7778069   7.27585219]
 [ 1.07434604  7.30690446  7.63908285]
 [ 5.95243107 15.44653911 -1.23338107]
 [ 9.98893687  5.53289754 -6.6162025 ]
 [-8.35243855  2.87727039  3.4420624 ]]
(1500, 2)
[[ 0.01252748  0.00219997]
 [ 0.01796864  0.01389929]
 [ 0.02891092 -0.03741342]
 [-0.02401416  0.00726797]
 [ 0.00678705  0.02608275]]


## t-SNE(t分布随机邻域嵌入)
将高维的复杂数据降维为二维或三维的算法，用于低维空间的可视化

In [42]:
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE

data = load_digits()
n_components = 2
model = TSNE(n_components=n_components)
print(data.data.shape)
print(model.fit_transform(data.data))

(1797, 64)
[[-60.04456     8.190712 ]
 [ -1.3752689 -22.912495 ]
 [ 31.29138   -14.407837 ]
 ...
 [ 17.838503   -8.484607 ]
 [  8.271461   21.663279 ]
 [ 20.568012   -4.3612924]]
