## 学习向量量化（Learning Vector Quantization，LVQ）

好奇怪？  
这种技术很难在目标变量和预测变量之间做出合适的推断。  
和其他方法不同，很难搞清楚响应变量y和预测变量x之间存在什么样的联系。  

在实际应用中，它通常被当作一种黑箱方法来使用。

In [2]:
# 使用iris数据集
from sklearn.datasets import load_iris
import numpy as np
from sklearn.metrics import euclidean_distances

In [3]:
# Load the iris data set and split it into the feature matrix and the labels.
data = load_iris()
x, y = data['data'], data['target']

In [4]:
# Rescale the features with scikit-learn's built-in min-max scaler.
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
# Learn the per-feature range first, then apply it to the same data.
minmax.fit(x)
x = minmax.transform(x)

先声明LVQ的参数：  
R的值设为2，即每个类别标签有两个原型向量；  
iris数据集共有三个类别（n_classes = 3）；  
定义学习率epsilon以及每次迭代的缩减量epsilon_dec_factor。

In [5]:
n_classes = 3               # number of class labels in the data set
R = 2                       # prototype vectors created per class
epsilon = 0.9               # initial learning rate
epsilon_dec_factor = 0.01   # amount subtracted from epsilon on every training step

In [6]:
# Container class for a single prototype vector.
class prototype(object):
    """A labelled prototype vector together with its own learning rate.

    Attributes are plain and public:
      class_id -- label this prototype stands for
      p_vector -- current position of the prototype
      epsilon  -- step size used by ``update``
    """

    def __init__(self, class_id, p_vector, epsilon):
        self.epsilon = epsilon
        self.p_vector = p_vector
        self.class_id = class_id

    def update(self, u_vector, increment=True):
        """Shift the prototype relative to ``u_vector``.

        The shift is ``epsilon * (u_vector - p_vector)``. It is added when
        ``increment`` is true (prototype moves towards the input) and
        subtracted otherwise (prototype moves away from the input).
        """
        step = self.epsilon * (u_vector - self.p_vector)
        if not increment:
            # Reverse the direction: push the prototype away from the input.
            step = -step
        self.p_vector = self.p_vector + step

In [7]:
# Find the prototype vector that lies closest to a given vector.
def find_closest(in_vector, proto_vectors):
    """Return the prototype nearest to ``in_vector`` (Euclidean distance).

    Parameters:
      in_vector     -- 1-D sequence/array of feature values
      proto_vectors -- iterable of objects exposing a ``p_vector`` attribute
                       of the same length as ``in_vector``

    Returns the closest prototype object, or None when ``proto_vectors`` is
    empty. On ties the prototype that comes first in ``proto_vectors`` wins.
    """
    # The distance is computed with numpy directly: the pairwise helper from
    # scikit-learn expects 2-D inputs and rejects plain 1-D vectors.
    query = np.asarray(in_vector, dtype=float).ravel()
    closest = None
    # Start from infinity so that even a very distant prototype is accepted.
    closest_distance = float('inf')
    for p_v in proto_vectors:
        candidate = np.asarray(p_v.p_vector, dtype=float).ravel()
        distance = np.linalg.norm(query - candidate)
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest

In [8]:
# Shortcut: class id of the prototype nearest to a vector.
def find_class_id(test_vector, p_vectors):
    """Return the ``class_id`` of the prototype in ``p_vectors`` closest to ``test_vector``."""
    nearest = find_closest(test_vector, p_vectors)
    return nearest.class_id

In [9]:
# Choose the initial prototype vectors:
# R randomly picked training samples for each of the n_classes classes.
p_vectors = []
for i in range(n_classes):
    # Positions of the samples that carry class label i
    y_subset = np.where(y == i)
    # Feature rows of that class
    x_subset = x[y_subset]
    # Draw R row indices from this class. Sampling is done without
    # replacement so that two prototypes of one class never start on the
    # very same sample; replacement is only allowed when the class has
    # fewer than R samples.
    samples = np.random.choice(len(x_subset), R, replace=len(x_subset) < R)
    # Turn every selected row into a prototype of class i
    for sample in samples:
        s = x_subset[sample]
        p = prototype(i, s, epsilon)
        p_vectors.append(p)

# Single-argument print calls produce the same output on Python 2 and 3.
print('class id \t Inital protype vector\n')
for p_v in p_vectors:
    print('{0} \t{1}'.format(p_v.class_id, p_v.p_vector))
print('')

class id 	 Inital protype vector

0 	[ 0.11111111  0.5         0.10169492  0.04166667]
0 	[ 0.41666667  0.83333333  0.03389831  0.04166667]
1 	[ 0.33333333  0.16666667  0.45762712  0.375     ]
1 	[ 0.33333333  0.16666667  0.47457627  0.41666667]
2 	[ 0.66666667  0.20833333  0.81355932  0.70833333]
2 	[ 0.61111111  0.41666667  0.81355932  0.875     ]



In [10]:
# Training loop: repeatedly pick a training point and adjust the prototype
# closest to it, until the learning rate epsilon has decayed below 0.01.
while epsilon >= 0.01:
    # Sample one training instance uniformly from ALL rows
    # (randint's upper bound is exclusive, so len(x) includes the last row).
    rnd_i = np.random.randint(0, len(x))
    rnd_s = x[rnd_i]
    target_y = y[rnd_i]

    # Prototype currently nearest to the sampled point
    closest_pvector = find_closest(rnd_s, p_vectors)

    # Use the current learning rate for this step; otherwise a prototype
    # would still move with the rate from the last time it was selected
    # (the initial rate if it was never selected before).
    closest_pvector.epsilon = epsilon

    # Same label: pull the prototype towards the point; otherwise push it away.
    if target_y == closest_pvector.class_id:
        closest_pvector.update(rnd_s)
    else:
        closest_pvector.update(rnd_s, False)

    # Decay the learning rate for the next iteration
    epsilon = epsilon - epsilon_dec_factor

# Single-argument print calls produce the same output on Python 2 and 3.
print('class id \t final prototype vector \n')
for p_vector in p_vectors:
    print('{0} \t{1}'.format(p_vector.class_id, p_vector.p_vector))

class id 	 final prototype vector 

0 	[ 0.17509956  0.54968947  0.08380805  0.08344845]
0 	[ 0.30496129  0.77555476  0.07780385  0.07776592]
1 	[ 0.30354664  0.26253709  0.49247495  0.4534678 ]
1 	[ 0.56870532  0.38844757  0.60394626  0.52306284]
2 	[ 0.54679649  0.33666678  0.73396125  0.77258314]
2 	[ 0.74788063  0.53536012  0.81498245  0.90226068]




In [14]:
# Classify every row of x with the trained prototypes.
# NOTE: x is the same data the prototypes were fitted on, so the resulting
# predictions describe training-set performance.
predicted_y = []
for instance in x:
    # Echo the sample being classified (call form works on Python 2 and 3)
    print(instance)
    predicted_y.append(find_class_id(instance, p_vectors))

[ 0.22222222  0.625       0.06779661  0.04166667]
[ 0.16666667  0.41666667  0.06779661  0.04166667]
[ 0.11111111  0.5         0.05084746  0.04166667]
[ 0.08333333  0.45833333  0.08474576  0.04166667]
[ 0.19444444  0.66666667  0.06779661  0.04166667]
[ 0.30555556  0.79166667  0.11864407  0.125     ]
[ 0.08333333  0.58333333  0.06779661  0.08333333]
[ 0.19444444  0.58333333  0.08474576  0.04166667]
[ 0.02777778  0.375       0.06779661  0.04166667]
[ 0.16666667  0.45833333  0.08474576  0.        ]
[ 0.30555556  0.70833333  0.08474576  0.04166667]
[ 0.13888889  0.58333333  0.10169492  0.04166667]
[ 0.13888889  0.41666667  0.06779661  0.        ]
[ 0.          0.41666667  0.01694915  0.        ]
[ 0.41666667  0.83333333  0.03389831  0.04166667]
[ 0.38888889  1.          0.08474576  0.125     ]
[ 0.30555556  0.79166667  0.05084746  0.125     ]
[ 0.22222222  0.625       0.06779661  0.08333333]
[ 0.38888889  0.75        0.11864407  0.08333333]
[ 0.22222222  0.75        0.08474576  0.08333333]






[ 0.5         0.33333333  0.50847458  0.5       ]
[ 0.55555556  0.20833333  0.66101695  0.58333333]
[ 0.5         0.33333333  0.62711864  0.45833333]
[ 0.58333333  0.375       0.55932203  0.5       ]
[ 0.63888889  0.41666667  0.57627119  0.54166667]
[ 0.69444444  0.33333333  0.6440678   0.54166667]
[ 0.66666667  0.41666667  0.6779661   0.66666667]
[ 0.47222222  0.375       0.59322034  0.58333333]
[ 0.38888889  0.25        0.42372881  0.375     ]
[ 0.33333333  0.16666667  0.47457627  0.41666667]
[ 0.33333333  0.16666667  0.45762712  0.375     ]
[ 0.41666667  0.29166667  0.49152542  0.45833333]
[ 0.47222222  0.29166667  0.69491525  0.625     ]
[ 0.30555556  0.41666667  0.59322034  0.58333333]
[ 0.47222222  0.58333333  0.59322034  0.625     ]
[ 0.66666667  0.45833333  0.62711864  0.58333333]
[ 0.55555556  0.125       0.57627119  0.5       ]
[ 0.36111111  0.41666667  0.52542373  0.5       ]
[ 0.33333333  0.20833333  0.50847458  0.5       ]
[ 0.33333333  0.25        0.57627119  0.45833333]




In [15]:
# Show the predicted class ids (call form works on Python 2 and 3)
print(predicted_y)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [17]:
# Sanity check: predictions and true labels must have the same shape
# (call form works on Python 2 and 3)
print(np.shape(predicted_y))
print(np.shape(y))

(150,)
(150,)


In [18]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the true labels
# (call form works on Python 2 and 3)
print(classification_report(y, predicted_y, target_names=['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica']))

                  precision    recall  f1-score   support

     Iris-Setosa       1.00      1.00      1.00        50
Iris-Versicolour       0.94      0.94      0.94        50
  Iris-Virginica       0.94      0.94      0.94        50

     avg / total       0.96      0.96      0.96       150

