这个文档实现了半监督学习(sklearn.semi_supervised):
1. .LabelPropagation(考察rbf,knn)
2. .LabelSpreading(考察rbf,knn)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import datasets 
from sklearn.semi_supervised import LabelPropagation

给出加载数据集的函数load_data():

In [6]:
def load_data():
    """Load the digits dataset, shuffle it, and designate unlabeled samples.

    Returns:
        A tuple ``(X, y, unlabeled_indices)``: the shuffled samples, their
        true labels, and the indices of the samples treated as unlabeled.
    """
    digits = datasets.load_digits()

    # Shuffle sample indices with a fixed seed so runs are reproducible.
    order = np.arange(len(digits.data))
    np.random.RandomState(0).shuffle(order)
    X, y = digits.data[order], digits.target[order]

    # Only the first 10% of the shuffled samples keep their labels;
    # every index after that is reported as unlabeled.
    n_labeled = int(len(y) / 10)
    unlabeled_indices = np.arange(n_labeled, len(y))

    return X, y, unlabeled_indices

给出使用LabelPropagation的函数：

In [11]:
def test_LabelPropagation(*data):
    """Fit an rbf-kernel LabelPropagation model and print its accuracy.

    Args:
        *data: ``X, y, unlabeled_indices`` -- the samples, their true labels,
            and the indices of the samples to treat as unlabeled.
    """
    X, y, unlabeled_indices = data

    # Train on a copy so the true labels in ``y`` are left untouched;
    # -1 is the marker used for unlabeled samples.
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(X, masked_labels)

    # Accuracy is measured only on the samples whose labels were hidden.
    accuracy = model.score(X[unlabeled_indices], y[unlabeled_indices])
    print("Accuracy:%f" % accuracy)

调用该函数：

In [12]:
# Load the shuffled dataset and run the basic LabelPropagation experiment on it.
data = load_data()
test_LabelPropagation(*data)

Accuracy:0.948084


接下来考察折中系数alpha以及gamma参数对于rbf核的性能影响：

In [17]:
def test_LabelPropagation_rbf(*data):
    """Plot rbf-kernel LabelPropagation accuracy versus gamma, one curve per alpha.

    For each of 10 alpha values and 50 log-spaced gamma values a model is
    fitted on the partially labeled data and scored on the unlabeled samples.
    The scores are drawn on a log-scaled gamma axis and the figure is shown.

    Args:
        *data: ``X, y, unlabeled_indices`` -- the samples, their true labels,
            and the indices of the samples to treat as unlabeled.
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy so the true labels in ``y`` stay intact
    y_train[unlabeled_indices] = -1  # -1 marks a sample as unlabeled

    # The evaluation subset never changes, so slice it once instead of
    # once per (alpha, gamma) combination.
    X_unlabeled = X[unlabeled_indices]
    y_unlabeled = y[unlabeled_indices]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    # Train and plot: one score-vs-gamma curve per alpha.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            # NOTE(review): newer scikit-learn releases appear to have removed
            # the ``alpha`` argument from LabelPropagation -- confirm against
            # the installed version before running.
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha,
                                   kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X_unlabeled, y_unlabeled))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Configure the figure. The x axis carries the gamma values passed to
    # ``ax.plot`` above, so it is labeled gamma (alpha is shown in the legend).
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.set_title("LabelPropagation rbf kernel")
    ax.legend(loc="best")
    plt.show()

调用该测试函数：

In [None]:
# Reload the dataset and run the alpha/gamma sweep for the rbf kernel.
data = load_data()
test_LabelPropagation_rbf(*data)