In [1]:
import os
# Every relative data path used below ("train_labeled.h5", "test.h5", ...) is
# resolved against this directory, which is itself relative to the directory
# the kernel was started in.
os.chdir("../data/dzh")

import pandas as pd
import numpy as np
import pickle
import logging
# NOTE(review): the log file is a hard-coded absolute path; confirm it exists
# and is writable on the machine running this notebook.
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filename='/home/mountain/atec/notebook/log/semi.log', level=logging.INFO)

from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation,LabelSpreading
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb

## 数据加载

In [12]:
# Load the labeled training frame (label in column 0, features afterwards).
train_labeled = pd.read_hdf("train_labeled.h5", "train")

# # Optionally merge the pseudo-labeled samples produced by the active-learning
# # section into the labeled set
# train_pl = pd.read_pickle("train_pl")
# train_labeled = pd.concat([train_labeled, train_pl], ignore_index=True)

train_unlabeled = pd.read_hdf("train_unlabeled.h5", "train")
test = pd.read_hdf("test.h5", "test")

# Convert the labeled frame once, then split it into features and a label column vector.
labeled_matrix = np.array(train_labeled)
x_train_labeled = labeled_matrix[:, 1:]
y_train_labeled = labeled_matrix[:, 0].reshape(-1, 1)
x_test = np.array(test)

# Transductive learning: the rows to be predicted are appended to the unlabeled
# pool so that the model sees them during fitting.
x_train_unlabeled = np.vstack((np.array(train_unlabeled), x_test))

In [13]:
# Sanity check: shapes of the labeled, unlabeled (+ test) and test feature matrices.
for features in (x_train_labeled, x_train_unlabeled, x_test):
    print(features.shape)

(9000, 128)
(29000, 128)
(8000, 128)


## 主动学习
- 模型暂时用xgb，把无标签样本当做预测样本，再根据预测概率把其中一部分无标签样本当做有标签样本

In [None]:
# Build the xgboost inputs.
dtrain = xgb.DMatrix(x_train_labeled, label=y_train_labeled)
# Predict on the whole unlabeled pool (it already contains the test rows).
# The pseudo-labeling step filters x_train_unlabeled row by row with these
# predictions, so the prediction set must have exactly the same rows;
# predicting on x_test alone yields a mask of the wrong length.
dtest = xgb.DMatrix(x_train_unlabeled)

In [None]:
# Booster settings and training.
# `param` yields hard class labels (multi:softmax).
param = {'max_depth':5, 'eta':0.05, 'tree_method':'gpu_exact', 'eval_metric':'merror', 'objective':'multi:softmax', 'num_class':10}
# `param_prob` must differ from `param` only in its objective so that the
# probabilities belong to the same model as the labels. In particular eta must
# not be 0: with a zero learning rate no tree contributes anything, every class
# probability stays at 1/num_class and the confidence threshold selects nothing.
param_prob = dict(param, objective='multi:softprob')

num_round = 10

# cv_result = xgb.cv(params=param, dtrain=dtrain, num_boost_round=num_round, nfold=10, early_stopping_rounds=10)
bst_prob = xgb.train(params=param_prob, dtrain=dtrain, num_boost_round=num_round)
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [None]:
# Per-class probabilities from the softprob booster (its max over axis 1 is
# used as the confidence score further down).
test_predict_prob = bst_prob.predict(dtest)
# Hard class predictions from the softmax booster.
test_predict = bst.predict(dtest)

In [None]:
# Pair each predicted label (column 0) with the probability of its most likely
# class (column 1); a threshold on column 1 later decides which unlabeled rows
# are promoted to labeled rows.
predicted_label = pd.Series(test_predict)
top_probability = pd.Series(test_predict_prob.max(axis=1))
data = pd.concat([predicted_label, top_probability], axis=1)

In [None]:
# Preview the labeled frame; its column labels are reused for the pseudo-labeled frame below.
train_labeled.head()

In [None]:
# Keep only the rows whose top class probability exceeds 0.3 and attach the
# predicted label as the first column, mirroring the layout of train_labeled.
# NOTE(review): the mask has one entry per row of `data`, so `data` must hold
# predictions for exactly the rows of x_train_unlabeled - confirm.
confident = data[1] > 0.3
x_train_unlabeled_labeled = pd.DataFrame(x_train_unlabeled)[confident]
y_train_unlabeled_labeled = data[0][confident]
train_pl = pd.concat([y_train_unlabeled_labeled, x_train_unlabeled_labeled], axis=1)
train_pl.columns = train_labeled.columns

In [None]:
# Persist the pseudo-labeled samples; the commented-out lines in the data-loading
# cell read this file back with pd.read_pickle("train_pl").
train_pl.to_pickle("train_pl")

## 半监督学习，数据格式转换

In [14]:
# Assemble the semi-supervised training set: labeled rows first, then the
# unlabeled pool, whose rows carry the label -1.
y_train_unlabeled = np.full((len(x_train_unlabeled), 1), -1.0)
x_train  = np.concatenate((x_train_labeled,x_train_unlabeled),axis=0)
y_train = np.concatenate((y_train_labeled,y_train_unlabeled),axis=0).reshape(-1)

# Scaling and normalization are mandatory here - do not skip them.
# The min-max ranges are learned once, on x_train, and the same fitted scaler is
# applied to the test rows. Fitting a second scaler on x_test alone would put
# the test rows in a different feature space from the training rows.
scaler = MinMaxScaler().fit(x_train)
normalizer = Normalizer()
x_train = normalizer.fit_transform(scaler.transform(x_train))

# Start from the raw test frame rather than the previous x_test, so re-running
# this cell does not scale already-scaled values.
x_test = normalizer.transform(scaler.transform(np.array(test)))

## 逻辑回归，监督学习

In [None]:
# Supervised baseline: logistic regression on the labeled rows only.
# The labeled rows are the first len(x_train_labeled) rows of x_train, which has
# been scaled and normalized like x_test. Fitting on the raw x_train_labeled
# would train and predict in two different feature spaces.
n_labeled = len(x_train_labeled)
clf = LogisticRegression()
# ravel(): the estimator expects a 1-D label vector, not an (n, 1) column.
clf.fit(x_train[:n_labeled], y_train_labeled.ravel())
y_predict = clf.predict(x_test)

## sklearn图半监督学习，改造源码，打印一些输出
- 内存消耗很大，很慢
- 结果：用rbf核，速度很慢，计算出来全是同一个类别；用knn核，速度很快，但是效果没有很好

In [16]:
# Graph-based label spreading with the k-nearest-neighbours kernel.
logging.info("start!")
clf = LabelSpreading(kernel="knn", tol=0.0001, n_jobs=-1)
clf.fit(x_train, y_train)

LabelSpreading(alpha=0.2, gamma=20, kernel='knn', max_iter=30, n_jobs=-1,
        n_neighbors=7, tol=0.0001)

In [17]:
# One transduced label per row that was passed to fit.
clf.transduction_.shape

(38000,)

In [19]:
# The test rows were appended last when x_train was assembled, so their
# transduced labels are the final len(x_test) entries. The offset is derived
# from the data instead of being hard-coded, so it stays correct if the
# labeled or unlabeled set changes size.
n_test = len(x_test)
y_predict = clf.transduction_[len(clf.transduction_) - n_test:]

#### 测试一个小样本

In [None]:
# Draw a shuffled 10% sample of (label, features) rows for a quick experiment.
# A fixed random_state makes the sample, and every result computed from it,
# reproducible across runs.
# Note: x_train / y_train are overwritten, so re-running this cell samples
# 10% of the previous sample.
y_x_train = pd.concat([pd.Series(y_train), pd.DataFrame(x_train)], axis=1)
y_x_train = y_x_train.sample(frac=0.1, random_state=0).reset_index(drop=True)
y_train = np.array(y_x_train.iloc[:, 0])
x_train = np.array(y_x_train.iloc[:, 1:])

In [None]:
# Label spreading with the estimator's default kernel on the current
# x_train / y_train, followed by prediction of the test rows.
# clf = LabelPropagation(tol=0.0001, n_jobs=-1)
logging.info("start!")
clf = LabelSpreading(tol=0.0001, n_jobs=-1)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)

## 生成式方法的半监督学习
- 基于GMM分布假设
- 注意PCA降维，降低计算开销
- 效果估计也不好

## 生成式方法的半监督学习
- 基于GMM分布假设
- 注意PCA降维，降低计算开销
- 效果估计也不好

## 保存结果及模型

In [21]:
# Build the submission table (test index as "Id", predicted class as "y")
# and write it out without the frame's own index.
res = pd.concat(
    [pd.Series(test.index.values, dtype='int32'), pd.Series(y_predict, dtype='int32')],
    axis=1,
    keys=["Id", "y"]
)
res.to_csv("result_transduc.csv", index=False)

# with open("model_0516", 'w') as f:
#     pickle.dump(obj=clf, file=f)

# logging.info("end!")

## sklearn semi supervised learning example

In [None]:
"""
===================================================
Label Propagation digits: Demonstrating performance
===================================================

This example demonstrates the power of semisupervised learning by
training a Label Spreading model to classify handwritten digits
with sets of very few labels.

The handwritten digit dataset has 1797 total points. The model will
be trained using all points, but only 30 will be labeled. Results
in the form of a confusion matrix and a series of metrics over each
class will be very good.

At the end, the top 10 most uncertain predictions will be shown.
"""
%matplotlib inline

print(__doc__)

# Authors: Clay Woolam <clay@woolam.org>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

from sklearn import datasets
from sklearn.semi_supervised import label_propagation

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the digits dataset and shuffle the sample indices with a fixed seed.
digits = datasets.load_digits()
rng = np.random.RandomState(0)
indices = np.arange(len(digits.data))
rng.shuffle(indices)

# Work on the first 330 shuffled samples only.
subset = indices[:330]
X, y, images = digits.data[subset], digits.target[subset], digits.images[subset]

n_total_samples = len(y)
n_labeled_points = 30

# Positions within the subset; everything after the first 30 is treated as unlabeled.
indices = np.arange(n_total_samples)
unlabeled_set = indices[n_labeled_points:]

# #############################################################################
# Hide the labels of the unlabeled positions by overwriting them with -1
y_train = y.copy()
y_train[unlabeled_set] = -1

In [None]:
# #############################################################################
# Fit LabelSpreading on all samples, then compare the transduced labels of the
# unlabeled positions with their true labels
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5).fit(X, y_train)
true_labels = y[unlabeled_set]
predicted_labels = lp_model.transduction_[unlabeled_set]

In [None]:
# #############################################################################
# Text report: sample counts, per-class metrics, confusion matrix
print("Label Spreading model: %d labeled & %d unlabeled points (%d total)" %
      (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))

print(classification_report(true_labels, predicted_labels))

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)
print("Confusion matrix")
print(cm)

# #############################################################################
# Entropy of each sample's transduced label distribution, used as its uncertainty
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# #############################################################################
# The 10 samples with the highest entropy, i.e. the most uncertain ones
uncertainty_index = np.argsort(pred_entropies)[-10:]

# #############################################################################
# Show them on a 2 x 5 grid, titled with the predicted and the true label
f = plt.figure(figsize=(7, 5))
for position, image_index in enumerate(uncertainty_index, 1):
    sub = f.add_subplot(2, 5, position)
    sub.imshow(images[image_index], cmap=plt.cm.gray_r)
    sub.set_xticks([])
    sub.set_yticks([])
    sub.set_title('predict: %i\ntrue: %i' % (
        lp_model.transduction_[image_index], y[image_index]))

f.suptitle('Learning with small amount of labeled data')
plt.show()