In [257]:
# Import the modules used throughout the notebook
from random import shuffle
import gensim
import os
import collections
import smart_open
import random
import re

In [258]:
def read_labels(fname, nums=False):
    """Stream one label per line from the label file ``fname``.

    Every line is decoded as UTF-8.  What is yielded depends on ``nums``:

    * ``nums=False`` (default): an integer class id chosen by substring match
      on the line, tested in this order -- ``"UNK"`` -> 3, ``"NON"`` -> 2,
      ``"PAST"`` -> 1, anything else -> 0.
    * ``nums=True``: the decoded line itself, unchanged (its line terminator
      is kept).  NOTE: despite its name, the flag selects the *text* form and
      not the numeric one; the name is kept so existing callers still work.

    Args:
        fname: Location handed to ``smart_open.smart_open``.
        nums: Selects the yielded form, see above.

    Yields:
        ``int`` class ids, or ``str`` label lines when ``nums`` is true.
    """
    with smart_open.smart_open(fname) as f:
        for raw_line in f:
            # The file is iterated as bytes; decode once and reuse the result.
            text = raw_line.decode('utf-8')
            if nums:
                yield text
            elif "UNK" in text:
                yield 3
            elif "NON" in text:
                yield 2
            elif "PAST" in text:
                yield 1
            else:
                yield 0
    

In [259]:
# Integer class ids (the default mode of read_labels) for the train and test
# label files, in that order.
train_nums, test_nums = [
    list(read_labels(path)) for path in ("labels-train.txt", "labels-test.txt")
]
# Decoded label text (nums=True) for the same two files.
train_labels, test_labels = [
    list(read_labels(path, nums=True))
    for path in ("labels-train.txt", "labels-test.txt")
]

In [None]:
def read_corpus(fname, tokens_only=False):
    """Yield one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` as ISO-8859-1 text and every
    line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the corpus file, one document per line.
        tokens_only: When true, yield the bare token list.  Otherwise yield a
            ``TaggedDocument`` whose single tag is the zero-based line index.

    Yields:
        Token lists or ``TaggedDocument`` objects, in file order.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as handle:
        for doc_id, text in enumerate(handle):
            words = gensim.utils.simple_preprocess(text)
            if tokens_only:
                yield words
                continue
            # Training documents carry their line index as the tag.
            yield gensim.models.doc2vec.TaggedDocument(words, [doc_id])

In [None]:
# Load both corpora fully into memory.  The training file is read with the
# read_corpus default (tagged documents); the test file is read as plain
# token lists (tokens_only=True).
train_corpus = list(read_corpus("smoking-train.txt"))
test_corpus = list(read_corpus("smoking-test.txt",tokens_only=True))
# NOTE(review): the constructor is given iter=55 while train() below is
# called with epochs=100, so two different pass counts are configured --
# confirm which one is intended.  `size` / `iter` are presumably the older
# gensim keyword names (newer releases use vector_size / epochs) -- TODO
# confirm against the installed gensim version.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)
model.build_vocab(train_corpus)
# %time is an IPython line magic: it runs the statement and reports how long
# it took.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=100)

In [None]:
# Keep a handle on the model's trained per-document vectors, then infer one
# vector for every tokenised test document and store it as a plain list.
train_vecs = model.docvecs
test_vecs = [list(model.infer_vector(tokens)) for tokens in test_corpus]

In [None]:
import xgboost as xgb
import scipy.sparse
import numpy as np
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# xgb.DMatrix is documented to take array-like numeric data.  `train_vecs` is
# gensim's doc-vector container rather than an array, so pull out one row per
# training document (read_corpus tagged them 0 .. len(train_corpus) - 1).
# `test_vecs` is a list of per-document lists and is converted the same way.
train_features = np.asarray(
    [train_vecs[doc_id] for doc_id in range(len(train_corpus))],
    dtype=np.float32,
)
test_features = np.asarray(test_vecs, dtype=np.float32)

# Fail early, with a readable message, if documents and labels are misaligned.
assert train_features.shape[0] == len(train_nums), \
    "number of training documents does not match number of training labels"
assert test_features.shape[0] == len(test_nums), \
    "number of test documents does not match number of test labels"

xg_train = xgb.DMatrix(train_features, label=train_nums)
xg_test = xgb.DMatrix(test_features, label=test_nums)


In [None]:
# Booster settings for xgboost, built in one literal.
param = {
    # softmax multi-class classification
    'objective': 'multi:softmax',
    # maximum depth of each tree
    'max_depth': 2,
    'eta': 1,
    # NOTE(review): `silent` is presumably superseded by `verbosity` in newer
    # xgboost releases -- TODO confirm against the installed version.
    'silent': 1,
    # four classes, matching the ids 0-3 produced by read_labels
    'num_class': 4,
}
print(param)

In [None]:
# True class ids of the test documents as an array, so the element-wise
# comparisons below work.  (`test_Y` was previously referenced without ever
# being defined in this notebook.)
test_Y = np.asarray(test_nums)

watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 5

# --- Pass 1: softmax, predictions are class ids ---------------------------
# Set the objective explicitly so re-running this cell does not inherit the
# 'multi:softprob' value assigned further down.
param['objective'] = 'multi:softmax'
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.mean(pred != test_Y)
print('Test error using softmax = {}'.format(error_rate))

# --- Pass 2: softprob, predictions are per-class probabilities -----------
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Depending on the xgboost version the prediction may come back flat, so
# reshape to (ndata, nclass).  The class count comes from the booster
# settings rather than a hard-coded number (it used to be 6, which
# contradicts num_class = 4).
pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], param['num_class'])
pred_label = np.argmax(pred_prob, axis=1)
# Score the softprob labels (previously this re-scored the softmax `pred`).
error_rate = np.mean(pred_label != test_Y)
print('Test error using softprob = {}'.format(error_rate))