In [294]:
#import classes
from random import shuffle
import gensim
import os
import collections
import smart_open
import random
import re

In [295]:
def read_labels(fname, nums=False):
    arr = []
    with smart_open.smart_open(fname) as f:
        for line in enumerate(f):
            s = line[1].decode('utf-8')
            if nums:
                yield line[1].decode('utf-8')
            else:
                if "UNK" in s:
                    yield 3
                elif "NON" in s:
                    yield 2
                elif "PAST" in s:
                    yield 1
                else:
                    yield 0
    

In [296]:
train_nums = list(read_labels("labels-train.txt"))
test_nums = list(read_labels("labels-test.txt"))
train_labels = list(read_labels("labels-train.txt", nums=True))
test_labels = list(read_labels("labels-test.txt", nums=True))

In [297]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [298]:
train_corpus = list(read_corpus("smoking-train.txt"))
test_corpus = list(read_corpus("smoking-test.txt",tokens_only=True))
model = gensim.models.doc2vec.Doc2Vec(size=100, min_count=2, iter=55)
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=100)

CPU times: user 1min 7s, sys: 1.61 s, total: 1min 9s
Wall time: 30.6 s


22380916

In [299]:
train_vecs = model.docvecs
test_vecs = []
for elem in test_corpus:
    a = list(model.infer_vector(elem))
    test_vecs.append(a)

In [300]:
import xgboost as xgb
import scipy.sparse
import numpy as np
from sklearn.calibration import CalibratedClassifierCV

In [301]:
xg_train = xgb.DMatrix(train_vecs, label=train_nums)
xg_test = xgb.DMatrix(test_vecs, label=test_nums)


In [302]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 4
print(param)

{'objective': 'multi:softmax', 'max_depth': 6, 'silent': 1, 'num_class': 4}


In [303]:
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 10
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
print('Test error using softmax = {}'.format(error_rate))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bst.predict(xg_test)#.reshape(np.array(test_labels).shape[0], 6)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred != test_labels) / np.array(test_labels).shape[0]
print('Test error using softprob = {}'.format(error_rate))

[0]	train-merror:0.102041	test-merror:0.376238
[1]	train-merror:0.053061	test-merror:0.366337
[2]	train-merror:0.026531	test-merror:0.346535
[3]	train-merror:0.012245	test-merror:0.346535
[4]	train-merror:0.010204	test-merror:0.346535
[5]	train-merror:0.010204	test-merror:0.346535
[6]	train-merror:0.002041	test-merror:0.366337
[7]	train-merror:0.002041	test-merror:0.376238
[8]	train-merror:0	test-merror:0.376238
[9]	train-merror:0	test-merror:0.376238
Test error using softmax = 0.00909090909090909
[0]	train-merror:0.102041	test-merror:0.376238
[1]	train-merror:0.053061	test-merror:0.366337
[2]	train-merror:0.026531	test-merror:0.346535


  


[3]	train-merror:0.012245	test-merror:0.346535
[4]	train-merror:0.010204	test-merror:0.346535
[5]	train-merror:0.010204	test-merror:0.346535
[6]	train-merror:0.002041	test-merror:0.366337
[7]	train-merror:0.002041	test-merror:0.376238
[8]	train-merror:0	test-merror:0.376238
[9]	train-merror:0	test-merror:0.376238
Test error using softprob = 0.009900990099009901


  app.launch_new_instance()


In [321]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [306]:
rdf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state = 12)
_ = rdf.fit(train_vecs, train_nums)
np.mean(rdf.predict(test_vecs) == test_nums)

0.62376237623762376

In [338]:
param_grid = {'max_depth': np.arange(3, 10)}

tree = GridSearchCV(DecisionTreeClassifier(random_state = 42, class_weight = 'balanced'), param_grid)

tree.fit(train_vecs, train_nums)
tree_preds = (tree.predict(test_vecs))
np.mean(tree_preds == test_nums)

0.6633663366336634