In [1]:
import os
import sys
import random
import numpy as np
import argparse
import chainer
import pickle
import importlib
from collections import namedtuple, defaultdict
from typing import List, Dict, Tuple, Optional

from qanta import logging
from qanta.config import conf
from qanta.guesser.abstract import AbstractGuesser
from qanta.util.multiprocess import _multiprocess
from qanta.buzzer import configs
from qanta.buzzer.progress import ProgressBar
from qanta.buzzer.trainer import Trainer
from qanta.buzzer.iterator import QuestionIterator
from qanta.buzzer.util import load_quizbowl, GUESSERS
from qanta.buzzer.models import MLP, RNN
from qanta.buzzer import constants as bc
from qanta.util import constants as c
from qanta import logging

# Module-level logger obtained from the project's logging wrapper.
log = logging.get(__name__)
# Number of entries in GUESSERS (imported from qanta.buzzer.util).
N_GUESSERS = len(GUESSERS)
# Record type for one batch: question ids, answers, padding mask,
# feature vectors and per-step results.
Batch = namedtuple('Batch', ['qids', 'answers', 'mask', 'vecs', 'results'])

ERROR (theano.gpuarray): pygpu was configured but could not be imported or is too old (version 0.6 or higher required)
NoneType: None


In [2]:
# load_quizbowl() returns two values; all_guesses is later indexed by fold
# name and option2id is handed to the iterators unchanged.
# NOTE(review): option2id presumably maps answer strings to integer ids
# (the log reports "Number of options") - confirm in qanta.buzzer.util.
option2id, all_guesses = load_quizbowl()

2017-05-25 17:45:31,379 - qanta.buzzer.util - INFO - Merging guesser DataFrames.
2017-05-25 17:45:31,380 - qanta.buzzer.util - INFO - Merged buzzertrain exists, skipping.
2017-05-25 17:45:31,381 - qanta.buzzer.util - INFO - Merged buzzerdev exists, skipping.
2017-05-25 17:45:31,382 - qanta.buzzer.util - INFO - Merged dev exists, skipping.
2017-05-25 17:45:31,383 - qanta.buzzer.util - INFO - Merged test exists, skipping.
2017-05-25 17:45:31,384 - qanta.buzzer.util - INFO - Merged expo exists, skipping.
2017-05-25 17:45:31,385 - qanta.buzzer.util - INFO - Loading data
2017-05-25 17:45:41,061 - qanta.buzzer.util - INFO - Number of options 8247
2017-05-25 17:46:19,307 - qanta.buzzer.util - INFO - Loading buzzertrain guesses
2017-05-25 17:46:26,957 - qanta.buzzer.util - INFO - Loading buzzerdev guesses
2017-05-25 17:46:28,476 - qanta.buzzer.util - INFO - Loading dev guesses
2017-05-25 17:46:34,721 - qanta.buzzer.util - INFO - Loading test guesses
2017-05-25 17:46:34,747 - qanta.buzzer.util 

# Don't rerun the cells above — loading the guesses takes about a minute (see the log timestamps)

# Buzzer without positional features

In [12]:
import os
import sys
import random
import numpy as np
import pickle
from collections import defaultdict, namedtuple
from typing import List, Dict, Tuple, Optional
from qanta.config import conf
from qanta.buzzer.util import GUESSERS
from qanta.buzzer import constants as bc
from qanta.util.multiprocess import _multiprocess
from qanta import logging

# Record type for one batch: question ids, answers, padding mask,
# feature vectors and per-step results.
Batch = namedtuple('Batch', ['qids', 'answers', 'mask', 'vecs', 'results'])

# Number of entries in GUESSERS (imported from qanta.buzzer.util).
N_GUESSERS = len(GUESSERS)
# Value of the 'n_guesses' entry in the buzzer section of the project config;
# used below as the per-guesser slot count when padding score vectors.
N_GUESSES = conf['buzzer']['n_guesses']

# Module-level logger obtained from the project's logging wrapper.
log = logging.get(__name__)

class QuestionIteratorNoPos(object):
    '''Batch iterator over buzzer examples, without positional features.

    Every example is converted into one 14-dimensional feature vector per
    time step, zero-padded along time to a multiple of ``bucket_size`` and
    batched together with examples of the same padded length.

    Each batch returned by ``next_batch`` contains:
        qids: tuple, (batch_size,)
        answers: tuple, (batch_size,)
        mask: xp.float32, (length, batch_size)
        vecs: xp.float32, (length, batch_size, n_input)
        results: xp.int32, (length, batch_size, N_GUESSERS + 1)
    '''

    def __init__(self, dataset: list, option2id: Dict[str, int], batch_size:int,
            bucket_size=4, step_size=1, neg_weight=1, shuffle=True, pkl_dir=None):
        '''Build the batches, or restore them from ``pkl_dir`` when it exists.

        dataset: sequence of examples handed to ``_process_example``
        option2id: stored on the instance; not read by this class
        batch_size: maximum number of examples per batch
        bucket_size: sequences are padded to a multiple of this length
        step_size: forwarded to ``dense_vector``
        neg_weight: value of the extra "not buzzing" column when no guesser
            is correct at a time step
        shuffle: whether ``finalize`` shuffles the batch order
        pkl_dir: optional path of a pickle file used to cache the batches
        '''
        self.dataset = dataset
        self.option2id = option2id
        self.batch_size = batch_size
        self.bucket_size = bucket_size
        self.step_size = step_size
        self.neg_weight = neg_weight
        self.shuffle = shuffle
        self.epoch = 0
        self.iteration = 0
        self.batch_index = 0
        self.is_end_epoch = False
        sys.stdout.flush()
        if pkl_dir is not None and os.path.exists(pkl_dir):
            # SECURITY: pickle.load can execute arbitrary code; only point
            # pkl_dir at cache files this project wrote itself.
            with open(pkl_dir, 'rb') as f:
                self.batches = pickle.load(f)
            log.info('Finish loading batches')
        else:
            log.info('Creating batches (nopos)')
            self.create_batches()
            if pkl_dir is not None:
                with open(pkl_dir, 'wb') as f:
                    pickle.dump(self.batches, f)
            log.info('Finish creating batches')
        # Without this, an iterator restored from the pickle cache would have
        # no n_input attribute (it was only set while processing examples).
        self._sync_n_input()

    def _sync_n_input(self):
        '''Set ``n_input`` from the feature width of the first stored batch.'''
        if self.batches:
            # Index 3 is the ``vecs`` field; each entry is a
            # (padded_length, n_input) array. Positional access also works
            # when the cached batches are plain tuples.
            self.n_input = int(np.shape(self.batches[0][3][0])[-1])

    def get_guesser_acc(self, i, length):
        '''Look up a guesser-accuracy value for step ``i`` of ``length``.

        Returns ``bc.GUESSER_ACC[-1]`` when ``i == length`` and
        ``bc.GUESSER_ACC[0]`` when ``i == 0``. Otherwise it combines the two
        ``bc.GUESSER_ACC`` entries around the first ``bc.GUESSER_ACC_POS``
        value that exceeds ``i / length``.

        NOTE(review): if this is meant to be a linear interpolation, the
        weights look swapped and are not divided by the interval width; and
        when no position exceeds the ratio, ``pos`` stays 0 so ``pos - 1``
        wraps to the last entry. The numeric behaviour is left unchanged
        because the ``bc`` tables are not visible here - confirm before use.
        '''
        if i == length:
            return bc.GUESSER_ACC[-1]
        if i == 0:
            return bc.GUESSER_ACC[0]
        ratio = i / length
        pos = 0
        # Separate loop variable so the ``i`` argument is not shadowed.
        for idx, r in enumerate(bc.GUESSER_ACC_POS):
            if r > ratio:
                pos = idx
                break
        acc = bc.GUESSER_ACC[pos - 1] * (ratio - bc.GUESSER_ACC_POS[pos - 1]) +\
                bc.GUESSER_ACC[pos] * (bc.GUESSER_ACC_POS[pos] - ratio)
        return acc

    def dense_vector(self, dicts: List[List[Dict[str, float]]],
            wordvecs: List[List[np.ndarray]], step_size=1) -> List[List[float]]:
        '''Generate dense feature vectors from a sequence of guess dictionaries.

        dicts: for every time step, one ``{guess: score}`` dictionary per
            guesser
        wordvecs: accepted for interface compatibility; not used
        step_size: accepted for interface compatibility. Only the score
            vector of the immediately preceding step is ever read, so the
            value does not influence the result.

        Returns one list of 14 features per time step, built from the
        concatenated (descending-sorted, zero-padded) guesser scores: their
        top three values, gaps, mean and variance, "new guess" flags, and the
        change relative to the previous step.

        Raises ValueError when a time step does not hold exactly
        ``N_GUESSERS`` dictionaries. The concatenated score vector must have
        at least three entries.
        '''
        # Scores of the previous time step; all zeros before the first step.
        prev_vec = [0. for _ in range(N_GUESSERS * N_GUESSES)]
        vecs = []
        for i in range(len(dicts)):
            if len(dicts[i]) != N_GUESSERS:
                # Report the offending step's guesser count, not the number
                # of time steps.
                raise ValueError("Inconsistent number of guessers ({0}, {1}).".format(
                    N_GUESSERS, len(dicts[i])))
            vec = []
            diff_vec = []
            isnew_vec = []
            for j in range(N_GUESSERS):
                ranked = sorted(dicts[i][j].items(), key=lambda x: x[1], reverse=True)
                for guess, score in ranked:
                    vec.append(score)
                    if i > 0 and guess in dicts[i-1][j]:
                        # Guess was already present one step earlier.
                        diff_vec.append(score - dicts[i-1][j][guess])
                        isnew_vec.append(0)
                    else:
                        diff_vec.append(score)
                        isnew_vec.append(1)
                # Pad this guesser's slots up to N_GUESSES entries.
                n_pad = max(N_GUESSES - len(ranked), 0)
                vec.extend([0] * n_pad)
                diff_vec.extend([0] * n_pad)
                isnew_vec.extend([0] * n_pad)
            features = [sum(isnew_vec), np.average(vec), vec[0], vec[1], vec[2],
                    isnew_vec[0], isnew_vec[1], vec[0] - vec[1], vec[1] -
                    vec[2], isnew_vec[2], diff_vec[0],
                    vec[0] - prev_vec[0], np.var(vec),
                    np.var(prev_vec)]

            vecs.append(features)
            prev_vec = vec
        return vecs

    def _process_example(self, qid, answer, dicts, results, wordvecs):
        '''Turn one example into padded arrays.

        Returns ``((qid, answer, mask, vecs_padded, results_padded),
        padded_length)`` where the arrays are zero-padded along time to the
        next multiple of ``bucket_size`` and ``mask`` marks real steps with 1.
        Also records the feature width in ``self.n_input``.

        Raises ValueError when ``results`` does not have ``N_GUESSERS``
        columns or when ``dicts`` and ``results`` differ in length.
        '''
        results = np.asarray(results, dtype=np.int32)
        length, n_guessers = results.shape

        if n_guessers != N_GUESSERS:
            raise ValueError(
                "Inconsistent number of guessers ({0}, {1}).".format(
                    N_GUESSERS, n_guessers))

        # append the not buzzing action to each time step
        # not buzzing = 1 when no guesser is correct
        new_results = []
        for i in range(length):
            not_buzz = int(not any(results[i] == 1)) * self.neg_weight
            new_results.append(np.append(results[i], not_buzz))
        results = np.asarray(new_results, dtype=np.int32)

        if len(dicts) != length:
            raise ValueError("Inconsistant shape of results and vecs.")
        vecs = self.dense_vector(dicts, wordvecs, self.step_size)
        vecs = np.asarray(vecs, dtype=np.float32)
        assert length == vecs.shape[0]
        self.n_input = len(vecs[0])

        # Round length up to the next multiple of bucket_size.
        padded_length = -((-length) // self.bucket_size) * self.bucket_size
        vecs_padded = np.zeros((padded_length, self.n_input))
        vecs_padded[:length,:self.n_input] = vecs

        results_padded = np.zeros((padded_length, (N_GUESSERS + 1)))
        results_padded[:length, :(N_GUESSERS + 1)] = results

        mask = [1 for _ in range(length)] + \
               [0 for _ in range(padded_length - length)]

        example = (qid, answer, mask, vecs_padded, results_padded)
        return example, padded_length

    def create_batches(self):
        '''Process the whole dataset and group it into ``self.batches``.

        Examples are bucketed by padded length so every batch stacks into a
        rectangular array, then each bucket is cut into chunks of at most
        ``batch_size`` examples.
        '''
        self.batches = []
        buckets = defaultdict(list)
        returns = _multiprocess(self._process_example, self.dataset,
                info="creat batches", multi=False)
        for example, padded_length in returns:
            buckets[padded_length].append(example)

        for examples in buckets.values():
            for i in range(0, len(examples), self.batch_size):
                qids, answers, mask, vecs, results = \
                        zip(*examples[i : i + self.batch_size])
                batch = Batch(qids, answers, mask, vecs, results)
                self.batches.append(batch)

    @property
    def size(self):
        '''Number of batches.'''
        return len(self.batches)

    def finalize(self, reset=False):
        '''Shuffle the batches (if enabled) and optionally reset the counters.'''
        if self.shuffle:
            random.shuffle(self.batches)
        if reset:
            self.epoch = 0
            self.iteration = 0
            self.batch_index = 0

    def next_batch(self, xp, train=True):
        '''Return the next batch as time-major ``xp`` arrays.

        xp: array module (e.g. numpy) used to build the arrays
        train: accepted for interface compatibility; not used

        Advances ``iteration``, ``epoch`` and ``batch_index`` (wrapping at
        the end) and sets ``is_end_epoch`` when the returned batch is the
        last one of the epoch.
        '''
        self.iteration += 1
        if self.batch_index == 0:
            self.epoch += 1
        self.is_end_epoch = (self.batch_index == self.size - 1)
        qids, answers, mask, vecs, results = self.batches[self.batch_index]

        vecs = xp.asarray(vecs, dtype=xp.float32).swapaxes(0, 1) # length * batch_size * dim
        results = xp.asarray(results, dtype=xp.int32).swapaxes(0, 1) # length * batch_size * (n_guessers + 1)
        mask = xp.asarray(mask, dtype=xp.float32).T # length * batch_size
        # results = results * 2 - 1 # convert from (0, 1) to (-1, 1)

        self.batch_index = (self.batch_index + 1) % self.size
        batch = Batch(qids, answers, mask, vecs, results)
        return batch

    @property
    def epoch_detail(self):
        '''Tuple of (iteration count, iteration count divided by batch count).'''
        return self.iteration, self.iteration * 1.0 / self.size

In [16]:
# Build one no-positional-feature iterator per buzzer input fold, keyed by
# fold name. No pkl_dir is passed, so batches are recomputed on every run
# (several minutes for the largest fold, per the log timestamps below).
iterators = dict()
for fold in c.BUZZER_INPUT_FOLDS:
    iterators[fold] = QuestionIteratorNoPos(all_guesses[fold], option2id,
        batch_size=128, step_size=1, neg_weight=1)
# Feature width per time step, as recorded while processing the examples.
print(iterators[c.BUZZER_TRAIN_FOLD].n_input)

2017-05-25 18:17:37,942 - __main__ - INFO - Creating batches (nopos)
[creat batches] (1) done: 23210/23211
2017-05-25 18:23:01,426 - __main__ - INFO - Finish creating batches
2017-05-25 18:23:01,427 - __main__ - INFO - Creating batches (nopos)
[creat batches] (1) done: 7586/7587
2017-05-25 18:24:19,470 - __main__ - INFO - Finish creating batches
2017-05-25 18:24:19,471 - __main__ - INFO - Creating batches (nopos)
[creat batches] (1) done: 2088/2089
2017-05-25 18:24:48,339 - __main__ - INFO - Finish creating batches
2017-05-25 18:24:48,340 - __main__ - INFO - Creating batches (nopos)
[creat batches] (1) done: 1411/1412
2017-05-25 18:25:16,361 - __main__ - INFO - Finish creating batches
2017-05-25 18:25:16,363 - __main__ - INFO - Creating batches (nopos)
[creat batches] (1) done: 68/69
2017-05-25 18:25:16,935 - __main__ - INFO - Finish creating batches


14


In [17]:
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import cuda

class RNN(chainer.Chain):
    '''LSTM layer followed by a linear layer, applied to time-major input.'''

    def __init__(self, n_input, n_hidden, n_output):
        '''Register an ``n_input -> n_hidden`` LSTM and an
        ``n_hidden -> n_output`` linear layer as child links.'''
        layers = dict(
            rnn=L.LSTM(n_input, n_hidden),
            linear=L.Linear(n_hidden, n_output))
        super(RNN, self).__init__(**layers)

    @property
    def xp(self):
        '''Array module matching the linear layer: numpy on CPU, else cupy.'''
        on_cpu = not cuda.available or self.linear._cpu
        return np if on_cpu else cuda.cupy

    def get_device(self):
        '''Device id of the linear layer, or -1 when running on CPU.'''
        on_cpu = not cuda.available or self.linear._cpu
        return -1 if on_cpu else self.linear._device_id

    def __call__(self, xs, train=True):
        '''Run the LSTM over ``xs`` and project every step's hidden state.

        xs: array whose shape unpacks as (length, batch_size, dim)
        train: accepted but not used

        Returns the linear layer's output with the time and batch axes
        flattened into one leading axis of size ``length * batch_size``.
        '''
        n_steps, n_batch, _ = xs.shape
        # Start every sequence from a fresh recurrent state.
        self.rnn.reset_state()
        hidden_states = [self.rnn(step) for step in xs]
        stacked = F.stack(hidden_states, axis=0)
        flat = F.reshape(stacked, (n_steps * n_batch, -1))
        return self.linear(flat)

In [25]:
# Configure and build the 25-hidden-unit RNN buzzer on the no-positional
# features, then move it to GPU 0.
cfg = configs.rnn()
cfg.n_hidden = 25
cfg.model_dir = 'output/buzzer/rnn_nopos_25.npz'
# Input width comes from the training iterator; the model has 2 outputs.
model = RNN(iterators[c.BUZZER_TRAIN_FOLD].n_input, cfg.n_hidden, 2)

chainer.cuda.get_device(0).use()
model.to_gpu(0)

# Persist the config next to the model. The context manager closes (and
# flushes) the file; the original left the handle open.
# NOTE(review): cfg.ckp_dir is not overridden here, so this writes to the
# default path from configs.rnn() - confirm that is intended.
with open(cfg.ckp_dir, 'wb') as ckp_file:
    pickle.dump(cfg, ckp_file)

In [26]:
# Reuse the path stored on cfg (set in the previous cell to
# 'output/buzzer/rnn_nopos_25.npz') instead of repeating the literal, so the
# two cannot drift apart.
trainer = Trainer(model, cfg.model_dir)
# NOTE(review): the trailing 25 is presumably the number of epochs - confirm
# against Trainer.run.
trainer.run(iterators[c.BUZZER_TRAIN_FOLD], iterators[c.BUZZER_DEV_FOLD], 25)

2017-05-25 18:57:42,700 - qanta.buzzer.trainer - INFO - epoch 0
2017-05-25 18:58:21,535 - qanta.buzzer.trainer - INFO - train loss: 0.00  acc: 0.66  
2017-05-25 18:58:26,778 - qanta.buzzer.trainer - INFO - eval loss: 0.01  acc: 0.75  
2017-05-25 18:58:26,783 - qanta.buzzer.trainer - INFO - epoch 1
2017-05-25 18:58:55,687 - qanta.buzzer.trainer - INFO - train loss: 0.00  acc: 0.76  
2017-05-25 18:59:04,825 - qanta.buzzer.trainer - INFO - eval loss: 0.01  acc: 0.80  
2017-05-25 18:59:04,833 - qanta.buzzer.trainer - INFO - epoch 2
2017-05-25 18:59:33,868 - qanta.buzzer.trainer - INFO - train loss: 0.00  acc: 0.80  
2017-05-25 18:59:41,485 - qanta.buzzer.trainer - INFO - eval loss: 0.01  acc: 0.80  
2017-05-25 18:59:41,491 - qanta.buzzer.trainer - INFO - epoch 3
2017-05-25 19:00:14,217 - qanta.buzzer.trainer - INFO - train loss: 0.00  acc: 0.81  
2017-05-25 19:00:20,910 - qanta.buzzer.trainer - INFO - eval loss: 0.01  acc: 0.80  
2017-05-25 19:00:20,916 - qanta.buzzer.trainer - INFO - epoc

In [27]:
# Path template for the pickled buzzes: {0} is the fold name, {1} the model tag.
BUZZES_DIR='output/buzzer/{0}_buzzes_{1}.pkl'
# Run the trained model on every generation fold and pickle the result.
for fold in c.BUZZER_GENERATION_FOLDS:
    buzzes = trainer.test(iterators[fold])
    log.info('Buzzes generated. Size {0}.'.format(len(buzzes)))
    buzzes_dir = BUZZES_DIR.format(fold, 'rnn_nopos_25')
    with open(buzzes_dir, 'wb') as outfile:
        pickle.dump(buzzes, outfile)
    log.info('Buzzes saved to {0}.'.format(buzzes_dir))

2017-05-25 19:14:02,950 - __main__ - INFO - Buzzes generated. Size 7587.
2017-05-25 19:14:03,094 - __main__ - INFO - Buzzes saved to output/buzzer/buzzerdev_buzzes_rnn_nopos_25.pkl.
2017-05-25 19:14:07,764 - __main__ - INFO - Buzzes generated. Size 2089.
2017-05-25 19:14:07,818 - __main__ - INFO - Buzzes saved to output/buzzer/dev_buzzes_rnn_nopos_25.pkl.
2017-05-25 19:14:11,901 - __main__ - INFO - Buzzes generated. Size 1412.
2017-05-25 19:14:11,931 - __main__ - INFO - Buzzes saved to output/buzzer/test_buzzes_rnn_nopos_25.pkl.
2017-05-25 19:14:12,174 - __main__ - INFO - Buzzes generated. Size 69.
2017-05-25 19:14:12,176 - __main__ - INFO - Buzzes saved to output/buzzer/expo_buzzes_rnn_nopos_25.pkl.


In [28]:
# Sanity check: feature width of the training iterator.
print(iterators[c.BUZZER_TRAIN_FOLD].n_input)

14
