In [57]:
%reset -f

import sys
import time
from svector import svector
import operator

def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label=="+" else -1, words.split())

def make_vector(words):
    v = svector()
    for word in words:
        v[word] += 1
    v['<bias>'] = -5
    return v
    
def test(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        err += label * (model.dot(make_vector(words))) <= 0
    return err/i  # i is |D| now
            
def train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    avgModel = svector()
    c=0
#     print('initial model: {}'.format(model))

    for it in range(1, epochs+1):
#     for it in range(1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words)
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                avgModel += c*label*sent
            c += 1
#         print('len of sent: {} len of model: {} and avgmodel: {}'.format(len(sent), len(model), len(avgModel)))
            
        dev_err = test(devfile, model)
        dev_err_avg = test(devfile, c*model - avgModel)
        
        best_err = min(best_err, dev_err_avg)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err_avg * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    
    return model, avgModel

if __name__ == "__main__":
    model, avgModel = train('train.txt', 'dev.txt', 10)
    sortedWeightVectors = sorted(avgModel, key=avgModel.get)
    print('len of model: {}'.format(len(model)))
    print('------------------------')
    print('Most positive features: {}'.format(sortedWeightVectors[:20]))
    print('------------------------')
    print('Most negative features: {}'.format(sortedWeightVectors[-20:]))

epoch 1, update 41.2%, dev 32.3%
epoch 2, update 31.5%, dev 30.8%
epoch 3, update 25.9%, dev 29.3%
epoch 4, update 22.5%, dev 28.9%
epoch 5, update 20.0%, dev 28.5%
epoch 6, update 17.8%, dev 28.1%
epoch 7, update 15.8%, dev 27.4%
epoch 8, update 15.4%, dev 27.7%
epoch 9, update 14.5%, dev 27.7%
epoch 10, update 12.4%, dev 27.3%
best dev err 27.3%, |w|=16744, time: 3.6 secs
len of model: 16744
------------------------
Most positive features: ['generic', 'harmless', 'bore', 'incoherent', 'period', 'base', 'pie', 'inane', 'sort', 'ingredients', 'devoid', 'routine', 'badly', '2002', 'seagal', 'shoot', 'wet', 'scattered', 'flat', 'ludicrous']
------------------------
Most negative features: ['uses', 'breath', 'runs', 'moved', 'rare', 'proves', 'refreshingly', 'lively', 'child', 'remarkable', 'wonderful', 'heard', '1920', 'entertain', 'carefully', 'dots', 'loved', 'smarter', 'triumph', 'am']
