In [71]:
%reset -f

import sys
import time
from svector import svector
import operator
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
import re
# import string

def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
#         words = words.translate(str.maketrans('','',string.punctuation))
        words = re.sub(r' \'s','s',words)
#         words =  re.sub(r'[\*&$\-(),!;"?+]','',words)
#         print(words)
        yield (1 if label=="+" else -1, words.split())

def make_vector(words):
    v = svector()
    for word in words:
        v[word] += 1
    v['<bias>'] = -1
    return v
    
def test(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(devfile, 1): # note 1...|D|
        err += label * (model.dot(make_vector(words))) <= 0
    return err/i  # i is |D| now
            
def train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    wordDict, oneWordCountSet = defaultdict(int), set()
    model = svector()
    avgModel = svector()
    c=0

    for label,words in trainfile:
        for word in words:
            wordDict[word] += 1
    
    for word in wordDict:
#         if wordDict[word] == 1 or wordDict[word]==2: 
        if wordDict[word] == 1: 
            oneWordCountSet.add(word)
                        
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(trainfile, 1): # label is +1 or -1
            words = [word for word in words if word not in oneWordCountSet and word not in stopWords]
#             print(words)
#             words = [word for word in words if word not in oneWordCountSet]
#             words += list(zip(words[:-1],words[1:]))
            sent = make_vector(words)
            
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                avgModel += c*label*sent
            c += 1
            
#         dev_err = test(devfile, model)
        dev_err_avg = test(devfile, c*model - avgModel)
        
        best_err = min(best_err, dev_err_avg)
#         best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err_avg * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    
    return model, avgModel

if __name__ == "__main__":
    trainfile = list(read_from('train.txt'))
    devfile = list(read_from('dev.txt'))
    stopWords = stopwords.words('english')
    
    model, avgModel = train(trainfile, devfile, 10)
    sortedWeightVectors = sorted(avgModel, key=avgModel.get)
    print('len of model: {}'.format(len(model)))
    print('------------------------')
    print('Most positive features: {}'.format(sortedWeightVectors[:20]))
    print('------------------------')
    print('Most negative features: {}'.format(sortedWeightVectors[-20:]))

epoch 1, update 37.9%, dev 30.2%
epoch 2, update 25.1%, dev 28.3%
epoch 3, update 19.2%, dev 26.9%
epoch 4, update 15.2%, dev 26.0%
epoch 5, update 13.8%, dev 25.8%
epoch 6, update 12.6%, dev 25.9%
epoch 7, update 10.2%, dev 25.5%
epoch 8, update 8.8%, dev 25.3%
epoch 9, update 8.7%, dev 25.9%
epoch 10, update 7.4%, dev 25.9%
best dev err 25.3%, |w|=8389, time: 4.3 secs
len of model: 8389
------------------------
Most positive features: ['base', 'fanatics', 'undermines', 'tension', 'textbook', 'dramas', 'angel', 'copy', 'deal', 'hills', 'sucks', 'edition', 'bland', 'birds', 'claims', 'apparent', 'situation', 'player', '1982', 'learn']
------------------------
Most negative features: ['dancing', 'growing', 'earns', 'flawless', 'sleeper', 'nail', 'bam', 'reflect', 'otherwise', 'empire', 'near', 'direct', 'widget', 'provides', 'smarter', 'roaring', 'ode', 'nicky', 'liberating', 'dared']
