In [1]:
import random
import collections
import math
import sys
from util import readExamples, evaluatePredictor
from model import extractFeatures, learnPredictor, plot_loss
import unicodedata
import re
from sklearn.model_selection import train_test_split


In [2]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


In [3]:
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z\u4e00-\u9fa5.!?，。？]+", r" ", s)
    return s


In [4]:
def TestModel(numIters, eta, reg, mode, train_data, test_data, train_label, test_label):
    #train_set = readExamples('data/data_rt.train')
    #train_new = []
    #for example in train_set:
        #text = normalize_string(example[0])
        #label = example[1]
        #train_new.append((text, label))

    
    #train_corpus = [example[0] for example in train_new]
    
    #train_label = [example[1] for example in train_new]
   
    #train_embed = extractFeatures(train_corpus, mode)
    
    #train_data, val_data, train_l, val_l = train_test_split(train_embed, train_label, test_size = 0.1, random_state = 31)
    weight, bias, training_loss, test_error_list = learnPredictor(train_data, test_data, train_label, test_label, numIters=numIters, eta=eta, reg = reg)
    
    plot_loss(training_loss, 'train', mode, eta)
    plot_loss(test_error_list, 'test', mode, eta)

    trainError = evaluatePredictor(train_data, train_label, weight, bias)
    testError = evaluatePredictor(test_data, test_label, weight, bias)
    
    print ("training error = %s, test error = %s" % (trainError, testError))
    return weight, bias, testError

In [5]:

train_set = readExamples('data/data_rt.train')
train_new = []
for example in train_set:
    text = normalize_string(example[0])
    label = example[1]
    train_new.append((text, label))

train_corpus = [example[0] for example in train_new]
train_label = [example[1] for example in train_new]
    
test_set = readExamples('data/data_rt.test')
test_new = []
for example in test_set:
    text = normalize_string(example[0])
    label = example[1]
    test_new.append((text, label))
    
test_corpus = [example[0] for example in test_new]
test_label = [example[1] for example in test_new]
    
train_num = len(train_corpus)
#mode_list = ['BOW', 'Bigram', 'Trigram', 'Combo', 'Word2Vec', 'Glove']
mode_list = ['BOW', 'Bigram', 'Trigram', 'Combo', 'Word2Vec']
combo_embed = extractFeatures(train_corpus + test_corpus, mode_list[1])
train_embed = combo_embed[:train_num]
test_embed = combo_embed[train_num:]
    #train_data, val_data, train_l, val_l = train_test_split(train_embed, train_label, test_size = 0.1, random_state = 31)

min_error = 1.0
iters_list = [200000]
learning_rate = [0.02, 0.1]
reg_list = [0.0]
best_w = None
best_b = None
best_combo = None
for iters in iters_list:
    for lr in learning_rate:
        for reg in reg_list:
            for mode in mode_list:
                combo_embed = extractFeatures(train_corpus + test_corpus, mode)
                train_embed = combo_embed[:train_num]
                test_embed = combo_embed[train_num:]
                w, b, testError = TestModel(iters, lr, reg, mode, train_embed, test_embed, train_label, test_label)
                print('Test error of {} is: {}'.format(mode, testError))
                    
                #if valError < min_error:
                    #best_w = w
                    #best_b = b
                    #best_combo = [iters, lr, reg]
#print('Best hyper combo is: ')
#print(best_combo)

#testError = evaluatePredictor(test_embed, test_label, best_w, best_b)
#print('Test error is: {}'.format(testError))

training error at 0 is: 0.498030388294879
test error at 0 is: 0.4926842993809792
current test loss is: 1.0000184732510105
training loss at 5000 is: 0.9647835402087905
training error at 5000 is: 0.18908272369161508
test error at 5000 is: 0.29487900956668545
current test loss is: 0.958157189905375
training loss at 10000 is: 0.8910441937043544
training error at 10000 is: 0.18007878446820483
test error at 10000 is: 0.2763083849184018
current test loss is: 0.9146462344369057
training loss at 15000 is: 0.825406851486149
training error at 15000 is: 0.17332583005064717
test error at 15000 is: 0.2734946539110861
current test loss is: 0.8726463865197506
training loss at 20000 is: 0.7375710989101162
training error at 20000 is: 0.17248171074845245
test error at 20000 is: 0.27152504220596513
current test loss is: 0.8315577351555264
training loss at 25000 is: 0.6857052915732115
training error at 25000 is: 0.1671356218345526
test error at 25000 is: 0.26814856499718626
current test loss is: 0.79845706

training loss at 60000 is: 0.683331031828566
training error at 60000 is: 0.009285312324141813
test error at 60000 is: 0.47945976364659537
current test loss is: 0.9985799859394887
training loss at 65000 is: 0.6563936201625825
training error at 65000 is: 0.009285312324141813
test error at 65000 is: 0.47945976364659537
current test loss is: 0.9984979845939056
training loss at 70000 is: 0.6267991057900906
training error at 70000 is: 0.009285312324141813
test error at 70000 is: 0.47945976364659537
current test loss is: 0.9984109615748274
training loss at 75000 is: 0.6003520035963505
training error at 75000 is: 0.009285312324141813
test error at 75000 is: 0.4800225098480585
current test loss is: 0.9982840515887953
training loss at 80000 is: 0.568982564261847
training error at 80000 is: 0.009285312324141813
test error at 80000 is: 0.47974113674732694
current test loss is: 0.9981619823679959
training loss at 85000 is: 0.5429031631206188
training error at 85000 is: 0.009285312324141813
test err

training loss at 30000 is: 0.27054606880453397
training error at 30000 is: 0.07850309510410805
test error at 30000 is: 0.2543612830613393
current test loss is: 0.6277498605851939
training loss at 35000 is: 0.23690704175035904
training error at 35000 is: 0.06555993247045583
test error at 35000 is: 0.2537985368598762
current test loss is: 0.6222566461606024
training loss at 40000 is: 0.20865503895683793
training error at 40000 is: 0.05965109735509285
test error at 40000 is: 0.25239167135621837
current test loss is: 0.6213737669812701
training loss at 45000 is: 0.19564696018746894
training error at 45000 is: 0.050365785030951044
test error at 45000 is: 0.2585818795723129
current test loss is: 0.6204839784331896
training loss at 50000 is: 0.1670898687238448
training error at 50000 is: 0.04614518851997749
test error at 50000 is: 0.2597073719752392
current test loss is: 0.6184055868000949
training loss at 55000 is: 0.14842204413715915
training error at 55000 is: 0.04276871131119865
test erro

training loss at 35000 is: 0.13835484378812227
training error at 35000 is: 0.0036578503095104106
test error at 35000 is: 0.25520540236353406
current test loss is: 0.7817976925803636
training loss at 40000 is: 0.09309387205137411
training error at 40000 is: 0.0014068655036578502
test error at 40000 is: 0.25323579065841306
current test loss is: 0.7804174474873993
training loss at 45000 is: 0.058096503093152264
training error at 45000 is: 0.0008441193021947102
test error at 45000 is: 0.24985931344963422
current test loss is: 0.7793962426028587
training loss at 50000 is: 0.03540163164738038
training error at 50000 is: 0.0005627462014631402
test error at 50000 is: 0.24845244794597637
current test loss is: 0.7787325679013014
training loss at 55000 is: 0.025308803800998477
training error at 55000 is: 0.0005627462014631402
test error at 55000 is: 0.25070343275182894
current test loss is: 0.7789724712042697
training loss at 60000 is: 0.014790169170845429
training error at 60000 is: 0.0002813731