In [1]:
import random
import collections
import math
import sys
from util import readExamples, evaluatePredictor
from model import extractFeatures, learnPredictor, plot_loss
import unicodedata
import re
from sklearn.model_selection import train_test_split


In [2]:
def unicode_to_ascii(s):
    """Remove combining marks (accents) from `s`.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' (nonspacing mark) is discarded. Characters that
    carry no combining mark are kept as-is, so the result is not guaranteed
    to be pure ASCII despite the function's name.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


In [3]:
def normalize_string(s):
    """Lowercase, de-accent and tokenization-friendly clean-up of `s`.

    Steps, in order:
      1. lowercase and strip surrounding whitespace, then drop combining
         marks via `unicode_to_ascii`;
      2. insert a space in front of every '.', '!' or '?';
      3. replace each run of characters outside the allowed set (ASCII
         letters, code points U+4E00-U+9FA5, '.', '!', '?', and the
         full-width '，', '。', '？') with a single space.

    Digits and whitespace are outside the allowed set, so they are turned
    into separators. The result is not stripped again, so it can begin or
    end with a space.
    """
    punctuation = re.compile(r"([.!?])")
    disallowed_run = re.compile(r"[^a-zA-Z\u4e00-\u9fa5.!?，。？]+")

    lowered = s.lower().strip()
    cleaned = unicode_to_ascii(lowered)
    # Detach sentence punctuation so it becomes a separate token.
    cleaned = punctuation.sub(r" \1", cleaned)
    # Everything not explicitly allowed collapses into one space per run.
    return disallowed_run.sub(r" ", cleaned)


In [4]:
def TestModel(numIters, eta, reg, mode, train_data, test_data, train_label, test_label):
    """Train one predictor, plot its curves and report train/test error.

    Args:
        numIters, eta, reg: forwarded to `learnPredictor` as the keyword
            arguments of the same names.
        mode: feature-mode label; used here only when calling `plot_loss`.
        train_data, test_data: already-extracted features for each split.
        train_label, test_label: labels matching the two feature sets.

    Returns:
        (weight, bias, testError): the first two values returned by
        `learnPredictor` and the test error from `evaluatePredictor`.
    """
    fit_result = learnPredictor(
        train_data, test_data, train_label, test_label,
        numIters=numIters, eta=eta, reg=reg,
    )
    weight, bias, training_loss, test_error_list = fit_result

    # One plot per history returned by the learner, train first.
    for history, split_name in ((training_loss, 'train'), (test_error_list, 'test')):
        plot_loss(history, split_name, mode, eta)

    # Final errors of the learned parameters on both splits.
    trainError = evaluatePredictor(train_data, train_label, weight, bias)
    testError = evaluatePredictor(test_data, test_label, weight, bias)

    summary = "training error = %s, test error = %s" % (trainError, testError)
    print(summary)
    return weight, bias, testError

In [5]:

def _normalize_examples(examples):
    """Return a list of (normalized_text, label) pairs for `examples`.

    Each example is indexed as example[0] (text) and example[1] (label);
    the text goes through `normalize_string`, the label is kept unchanged.
    """
    return [(normalize_string(example[0]), example[1]) for example in examples]


# Load both splits and apply the same text normalization to each.
train_set = readExamples('data/data_rt.train')
train_new = _normalize_examples(train_set)
train_corpus = [example[0] for example in train_new]
train_label = [example[1] for example in train_new]

test_set = readExamples('data/data_rt.test')
test_new = _normalize_examples(test_set)
test_corpus = [example[0] for example in test_new]
test_label = [example[1] for example in test_new]

# Features are extracted over train + test in a single call and split back
# by position, so the number of training rows is needed for the slice.
train_num = len(train_corpus)

# 'Glove' is currently left out of the sweep.
mode_list = ['BOW', 'Bigram', 'Trigram', 'Combo', 'Word2Vec']

# Hyperparameter grid.
iters_list = [200000]
learning_rate = [0.02, 0.1, 0.5]
reg_list = [0.0]

# Placeholders for model selection. Nothing below updates them yet.
# NOTE(review): the previously disabled selection logic compared a validation
# error (`valError`) that this cell never computes. Selecting on the test
# error would bias the reported numbers, so hold out a validation split
# before enabling best-model tracking.
min_error = 1.0
best_w = None
best_b = None
best_combo = None

# The former up-front extractFeatures(..., mode_list[1]) call was removed:
# its results were overwritten by the first loop iteration before any use.
# NOTE(review): extraction depends only on `mode`, so it is repeated for every
# (iters, lr, reg) combination; cache per mode if memory allows.
for iters in iters_list:
    for lr in learning_rate:
        for reg in reg_list:
            for mode in mode_list:
                combo_embed = extractFeatures(train_corpus + test_corpus, mode)
                train_embed = combo_embed[:train_num]
                test_embed = combo_embed[train_num:]
                w, b, testError = TestModel(
                    iters, lr, reg, mode,
                    train_embed, test_embed, train_label, test_label,
                )
                print('Test error of {} is: {}'.format(mode, testError))

training error at 0 is: 0.5014068655036579
test error at 0 is: 0.4935284186831739
current test loss is: 1.0000020233600262
training loss at 5000 is: 0.9663863544076908
training error at 5000 is: 0.19048958919527292
test error at 5000 is: 0.28221722003376476
current test loss is: 0.9578710095939273
training loss at 10000 is: 0.8939231004747782
training error at 10000 is: 0.18176702307259426
test error at 10000 is: 0.2765897580191334
current test loss is: 0.9145755229170989
training loss at 15000 is: 0.8246045181068161
training error at 15000 is: 0.18036015756893642
test error at 15000 is: 0.2732132808103545
current test loss is: 0.8738965102764824
training loss at 20000 is: 0.7474506942686021
training error at 20000 is: 0.17613956105796286
test error at 20000 is: 0.2723691615081598
current test loss is: 0.8335480120677965
training loss at 25000 is: 0.6814327747347662
training error at 25000 is: 0.16966797974113676
test error at 25000 is: 0.2678671918964547
current test loss is: 0.798577

training loss at 45000 is: 0.7663786398006479
training error at 45000 is: 0.009285312324141813
test error at 45000 is: 0.48086662915025324
current test loss is: 0.998956410393301
training loss at 50000 is: 0.7362376408369309
training error at 50000 is: 0.009285312324141813
test error at 50000 is: 0.48086662915025324
current test loss is: 0.9988323632354853
training loss at 55000 is: 0.7112268757591751
training error at 55000 is: 0.009285312324141813
test error at 55000 is: 0.4805852560495217
current test loss is: 0.9987362377963689
training loss at 60000 is: 0.6824759191902704
training error at 60000 is: 0.009285312324141813
test error at 60000 is: 0.48086662915025324
current test loss is: 0.9986140729579082
training loss at 65000 is: 0.652930139064929
training error at 65000 is: 0.009285312324141813
test error at 65000 is: 0.48086662915025324
current test loss is: 0.998517030480121
training loss at 70000 is: 0.6286983546000426
training error at 70000 is: 0.009285312324141813
test erro

training loss at 40000 is: 0.20581212212055117
training error at 40000 is: 0.056555993247045584
test error at 40000 is: 0.26195835678109175
current test loss is: 0.6222999596493537
training loss at 45000 is: 0.18804707166617007
training error at 45000 is: 0.050928531232414184
test error at 45000 is: 0.26167698368036013
training error = 0.050928531232414184, test error = 0.26167698368036013
Test error of BOW is: 0.26167698368036013
training error at 0 is: 0.49774901519414744
test error at 0 is: 0.4926842993809792
current test loss is: 1.0
training loss at 5000 is: 0.9277262145299233
training error at 5000 is: 0.10354530106921778
test error at 5000 is: 0.41840180078784467
current test loss is: 0.9917793887532202
training loss at 10000 is: 0.7746720753218456
training error at 10000 is: 0.030951041080472707
test error at 10000 is: 0.415588069780529
current test loss is: 0.9839351008682391
training loss at 15000 is: 0.6320433243043775
training error at 15000 is: 0.009285312324141813
test er

training loss at 25000 is: 0.3863421546372628
training error at 25000 is: 0.009566685424873381
test error at 25000 is: 0.48086662915025324
current test loss is: 0.9973170739221988
training loss at 30000 is: 0.27554057967293927
training error at 30000 is: 0.009285312324141813
test error at 30000 is: 0.48086662915025324
current test loss is: 0.9969894209399903
training loss at 35000 is: 0.18631656902657467
training error at 35000 is: 0.009285312324141813
test error at 35000 is: 0.48086662915025324
current test loss is: 0.996612459056943
training loss at 40000 is: 0.12386161753878218
training error at 40000 is: 0.009285312324141813
test error at 40000 is: 0.4803038829487901
current test loss is: 0.9964584640508767
training loss at 45000 is: 0.08202174016796199
training error at 45000 is: 0.009285312324141813
test error at 45000 is: 0.4805852560495217
current test loss is: 0.9962593554439111
training loss at 50000 is: 0.05444467675868169
training error at 50000 is: 0.009285312324141813
tes

training loss at 60000 is: 0.012119643220588462
training error at 60000 is: 0.0002813731007315701
test error at 60000 is: 0.24957794034890265
current test loss is: 0.779074869532214
training loss at 65000 is: 0.008648979096324215
training error at 65000 is: 0.0002813731007315701
test error at 65000 is: 0.24929656724817106
current test loss is: 0.7791981731026153
training loss at 70000 is: 0.005374691653744922
training error at 70000 is: 0.0
test error at 70000 is: 0.2518289251547552
current test loss is: 0.7791421268917766
training loss at 75000 is: 0.004419438475118238
training error at 75000 is: 0.0
test error at 75000 is: 0.25239167135621837
training error = 0.0, test error = 0.25239167135621837
Test error of Combo is: 0.25239167135621837
training error at 0 is: 0.498030388294879
test error at 0 is: 0.4926842993809792
current test loss is: 0.9957639016367903
training loss at 5000 is: 1.0446304499803452
training error at 5000 is: 0.5016882386043894
test error at 5000 is: 0.5073157006

training loss at 95000 is: 0.0038
training error at 95000 is: 0.0011254924029262803
test error at 95000 is: 0.4034890264490715
current test loss is: 0.9426233421874873
training loss at 100000 is: 0.0024
training error at 100000 is: 0.0011254924029262803
test error at 100000 is: 0.4034890264490715
current test loss is: 0.9426233421874873
training loss at 105000 is: 0.0048
training error at 105000 is: 0.0011254924029262803
test error at 105000 is: 0.4034890264490715
current test loss is: 0.9426233421874873
training loss at 110000 is: 0.0034
training error at 110000 is: 0.0011254924029262803
test error at 110000 is: 0.4034890264490715
current test loss is: 0.9426233421874873
training loss at 115000 is: 0.0038
training error at 115000 is: 0.0011254924029262803
test error at 115000 is: 0.4034890264490715
current test loss is: 0.9426233421874873
training loss at 120000 is: 0.0022
training error at 120000 is: 0.0011254924029262803
test error at 120000 is: 0.4034890264490715
current test loss 

training loss at 135000 is: 0.0216
training error at 135000 is: 0.009285312324141813
test error at 135000 is: 0.47974113674732694
current test loss is: 0.9953310408579308
training loss at 140000 is: 0.021
training error at 140000 is: 0.009285312324141813
test error at 140000 is: 0.47974113674732694
current test loss is: 0.9953310408579308
training loss at 145000 is: 0.0224
training error at 145000 is: 0.009285312324141813
test error at 145000 is: 0.47974113674732694
current test loss is: 0.9953310408579308
training loss at 150000 is: 0.0212
training error at 150000 is: 0.009285312324141813
test error at 150000 is: 0.47974113674732694
current test loss is: 0.9953310408579308
training loss at 155000 is: 0.0256
training error at 155000 is: 0.009285312324141813
test error at 155000 is: 0.47974113674732694
current test loss is: 0.9953310408579308
training loss at 160000 is: 0.022
training error at 160000 is: 0.009285312324141813
test error at 160000 is: 0.47974113674732694
current test loss

training error = 0.0, test error = 0.2515475520540236
Test error of Combo is: 0.2515475520540236
training error at 0 is: 0.498030388294879
test error at 0 is: 0.4926842993809792
current test loss is: 1.9215841201725035
training loss at 5000 is: 1.5024155655967946
training error at 5000 is: 0.5016882386043894
test error at 5000 is: 0.5073157006190209
current test loss is: 2.217436577008844
training loss at 10000 is: 1.5020401036817252
training error at 10000 is: 0.5014068655036579
test error at 10000 is: 0.5073157006190209
current test loss is: 1.7311502276747808
training loss at 15000 is: 1.508093760219434
training error at 15000 is: 0.498030388294879
test error at 15000 is: 0.4926842993809792
current test loss is: 1.9136551631768761
training loss at 20000 is: 1.488958456092407
training error at 20000 is: 0.4395047833427124
test error at 20000 is: 0.43528418683173886
current test loss is: 0.9230325328220443
training loss at 25000 is: 1.4677562613963648
training error at 25000 is: 0.498