In [0]:
import re
import numpy as np


def preprocess(original_file, input_file, label_file):
  """
  This function reads original training file
  return: input file with no spaces and label file of Boolean sequences
  """

  fi = open(input_file, 'w')
  fl = open(label_file, 'w')

  with open(original_file, 'r', encoding='utf8') as f:
    line = f.readline()
    while line:
      line_no_space = line.replace(' ', '')
      fi.write(line_no_space)
      line_of_bool_label = ''
      for i in range(len(line)-1):
        if line[i] is not ' ':
          if line[i+1] is ' ' or line[i+1] is '\n':
            line_of_bool_label += '1'
          else:
            line_of_bool_label += '0'
        else:
          pass
      line_of_bool_label += '\n'
      fl.write(line_of_bool_label)
      line = f.readline()
  fi.close()
  fl.close()

preprocess('msr_test_gold.utf8', 'test_input', 'test_label')
preprocess('msr_training.utf8', 'train_input', 'train_label')

In [0]:
import torch
import torch.nn as nn
from random import shuffle
import numpy as np
use_cuda = torch.cuda.is_available()
print(torch.version.__version__, use_cuda)
###############################################################################
def cuda(arr):
    if use_cuda:
        return arr.cuda()
    return arr

###############################################################################
# model paras
###############################################################################
char_embed_size = 30

rnn_size = 300
# dimension of hidden layer
rnn_nLayers = 2

RNN_layers = [rnn_size, rnn_nLayers]

# feed-forward layers
layer0 = 128
layer1 = 64
layer2 = 128
FFNN_layers = [layer0, layer1, layer2]

dropout = 0.1

class RNN(nn.Module):
  def __init__(self, specs):
    super(RNN, self).__init__()

    nChars, embed_size, rnn_layers, ffnn_layers, dropout = specs
    self.CharEmbed = nn.Embedding(nChars, embed_size)

    rnn_size, rnn_nLayers = rnn_layers
    # self.rnn = nn.GRU(embed_size, rnn_size, rnn_nLayers, dropout=dropout, batch_first = True)
    self.rnn = nn.LSTM(embed_size, rnn_size, rnn_nLayers, bidirectional=True, dropout=dropout, batch_first = True)

    self.layers = nn.ModuleList([])
    prev_size = rnn_size * 2
    for i, layer_size in enumerate(ffnn_layers):
        layer = nn.Linear(prev_size, layer_size)
        self.layers.append(layer)
        prev_size = layer_size

    self.out = nn.Linear(prev_size, 2) # map to 2 classes

    self.non_linear = nn.LeakyReLU(negative_slope=0.01)
    
    self.dropout = nn.Dropout(dropout)

    for p in self.parameters(): # optionally apply different randomization
        if p.dim() > 1:
            nn.init.kaiming_normal_(p)
            pass

  def forward(self, seqs, hidden=None):
    # if seqs is a list of tensors already stored in cuda:
    # nBatch = len(seqs)
    # nChars = len(seqs[0])

    # seqs = torch.cat(seqs).view(nBatch, nChars)

    # if seqs is already tensor of shape[nBatch, nChars] on cuda, start here:
    embed = self.CharEmbed(seqs)

    prev, hidden = self.rnn(embed, hidden)

    for layer in self.layers:
        prev = layer(prev)
        prev = self.non_linear(prev)
        prev = self.dropout(prev)

    out = self.out(prev) # chars
    # print(out.size())
    #hidden = torch.transpose(hidden, 0, 1)
    return out, hidden

    

1.4.0 True


In [0]:
###############################################################################
# these values are used within the training code
###############################################################################
global learning_rate
global batch_size
global chunk_size
global nEpochs
global L2_lambda

learning_rate = 0.0001
learning_rate = 0.0003

batch_size = 5
batch_size = 100

chunk_size = 50
chunk_size = 100

#nEpochs = 1
nEpochs = 10

L2_lambda = 0.0

def RNN_train(model, optimizer, criterion, input_chunk, target_chunk, update=True):
  model.zero_grad()
  loss = 0
  nFrames = 0

  out, hidden = model(input_chunk)
  
  nBatch = len(input_chunk)
  for i in range(nBatch):
    loss += criterion(out[i][:,:], target_chunk[i][:])
  
  if update:
    if not loss is 0:
      loss.backward()
      optimizer.step()
  
  return loss.data.item()

"""
the train_rnn_model function make batchs that can be directly feed into model: 
tensor of shape[nBatch, nChars]; nBatch = batch_size; nChars = seq_len
Input: list of training sequences, Shape[N_lines, len(i_line)],
       \chars represented by int
do: randomly choose some lines in the train set to make a batch
making batch_in: pad all lines(with period?) to the batch max_len, convert chunk 
                \into a tensor stored on cuda;
making batch_tar: pad all lines(with 1?) to the batch max_len, convert chunk into
                \tensor on cuda
"""
def train_rnn_model(model, train_input, train_target):
  if use_cuda:
    model = model.cuda()
  
  criterion = nn.CrossEntropyLoss(reduction='sum')
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=L2_lambda)

  train_list  = [i for i in range(len(train_input))]    # list of indexes of training data

  train_loss = 0
  trained_chars = 0
  input_pad = converter.word_to_int('。')
  tar_pad = 1

  model.train()
  for epoch in range(nEpochs):
    shuffle(train_list)
    ith_batch = 0
    for i in range(0, len(train_list), batch_size):
      ith_batch += 1
      batch_ndx = train_list[i:i+batch_size]
      batch_in = []
      batch_tar = []
      l_max = 0 # max len sentence of this batch
      for n in batch_ndx:
        trained_chars += len(train_input[n])
        l_max = max(l_max, len(train_input[n]))
        batch_in.append(train_input[n])
        batch_tar.append(train_target[n])
      for n in range(len(batch_ndx)):
        for x in range(l_max-len(batch_in[n])):
          batch_in[n].append(input_pad)
          batch_tar[n].append(tar_pad)
      # now batch_in and batch_tar is shape[batch_size, l_max]
      batch_in = torch.tensor(batch_in)
      batch_in = cuda(batch_in)
      batch_tar = torch.tensor(batch_tar)
      batch_tar = cuda(batch_tar)

      loss = RNN_train(model, optimizer, criterion, batch_in, batch_tar)

      train_loss += loss
      if ith_batch % 20 == 0:
        print("%2d %6d %8.3f" % (epoch, ith_batch*batch_size, train_loss/trained_chars))
    torch.save(model, "model.pth")


In [0]:
class TextConverter:
  """
  build vocabulary(not include space, no <unk>) from all text(train and test)
  create dictionary of {char:index}
  and convert characters to index integers
  """
  def __init__(self, text_path1, text_path2 = None):
    vocab = {}
    with open(text_path1, 'r') as f:
      text = f.read()
      if text_path2 is not None:
        with open(text_path2, 'r') as f2:
          text2 = f2.read()
          text = text + text2
    
    ndx = 0
    for c in text:
      if c not in vocab and c is not ' ' and c is not '\n':
        vocab[c] = ndx
        ndx += 1

    self.vocab = vocab

    self.vocab_size = len(vocab)

    self.vocab_list = [0 for i in range(self.vocab_size)]
    for c in vocab:
      self.vocab_list[vocab[c]] = c
  
  def word_to_int(self, word):
    return self.vocab[word]
  
  def int_to_word(self, index):
    return self.vocab_list[index]
  

In [0]:
#### main ####
global converter
converter = TextConverter('train_input', 'test_input')
with open('train_input', 'r') as f:
  corpus = f.readlines()
with open('train_label', 'r') as f:
  corpus_target = f.readlines()
for i in range(len(corpus)):
  corpus[i] = corpus[i].replace('\n','')
  # corpus = ['扬帆远东做与中国合作的先行', '希腊的经济结构较特殊。', ...]
  corpus_target[i] = corpus_target[i].replace('\n', '')
  # corpus_target = ['0101110101101', '01101011011', ...]

"""
the main part of program then have two tasks:
1. initialize neural network model for the train_rnn func
1. generate train_input and train_output for the train_rnn func
  \ just convert the corpus to list of list of int(index of each char)
  \ convert the corpus_target to list of int(1) and int(0)
"""

train_input = []
for line in corpus:
  conv_line = []
  for c in line:
    conv_line.append(converter.word_to_int(c))
  train_input.append(conv_line)
print(train_input[:100])

train_tar = []
for line in corpus_target:
  conv_line = []
  for c in line:
    conv_line.append(int(c))
  train_tar.append(conv_line)
print(train_tar[:100])

nChars = len(converter.vocab_list)
specs = [nChars, char_embed_size, RNN_layers, FFNN_layers, dropout]
model = RNN(specs)
print(model)

train_rnn_model(model, train_input, train_tar)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 7, 22, 23, 24, 25, 18, 10, 11, 12, 13, 26, 27, 28, 7, 29, 30, 31, 28, 18, 32, 33, 18, 34, 35, 36, 37], [0, 38, 39, 40, 41, 42, 43, 44, 13, 45, 46, 47, 48, 49, 5, 50, 37], [0, 51, 52, 18, 53, 54, 13, 55, 8, 56, 57, 58, 59, 22, 60, 61, 62], [63, 0, 64, 65, 66, 67, 68, 69, 13, 70, 8, 71, 11, 72, 73, 74, 18, 75, 76, 77, 78, 8, 71, 79, 34, 18, 80, 81, 81, 82, 83, 13, 75, 14, 77, 78, 8, 84, 85, 73, 18, 86, 87, 88, 89, 81, 82, 19, 76, 13, 90, 74, 91, 92, 93, 18, 88, 89, 81, 82, 94, 95, 37], [0, 96, 14, 97, 98, 18, 99, 100, 101, 102, 103, 104, 18, 105, 100, 13, 22, 106, 107, 108, 37], [0, 109, 110, 111, 7, 71, 80, 112, 113, 114, 13, 80, 112, 18, 115, 116, 113, 114, 37], [36, 33, 117, 118, 119, 120, 121, 122, 123, 13, 63, 8, 124, 125, 73, 126, 124, 127, 37], [0, 128, 129, 111, 130, 131, 132, 18, 133, 102, 134, 135, 136, 13, 137, 22, 7, 88, 89, 81, 82, 133, 138, 139, 18, 140, 96, 37], [0, 109, 71, 141, 125, 70, 142, 

  "type " + obj.__name__ + ". It won't be checked "


 1   2000    0.356
 1   4000    0.334
 1   6000    0.315
 1   8000    0.297
 1  10000    0.283
 1  12000    0.270
 1  14000    0.258
 1  16000    0.248
 1  18000    0.238
 1  20000    0.230
 1  22000    0.222
 1  24000    0.215
 1  26000    0.208
 1  28000    0.202
 1  30000    0.197
 1  32000    0.191
 1  34000    0.187
 1  36000    0.182
 1  38000    0.178
 1  40000    0.174
 1  42000    0.170
 1  44000    0.166
 1  46000    0.163
 1  48000    0.160
 1  50000    0.157
 1  52000    0.154
 1  54000    0.151
 1  56000    0.149
 1  58000    0.146
 1  60000    0.144
 1  62000    0.142
 1  64000    0.140
 1  66000    0.138
 1  68000    0.136
 1  70000    0.134
 1  72000    0.132
 1  74000    0.130
 1  76000    0.129
 1  78000    0.127
 1  80000    0.125
 1  82000    0.124
 1  84000    0.123
 1  86000    0.121
 2   2000    0.115
 2   4000    0.110
 2   6000    0.105
 2   8000    0.101
 2  10000    0.097
 2  12000    0.093
 2  14000    0.090
 2  16000    0.087
 2  18000    0.084
 2  20000   

In [0]:

# model = torch.load("model.pth")

with open('test_input', 'r') as f:
  corpus = f.readlines()
with open('test_label', 'r') as f:
  corpus_target = f.readlines()
for i in range(len(corpus)):
  corpus[i] = corpus[i].replace('\n','')
  # corpus = ['扬帆远东做与中国合作的先行', '希腊的经济结构较特殊。', ...]
  corpus_target[i] = corpus_target[i].replace('\n', '')

test_input = []
for line in corpus:
  conv_line = []
  for c in line:
    conv_line.append(converter.word_to_int(c))
  test_input.append(conv_line)
print(test_input[:100])

test_tar = []
for line in corpus_target:
  conv_line = []
  for c in line:
    conv_line.append(int(c))
  test_tar.append(conv_line)
print(test_tar[:100])

model.eval()
def out_to_class(out):
  m = nn.Softmax(2)
  a = m(out).cpu()
  a= a.detach().numpy()
  return np.argmax(a,axis=-1)[0]

err_count  =0 
total_count = 0
for i,test_text in enumerate(test_input):
  data = torch.tensor([test_text]) 
  data = cuda(data)
  out, ht = model(data)
  answer = out_to_class(out)
  l = len(answer)
  total_count += l
  local =0
  for j in range(l):
    
    if test_tar[i][j] != answer[j]:
      local+=1
      err_count +=  1
      if local/l>0.2:
        print (corpus[i] )
        print( [ (corpus[i][k], answer[k])  if answer[k]!= test_tar[i][k] else answer[k]  for k in range(l) ] )
        
        break
      

    

print(err_count)
print(total_count, err_count/total_count)
  




[[565, 2442, 558, 53, 160, 16, 502, 325, 677, 181, 18, 111, 342], [291, 2833, 18, 221, 578, 498, 505, 272, 140, 1186, 37], [44, 419, 95, 1233, 3003, 518, 418, 347, 110, 13, 1116, 1939, 84, 1013, 746, 295, 296, 349, 567, 18, 1927, 1925, 1926, 37], [196, 87, 873, 874, 155, 1537, 2231, 58, 7, 221, 578, 103, 1120, 18, 398, 66, 1073, 77, 9, 328, 13, 134, 653, 95, 987, 1118, 449, 373, 272, 271, 37], [24, 475, 185, 13, 502, 291, 1203, 465, 142, 850, 462, 91, 272, 516, 18, 490, 378, 13, 291, 2833, 484, 1100, 146, 102, 132, 502, 325, 204, 265, 37], [939, 484, 475, 185, 13, 400, 401, 311, 496, 18, 502, 325, 221, 578, 93, 1747, 336, 402, 13, 558, 53, 132, 2635, 209, 37], [1923, 54, 235, 2130, 18, 1443, 158, 502, 102, 1924, 1918, 1926, 686, 294, 558, 53, 13, 47, 71, 215, 484, 1100, 145, 102, 119, 1243, 167, 1443, 521, 471, 502, 325, 1949, 684, 37], [63, 115, 610, 228, 118, 502, 325, 221, 578, 336, 402, 18, 34, 1910, 37], [63, 66, 16, 502, 325, 1, 677, 181, 37], [63, 185, 228, 502, 325, 13, 77, 78,

## Chinese Segmenter
---
1. Upload the given training and test corpus into the Colab notebook and run all the cells one by one gives us the result.
2. The output from this neural net is a '1' and '0' representation of either each character has a space behind it.
3. The loss function is nn.CrossEntropyLoss, which measures the accuracy of prediction.
4. BiDirectional_RNN works better for this task. We need to see both before and after each character to know either to put a segmentor behind it or not.
5. The result from my model after 10 epochs is 96.9%.
6. May or may not help if we add traditional Chinese character data into the corpus. It may enlarge the different character list and cause extra learning load for the network, while the simplified counterpart of a traditional character may offer more information for a character. The embedding for traditional characters will probably be close to its simplified counterpart, for they often mean the same thing in context.

## Non-coding Problems - 1
1. Model 1, CNN classifier:  kernel-size=3, six convolutional layers of size: 48, 48, 96, 96, 192, 192, with 2 hidden layers of size 512 and 256; Number of parameters is 3 * 3 * (3 * 48+ 48 * 48+ 48 * 96+ 96 * 96 + 96 * 192+192 * 192) + 4 * 4 * 192 * 512 + 512 * 256 + 256 * 10 = 2,350,608
<p> Model 2, VGG16: 15,106,240
<p> Model 3, Auto-Encoder: 3 * 3 *( 3 * 3 + 3 * 3) * 2 = 324
2. Model 1, Char-Generator: take the GRU case for example, (3 * hidden_size * input_size + 3 * hidden_size * hidden_size) * layer_1, (3 * hidden_size * hidden_size + 3 * hidden_size * hidden_size) * layer_1, embedding 3601 * 300, plus fully connected layers, 1,460,742 in total.
<p> Model 2, Chinese-segmentor: Embed(5180 * 30) + biLSTM(input_size=30, hidden_size=300, layers=2) + FFNN(600 * 128, 128 * 64, 64 * 128, 128 * 2); which is 1,040,840 in total.



In [1]:
3 * 3 * (3 * 48+ 48 * 48+ 48 * 96+ 96 * 96 + 96 * 192+192 * 192) + 4 * 4 * 192 * 512 + 512 * 256 + 256 * 10

2350608

In [2]:
3 * 3 *( 3 * 64 + 64 * 64 + 64 * 128 + 128 * 128 + 128 * 256 + 256 * 256 + 256 * 256 + 256 * 512 + 512 * 512 * 5 ) + 512 * 512 + 512 * 256 + 256 * 10 

15106240

In [4]:
3601*30+(3*300*30+3*300*300+3*300*300*2)+300*128+128*64+64*128+128*3601

1460742

In [5]:
5180*30+(4*300*30+4*300*300)*2+600*128+128*64+64*128+128*2

1040840

## Non-coding problems-2
1. char-level model vs. word-level model:
<p> pros: smaller model, freqency of each char is bigger
<p> cons: longer to train, less meaning
2. RNN vs. CNN :
<p> pros: better for understanding sequential data
<p> cons: deep and not able to train in parallel
3. Make use of past and futrure info:
<p> using bi-directional RNN archetecture. It is useful when we need to predict the feature that depends not only on previous info but also on the latter one -- depends on the whole seq.
4. Difference between GRU and LSTM:
<p> GRU has one less forget gate than LSTM, when we have unlimited amount of data and training resource, we should always go for LSTM.
5. Large size of text input will make the RNN structure too deep and the gradients will explode or vanish.