# Allowed libraries
- Tensorflow (compatible with 1.12.x)
- Numpy
- Sklearn
- nltk
- Maplotlib
- gensim
- All the standard libraries
 

https://medium.com/the-artificial-impostor/nlp-four-ways-to-tokenize-chinese-documents-f349eb6ba3c3

https://stanfordnlp.github.io/CoreNLP/download.html

In [None]:
import os
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
import tensorflow.keras as K
import numpy as np 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator

In [None]:
class Make_Feed(object):
    '''makes feed files of combined unigrams and bigrams'''
    def __init__(self):
        pass

In [None]:
datasets = {"training":'../icwb2-data/training',
             "dev":'../icwb2-data/gold',
             "testing":'../icwb2-data/testing'}

In [4]:
def get_file_names(path, type_='LabelFile'):
    x = []
    for i in os.listdir(path):
        if os.path.splitext(i)[0].split("_")[-1] == type_:
            x.append(os.path.join(path, i))
    return x

Label_files = get_file_names(path = datasets['training'], type_ = 'LabelFile')
Input_files = get_file_names(path = datasets['training'], type_ = 'InputFile')

In [5]:
names = ['msr','cityu','as','pku']
choose = lambda i: i.split(".utf8")[0].split('/')[-1].split("_")[0]

e, r = False, False
chosen = False
while not chosen:
    print("Choose from the following")
    print(names)
    x = input("")
    for i in range(4):
        print(choose(Input_files[i]))
        if choose(Input_files[i]) == x: 
            Input_file = Input_files[i]
            e = True
        if choose(Label_files[i]) == x:
            Label_file = Label_files[i]
            r = True
        if e and r:
            chosen = True
            print("CHOSEN {}".format(x))
    
    
print(Label_file, Input_file)

Choose from the following
['msr', 'cityu', 'as', 'pku']

msr
cityu
as
pku
Choose from the following
['msr', 'cityu', 'as', 'pku']

msr
cityu
as
pku
Choose from the following
['msr', 'cityu', 'as', 'pku']
cityu
msr
cityu
CHOSEN cityu
as
CHOSEN cityu
pku
CHOSEN cityu
../icwb2-data/training/cityu_training_simplified_LabelFile.utf8 ../icwb2-data/training/cityu_training_simplified_InputFile.utf8


In [6]:
from typing import Tuple, List, Dict

def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
    """
    :param sentence Sentence as str
    :type_: uni_grams or _bigrams
    :return bigrams List of unigrams or bigrams
    """
    n = 1 if type_ == 'uni_grams' else 2
    grams = []
    for i in range(len(sentence)-1):
        gram = sentence[i:i+n]
        grams.append(gram)
    return grams


In [7]:
big_line = ''

with open(Input_file, 'r', encoding ='utf8') as f1:
    for line in f1:
        big_line+=line.rstrip()
        
final = split_into_grams(big_line, type_ = 'bi_grams') + split_into_grams(big_line, type_ = 'uni_grams')


In [8]:
# file_ = "../icwb2-data/training/pku_training_simplified_InputFile_FEED.utf8"
# with open(file_, 'w') as t:
#     for item in final:
#         t.write("%s\n" % item)

In [9]:
# final_1 = []
# with open(file_, 'r', encoding ='utf8') as f1:
#     for line in f1:
#         final_1.append(line.rstrip())

In [10]:
vocab = set(final)

In [11]:
word_to_index = {value:key for key,value in enumerate(vocab)}
word_to_index['UNK'] = 0

In [19]:
len(vocab)

379343

In [20]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator


# creating feature vectors

In [21]:
#number assigned to a unigram or bigram in a sentence  based on the dictionary created earlier
#one feature vector are the numbers from a sentence (line of the Input file)

features_vectors = []
with open(Input_file, 'r', encoding ='utf8') as f1:
    for line in f1:
        l = line.rstrip()
        grams = split_into_grams(l, 'uni_grams') + split_into_grams(l,'bi_grams')

        #difference is creating by grams line by line
        features_vectors.append([word_to_index[i] for i in grams])

In [22]:
def OHE(label):
    label = label.reshape(-1, 1)
    enc = OneHotEncoder(categories='auto')
    enc.fit(label)
    return enc.transform(label).toarray()
BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}

In [23]:
#numerical BIES class given to a line 
labels = []
with open(Label_file, 'r', encoding ='utf8') as f1:
    count = 0
    for line in f1:
        l = line.rstrip()
        labels.append([BIES[i] for i in l])

In [24]:
TO_BE_FOUND = int(np.mean([len(i) for i in features_vectors])) #length of longest line
print("MAXLEN: {}".format(TO_BE_FOUND))

MAXLEN: 88


In [25]:
padded_labels = pad_sequences(labels, truncating='pre', padding='post', maxlen = TO_BE_FOUND)
y =  K.utils.to_categorical(padded_labels, num_classes=4)

In [26]:
X = pad_sequences(features_vectors, truncating='pre', padding='post', maxlen = TO_BE_FOUND)

### N_lines $\times$ charecters per line (padded)  $\times$ class from OneHotEncoding

In [27]:
y.shape

(53019, 88, 4)

### N_lines $\times$ charecters per line (padded)

In [28]:
X.shape


(53019, 88)

# model

In [29]:
#DEFINE SOME COSTANTS
MAX_LENGTH = 88
VOCAB_SIZE = len(vocab)
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256


https://github.com/keras-team/keras/issues/1029 

Explains Timedistributed in many-to-many models

In [30]:
TO_BE_FOUND

88

In [44]:
def create_keras_model(vocab_size, embedding_size, hidden_size, TO_BE_FOUND):
    print("Creating KERAS model")
    
    model = K.models.Sequential()
    # remember to set mask_zero=True or the model consider the padding as a valid timestep!
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = TO_BE_FOUND))
    #add a LSTM layer with some dropout in it
    model.add(K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    # add a dense layer with sigmoid to get a probability value from 0.0 to 1.0s
    model.add(K.layers.TimeDistributed(K.layers.Dense(4, activation='softmax')))
    #time distribution
    # we are going to use the Adam optimizer which is a really powerful optimizer.
    #optimizer = K.optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

    return model

In [54]:
batch_size = 32
epochs = 10
model = create_keras_model(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, TO_BE_FOUND)
# Let's print a summary of the model
model.summary()

Creating KERAS model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 88, 32)            12138976  
_________________________________________________________________
lstm_3 (LSTM)                (None, 88, 256)           295936    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 88, 4)             1028      
Total params: 12,435,940
Trainable params: 12,435,940
Non-trainable params: 0
_________________________________________________________________


In [46]:
cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")


Starting training...


In [55]:
percent = 10
size = int(len(X)/(1-percent))
X_train = X[:2000]
y_train = y[:2000]
dev_x = X[-300:]
dev_y = y[-300:]

In [56]:
print("X train {}".format(X_train.shape))
print("y train {}".format(y_train.shape))
print("X dev {}".format(dev_x.shape))
print("y dev {}".format(dev_y.shape))

X train (2000, 88)
y train (2000, 88, 4)
X dev (300, 88)
y dev (300, 88, 4)


In [None]:

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(dev_x, dev_y), callbacks=[cbk]) 
print("Training complete.\n")



Train on 2000 samples, validate on 300 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 288/2000 [===>..........................] - ETA: 22s - loss: 0.8766 - acc: 0.6000

In [None]:
#print("\nEvaluating test...")
#loss_acc = model.evaluate(test_x, test_y, verbose=0)
#print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

- original file $\rightarrow$ simplified Chinese
- Input file $\rightarrow$ used to feed Bi-LSTM model
- Label file $\rightarrow$ used to test the predictions

TO DO: probably need a decoder