# Allowed libraries
- Tensorflow (compatible with 1.12.x)
- Numpy
- Sklearn
- nltk
- Maplotlib
- gensim
- All the standard libraries
 

https://medium.com/the-artificial-impostor/nlp-four-ways-to-tokenize-chinese-documents-f349eb6ba3c3

https://stanfordnlp.github.io/CoreNLP/download.html

In [1]:
import os
import numpy as np
from typing import Tuple, List, Dict

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
def ChooseDataset():
    '''returns paths to Label and Input file for a specific dataset'''
    
    '''
    args: set_type, subset
    return: returns paths to Label and Input file
    '''
    datasets = {"training":'../icwb2-data/training',
                 "dev":'../icwb2-data/gold',
                 "testing":'../icwb2-data/testing'}
        
    def get_file_names(path, type_='LabelFile'):
        x = []
        for i in os.listdir(path):
            if os.path.splitext(i)[0].split("_")[-1] == type_:
                x.append(os.path.join(path, i))
        return x
    Label_files = get_file_names(path = datasets['training'], type_ = 'LabelFile')
    Input_files = get_file_names(path = datasets['training'], type_ = 'InputFile')
    
    e, r = False, False
    chosen = False
    while not chosen:
        print("Choose from the following")
        print(names)
        x = input("")
        for i in range(4):
            #print(choose(Input_files[i]))
            if choose(Input_files[i]) == x: 
                Input_file = Input_files[i]
                e = True
            if choose(Label_files[i]) == x:
                Label_file = Label_files[i]
                r = True
            if e and r:
                chosen = True
            
    return Label_file, Input_file

Choose from the following
['msr', 'cityu', 'as', 'pku']
pku
../icwb2-data/training/pku_training_simplified_LabelFile.utf8 ../icwb2-data/training/pku_training_simplified_InputFile.utf8


In [16]:
class CreateDataset(object):
    '''makes feed files of combined unigrams and bigrams'''
    def __init__(self, LabelFile_path, InputFile_path):
        self.Label_File = LabelFile_path
        self.Input_File = InputFile_path
    
    def DateGen(self):
        '''creates labels from the label file'''
        
        features_vectors = self.FeatureGenerator() 
        labels = self.BIESToNumerical()
        
        Optimal_Line_Length = int(np.mean([len(i) for i in features_vectors])) #length of longest line
        
        print("MAXLEN: {}".format(Optimal_Line_Length)) 
        padded_labels = pad_sequences(labels, truncating='pre', padding='post', maxlen = Optimal_Line_Length)
        
        y =  K.utils.to_categorical(padded_labels, num_classes=4)
        X = pad_sequences(features_vectors, truncating='pre', padding='post', maxlen = Optimal_Line_Length)
        
        return X, y
    
    def BIESToNumerical(self):
        '''Converts Label File from BIES encoding to numerical classes'''
        BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}
        #numerical BIES class given to a line 
        labels = []
        with open(self.Label_File, 'r', encoding ='utf8') as f1:
            count = 0
            for line in f1:
                l = line.rstrip()
                labels.append([BIES[i] for i in l])
        return labels
    
    def FeatureGenerator(self):
        '''Generates features based on '''
        features_vectors = []
        word_to_index = self.generateVocab()
        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            for line in f1:
                l = line.rstrip()
                grams = self.split_into_grams(l, 'uni_grams') + self.split_into_grams(l,'bi_grams')

                #difference is creating by grams line by line
                features_vectors.append([word_to_index[i] for i in grams])
        
        return features_vectors
    
    def generateVocab(self):
        '''
        Generates vocabulary based on file
        args: Inputfile, returns: word_to_index dict
        '''
        big_line = ''
        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            for line in f1:
                big_line+=line.rstrip()
        final = self.split_into_grams(big_line, type_ = 'bi_grams') + self.split_into_grams(big_line, type_ = 'uni_grams')
        vocab = set(final)
        word_to_index = {value:key for key,value in enumerate(vocab)}
        word_to_index['UNK'] = 0
        
        
        return word_to_index
        
    
    @staticmethod
    def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
        """
        :param sentence Sentence as str
        :type_: uni_grams or _bigrams
        :return bigrams List of unigrams or bigrams
        """
        n = 1 if type_ == 'uni_grams' else 2
        grams = []
        for i in range(len(sentence)-1):
            gram = sentence[i:i+n]
            grams.append(gram)
        return grams

    

In [None]:
Label_file, Input_file = ChooseDataset("training","pku")

In [17]:
A = CreateDataset(Label_file, Input_file)

In [18]:
X, y = A.DateGen()

MAXLEN: 189


In [19]:

y.shape

(19056, 189, 4)

In [20]:
Label_file, Input_file = ChooseDataset("training","pku")

NameError: name 'ChooseDataset' is not defined

In [None]:
datasets = {"training":'../icwb2-data/training',
             "dev":'../icwb2-data/gold',
             "testing":'../icwb2-data/testing'}

In [None]:
def get_file_names(path, type_='LabelFile'):
    x = []
    for i in os.listdir(path):
        if os.path.splitext(i)[0].split("_")[-1] == type_:
            x.append(os.path.join(path, i))
    return x

Label_files = get_file_names(path = datasets['training'], type_ = 'LabelFile')
Input_files = get_file_names(path = datasets['training'], type_ = 'InputFile')

In [None]:
names = ['msr','cityu','as','pku']
choose = lambda i: i.split(".utf8")[0].split('/')[-1].split("_")[0]

e, r = False, False
chosen = False
while not chosen:
    print("Choose from the following")
    print(names)
    x = input("")
    for i in range(4):
        print(choose(Input_files[i]))
        if choose(Input_files[i]) == x: 
            Input_file = Input_files[i]
            e = True
        if choose(Label_files[i]) == x:
            Label_file = Label_files[i]
            r = True
        if e and r:
            chosen = True
            print("CHOSEN {}".format(x))
    
    
print(Label_file, Input_file)

In [None]:


def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
    """
    :param sentence Sentence as str
    :type_: uni_grams or _bigrams
    :return bigrams List of unigrams or bigrams
    """
    n = 1 if type_ == 'uni_grams' else 2
    grams = []
    for i in range(len(sentence)-1):
        gram = sentence[i:i+n]
        grams.append(gram)
    return grams


In [None]:
big_line = ''

with open(Input_file, 'r', encoding ='utf8') as f1:
    for line in f1:
        big_line+=line.rstrip()
        
final = split_into_grams(big_line, type_ = 'bi_grams') + split_into_grams(big_line, type_ = 'uni_grams')


In [None]:
# file_ = "../icwb2-data/training/pku_training_simplified_InputFile_FEED.utf8"
# with open(file_, 'w') as t:
#     for item in final:
#         t.write("%s\n" % item)

In [None]:
# final_1 = []
# with open(file_, 'r', encoding ='utf8') as f1:
#     for line in f1:
#         final_1.append(line.rstrip())

In [None]:
vocab = set(final)

In [None]:
word_to_index = {value:key for key,value in enumerate(vocab)}
word_to_index['UNK'] = 0

In [None]:
len(vocab)

In [6]:
word_to_index

NameError: name 'word_to_index' is not defined

# creating feature vectors

In [None]:
#number assigned to a unigram or bigram in a sentence  based on the dictionary created earlier
#one feature vector are the numbers from a sentence (line of the Input file)

features_vectors = []
with open(Input_file, 'r', encoding ='utf8') as f1:
    for line in f1:
        l = line.rstrip()
        grams = split_into_grams(l, 'uni_grams') + split_into_grams(l,'bi_grams')

        #difference is creating by grams line by line
        features_vectors.append([word_to_index[i] for i in grams])

In [None]:
def OHE(label):
    label = label.reshape(-1, 1)
    enc = OneHotEncoder(categories='auto')
    enc.fit(label)
    return enc.transform(label).toarray()


In [None]:
BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}
#numerical BIES class given to a line 
labels = []
with open(Label_file, 'r', encoding ='utf8') as f1:
    count = 0
    for line in f1:
        l = line.rstrip()
        labels.append([BIES[i] for i in l])

In [None]:
TO_BE_FOUND = int(np.mean([len(i) for i in features_vectors])) #length of longest line
print("MAXLEN: {}".format(TO_BE_FOUND))

In [None]:
padded_labels = pad_sequences(labels, truncating='pre', padding='post', maxlen = TO_BE_FOUND)
y =  K.utils.to_categorical(padded_labels, num_classes=4)

In [None]:
X = pad_sequences(features_vectors, truncating='pre', padding='post', maxlen = TO_BE_FOUND)

### N_lines $\times$ charecters per line (padded)  $\times$ class from OneHotEncoding

In [None]:
y.shape

### N_lines $\times$ charecters per line (padded)

In [None]:
X.shape


# model

In [None]:
#DEFINE SOME COSTANTS
MAX_LENGTH = 88
VOCAB_SIZE = len(vocab)
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256


https://github.com/keras-team/keras/issues/1029 

Explains Timedistributed in many-to-many models

In [None]:
TO_BE_FOUND

In [None]:
def create_keras_model(vocab_size, embedding_size, hidden_size, TO_BE_FOUND):
    print("Creating KERAS model")
    
    model = K.models.Sequential()
    # remember to set mask_zero=True or the model consider the padding as a valid timestep!
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = TO_BE_FOUND))
    #add a LSTM layer with some dropout in it
    model.add(K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    # add a dense layer with sigmoid to get a probability value from 0.0 to 1.0s
    model.add(K.layers.TimeDistributed(K.layers.Dense(4, activation='softmax')))
    #time distribution
    # we are going to use the Adam optimizer which is a really powerful optimizer.
    #optimizer = K.optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

    return model

In [None]:
def create_keras_model(vocab_size, embedding_size, hidden_size, TO_BE_FOUND):
    print("Creating KERAS model")
    
    model = K.models.Sequential()
    # remember to set mask_zero=True or the model consider the padding as a valid timestep!
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = TO_BE_FOUND))
    #add a LSTM layer with some dropout in it
    model.add(K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    # add a dense layer with sigmoid to get a probability value from 0.0 to 1.0s
    model.add(K.layers.TimeDistributed(K.layers.Dense(4, activation='softmax')))
    #time distribution
    # we are going to use the Adam optimizer which is a really powerful optimizer.
    #optimizer = K.optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

    return model

In [None]:
batch_size = 32
epochs = 10
model = create_keras_model(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, TO_BE_FOUND)
# Let's print a summary of the model
model.summary()

In [None]:
cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")

In [None]:
percent = 10
size = int(len(X)/(1-percent))
X_train = X[:2000]
y_train = y[:2000]
dev_x = X[-300:]
dev_y = y[-300:]

In [None]:
print("X train {}".format(X_train.shape))
print("y train {}".format(y_train.shape))
print("X dev {}".format(dev_x.shape))
print("y dev {}".format(dev_y.shape))

In [None]:

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(dev_x, dev_y), callbacks=[cbk]) 
print("Training complete.\n")



In [None]:
#print("\nEvaluating test...")
#loss_acc = model.evaluate(test_x, test_y, verbose=0)
#print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

- original file $\rightarrow$ simplified Chinese
- Input file $\rightarrow$ used to feed Bi-LSTM model
- Label file $\rightarrow$ used to test the predictions

TO DO: probably need a decoder