# Allowed libraries
- Tensorflow (compatible with 1.12.x)
- Numpy
- Sklearn
- nltk
- Maplotlib
- gensim
- All the standard libraries
 

https://medium.com/the-artificial-impostor/nlp-four-ways-to-tokenize-chinese-documents-f349eb6ba3c3

https://stanfordnlp.github.io/CoreNLP/download.html

In [1]:
import os
import numpy as np
from typing import Tuple, List, Dict

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
#tf.__version__ fuck

In [3]:
def ChooseDataset(set_type, subset):
    '''returns paths to Label and Input file for a specific dataset
    args: set_type
    return: Label_file, Input_file
    '''
    datasets = {"training":'../icwb2-data/training',
                "dev":'../icwb2-data/gold',
                "testing":'../icwb2-data/testing'}
        
    def get_file_names(path, type_='LabelFile'):
        x = []
        dev = True if path.split("/")[-1] == 'gold' else False #checks for dev
        for i in os.listdir(path):
            if dev and i.split("_")[1][:4]=='test': #eliminates 'training' from 'gold' 
                if os.path.splitext(i)[0].split("_")[-1] == type_:
                    x.append(os.path.join(path, i))
            elif not dev:
                if os.path.splitext(i)[0].split("_")[-1] == type_:
                    x.append(os.path.join(path, i))
        return x
    
    Label_files = get_file_names(path = datasets[set_type], type_ = 'LabelFile')
    Input_files = get_file_names(path = datasets[set_type], type_ = 'InputFile')
    names = ['msr','cityu','as','pku']
    choose = lambda i: i.split(".utf8")[0].split('/')[-1].split("_")[0]
    e, r = False, False
    chosen = False
    while not chosen:
        #x = input("Choose from the following: {}".format(names))
        x = subset
        for i in range(len(Label_files)):
            if choose(Input_files[i]) == x: 
                Input_file = Input_files[i]
                e = True
            if choose(Label_files[i]) == x:
                Label_file = Label_files[i]
                r = True
            if e and r:
                chosen = True
            
    return Label_file, Input_file

In [4]:
class CreateDataset(object):
    '''makes feed files of combined unigrams and bigrams'''
    def __init__(self, LabelFile_path, InputFile_path, PaddingSize, set_type, TrainingVocab):
        self.Label_File = LabelFile_path
        self.Input_File = InputFile_path
        self.PaddingSize = PaddingSize
        self.set_type = set_type
        self.TrainingVocab = TrainingVocab
    
    def DateGen(self):
        '''creates labels from the label file'''
        
        features_vectors, word_to_index = self.FeatureGenerator() 
        labels = self.BIESToNumerical()
        #Optimal_Line_Length = int(np.mean([len(i) for i in features_vectors])) #length of longest line
        Optimal_Line_Length = self.PaddingSize
        
        #print("MAXLEN: {}".format(Optimal_Line_Length)) 
        padded_labels = pad_sequences(labels, truncating='pre', padding='post', maxlen = Optimal_Line_Length)
        
        y =  K.utils.to_categorical(padded_labels, num_classes=4)
        X = pad_sequences(features_vectors, truncating='pre', padding='post', maxlen = Optimal_Line_Length)
        
        info = {"MAXLEN": Optimal_Line_Length,
                "VocabSize": len(word_to_index)}
        return X, y, info, word_to_index
    
    def BIESToNumerical(self):
        '''Converts Label File from BIES encoding to numerical classes'''
        BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}
        #numerical BIES class given to a line 
        labels = []
        with open(self.Label_File, 'r', encoding ='utf8') as f1:
            count = 0
            for line in f1:
                l = line.rstrip()
                labels.append([BIES[i] for i in l])
        return labels
    
    def FeatureGenerator(self):
        '''Generates features based on '''
        features_vectors = []
        if self.set_type == 'training':
            word_to_index = self.generateVocab()
        else:
            word_to_index = self.TrainingVocab

        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            for line in f1:
                l = line.rstrip()
                grams = self.split_into_grams(l, 'uni_grams') + self.split_into_grams(l,'bi_grams')
                #difference is creating by grams line by line
                features_vectors.append([word_to_index.get(i, 0) for i in grams])
        return features_vectors, word_to_index
    
    def generateVocab(self):
        '''
        Generates vocabulary based on file
        args: Inputfile, returns: word_to_index dict
        '''
        big_line = ''
        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            for line in f1:
                big_line+=line.rstrip()
        final = self.split_into_grams(big_line, type_ = 'bi_grams') + self.split_into_grams(big_line, type_ = 'uni_grams')
        vocab = set(final)
        word_to_index = dict()
        word_to_index['<UNK>'] = 0
        word_to_index.update({value:key+1 for key,value in enumerate(vocab)})
        
        
        return word_to_index
        
    
    @staticmethod
    def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
        """
        :param sentence Sentence as str
        :type_: uni_grams or _bigrams
        :return bigrams List of unigrams or bigrams
        """
        n = 1 if type_ == 'uni_grams' else 2
        grams = []
        for i in range(len(sentence)-1):
            gram = sentence[i:i+n]
            grams.append(gram)
        return grams

    

In [5]:
def data(subset='pku',padding=50):
    '''function that creates a dataset -- training, dev, and test
    args: subset: any subset, padding: padding size
    returns: (X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev
    '''
    
    type_ = "training"
    print(type_)
    Label_file, Input_file = ChooseDataset(type_, subset)
    A = CreateDataset(Label_file, Input_file, padding, type_, None)
    X_train, y_train, info_train, word_to_index_training = A.DateGen()
    print("X shape: {}\ny shape: {}".format(X_train.shape, y_train.shape))
    print(info_train)
    
    type_ = 'dev'
    Label_file, Input_file = ChooseDataset(type_, subset)
    A = CreateDataset(Label_file, Input_file, padding, type_, word_to_index_training)
    X_dev, y_dev, info_dev, _ = A.DateGen()
    print(type_)
    print("X shape: {}\ny shape: {}".format(X_dev.shape, y_dev.shape))
    print(info_dev)
    
    type_ = 'testing'
    Label_file, Input_file = ChooseDataset(type_, subset)
    A = CreateDataset(Label_file, Input_file, padding, type_, word_to_index_training)
    X_test, y_test, info_test, _ = A.DateGen()
    print(type_)
    print("X shape: {}\ny shape: {}".format(X_test.shape, y_test.shape))
    print(info_dev)
    
    return (X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev



In [6]:
(X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev = data(subset='pku',padding=50)

training
X shape: (19056, 50)
y shape: (19056, 50, 4)
{'MAXLEN': 50, 'VocabSize': 285201}
dev
X shape: (1945, 50)
y shape: (1945, 50, 4)
{'MAXLEN': 50, 'VocabSize': 285201}
testing
X shape: (1945, 50)
y shape: (1945, 50, 4)
{'MAXLEN': 50, 'VocabSize': 285201}


# model

In [77]:
#DEFINE SOME COSTANTS
VOCAB_SIZE = info_dev['VocabSize']
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256
TO_BE_FOUND = info_dev['MAXLEN']

https://github.com/keras-team/keras/issues/1029 

Explains Timedistributed in many-to-many models

In [78]:
def create_keras_model(vocab_size, embedding_size, hidden_size, TO_BE_FOUND):
    print("Creating KERAS model")
    model = K.models.Sequential()
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = TO_BE_FOUND))
    model.add(K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    model.add(K.layers.TimeDistributed(K.layers.Dense(4, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

    return model

In [88]:
batch_size = 128
epochs = 10
model = create_keras_model(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, TO_BE_FOUND)
# Let's print a summary of the model
model.summary()

Creating KERAS model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 32)            12139008  
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 256)           295936    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 50, 4)             1028      
Total params: 12,435,972
Trainable params: 12,435,972
Non-trainable params: 0
_________________________________________________________________


In [89]:
cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")


Starting training...


In [90]:
max([max(i) for i in X_train]), max([max(i) for i in X_dev])

(379343, 379336)

In [91]:
K.callbacks.EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=2,
                              verbose=0, mode='auto')

<tensorflow.python.keras.callbacks.EarlyStopping at 0x188c49e10>

In [92]:

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(X_dev, y_dev), callbacks=[cbk, csv_logger]) 
print("Training complete.\n")



Train on 53019 samples, validate on 1493 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training complete.



In [103]:
model._get_callback_model()

<tensorflow.python.keras.engine.sequential.Sequential at 0x188c4a320>

In [94]:
model.save_weights('my_model_weights.h5') #saving weights for further analysis
model.save('my_model.h5')

In [100]:
print("\nEvaluating test...")
loss_acc = model.evaluate(X_test, y_test, verbose=3)
print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))


Evaluating test...
Test data: loss = 4.348248  accuracy = 18.46% 


In [106]:
!tensorboard --logdir logging

  from ._conv import register_converters as _register_converters
W0407 00:25:02.098168 123145567449088 plugin_event_accumulator.py:294] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
W0407 00:25:02.114874 123145567449088 plugin_event_accumulator.py:294] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
W0407 00:25:02.134521 123145567449088 plugin_event_accumulator.py:294] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
W0407 00:25:02.193573 123145567449088 plugin_event_accumulator.py:294] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Over

W0407 00:25:02.969447 123145567449088 plugin_event_accumulator.py:302] Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
^C


In [None]:
model.save_weights('my_model_weights.h5') #saving weights for further analysis
model.save('my_model.h5')

- original file $\rightarrow$ simplified Chinese
- Input file $\rightarrow$ used to feed Bi-LSTM model
- Label file $\rightarrow$ used to test the predictions

TO DO: probably need a decoder

In [None]:
1448/60

In [105]:
''

SyntaxError: EOL while scanning string literal (<ipython-input-105-8fd4db36f2b7>, line 1)