# Allowed libraries
- Tensorflow (compatible with 1.12.x)
- Numpy
- Sklearn
- nltk
- Maplotlib
- gensim
- All the standard libraries
 

https://medium.com/the-artificial-impostor/nlp-four-ways-to-tokenize-chinese-documents-f349eb6ba3c3

https://stanfordnlp.github.io/CoreNLP/download.html

In [1]:
import os
import numpy as np
from typing import Tuple, List, Dict
import tensorflow.keras as K
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator
from sklearn.preprocessing import OneHotEncoder


  from ._conv import register_converters as _register_converters


In [2]:
#tf.__version__ fuck

In [3]:
def ChooseDataset(set_type, subset):
    '''returns paths to Label and Input file for a specific dataset
    args: set_type
    return: Label_file, Input_file
    '''
    datasets = {"training":'../icwb2-data/training',
                "dev":'../icwb2-data/gold',
                "testing":'../icwb2-data/testing'}

    def get_file_names(path, type_='LabelFile'):
        x = []
        dev = True if path.split("/")[-1] == 'gold' else False #checks for dev
        for i in os.listdir(path):
            if dev and i.split("_")[1][:4]=='test': #eliminates 'training' from 'gold'
                if os.path.splitext(i)[0].split("_")[-1] == type_:
                    x.append(os.path.join(path, i))
            elif not dev:
                if os.path.splitext(i)[0].split("_")[-1] == type_:
                    x.append(os.path.join(path, i))
        return x

    Label_files = get_file_names(path = datasets[set_type], type_ = 'LabelFile')
    Input_files = get_file_names(path = datasets[set_type], type_ = 'InputFile')
    names = ['msr','cityu','as','pku']
    choose = lambda i: i.split(".utf8")[0].split('/')[-1].split("_")[0]
    e, r = False, False
    chosen = False
    while not chosen:
        #x = input("Choose from the following: {}".format(names))
        x = subset
        for i in range(len(Label_files)):
            if choose(Input_files[i]) == x:
                Input_file = Input_files[i]
                e = True
            if choose(Label_files[i]) == x:
                Label_file = Label_files[i]
                r = True
            if e and r:
                chosen = True

    return Label_file, Input_file

In [15]:
class CreateDataset(object):
    '''makes feed files of combined unigrams and bigrams'''
    def __init__(self, LabelFile_path, InputFile_path, PaddingSize, set_type, TrainingVocab):
        self.Label_File = LabelFile_path
        self.Input_File = InputFile_path
        self.PaddingSize = PaddingSize
        self.set_type = set_type
        self.TrainingVocab = TrainingVocab
    
    def DateGen(self):
        '''creates labels from the label file'''
        
        uni_feature_vectors, bi_feature_vectors, uni_word_to_idx, bi_word_to_idx = self.FeatureGenerator() 
        
        labels = self.BIESToNumerical()
        Optimal_Line_Length = self.PaddingSize
        
        padded_labels = pad_sequences(labels, truncating='pre', padding='post', maxlen = self.PaddingSize)
        y =  K.utils.to_categorical(padded_labels, num_classes=4)
        
        X_unigrams = pad_sequences(uni_feature_vectors, truncating='pre', padding='post', maxlen = self.PaddingSize)
        X_bigrams = pad_sequences(bi_feature_vectors, truncating='pre', padding='post', maxlen = self.PaddingSize)
        info = {"MAXLEN": Optimal_Line_Length,
                "uni_VocabSize": len(uni_feature_vectors),
                "bi_VocabSize": len(bi_feature_vectors)}
        return X_unigrams, X_bigrams, y, info, uni_word_to_idx, bi_word_to_idx
    
    def BIESToNumerical(self):
        '''Converts Label File from BIES encoding to numerical classes'''
        BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}
        #numerical BIES class given to a line 
        labels = []
        with open(self.Label_File, 'r', encoding ='utf8') as f1:
            count = 0
            for line in f1:
                l = line.rstrip()
                labels.append([BIES[i] for i in l])
        return labels
    
    def FeatureGenerator(self):
        '''Generates features based on unigrams and bigrams going line by line
        returns: unigram_feature_vectors, bigram_feature_vectors
        if training then returns also the word_to_idx for both unigrams and bigrams
        '''
        
        uni_feature_vectors, bi_feature_vectors = [], []
        
        if self.set_type == 'training':
            uni_word_to_idx, bi_word_to_idx = self.generateVocab()
        else:
            uni_word_to_idx, bi_word_to_idx = self.TrainingVocab

        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            for line in f1:
                line = line.rstrip()
                
                unigrams = self.split_into_grams(line, 'uni_grams')
                bigrams = self.split_into_grams(line,'bi_grams')
                
                uni_feature_vectors.append([uni_word_to_idx.get(i, 0) for i in unigrams])
                bi_feature_vectors.append([bi_word_to_idx.get(i, 0) for i in bigrams])
                
        return uni_feature_vectors, bi_feature_vectors, uni_word_to_idx, bi_word_to_idx
    
    def generateVocab(self):
        '''
        Generates vocabulary based on file
        args: Inputfile, returns: word_to_index for unigrams and bigrams seperetly 
        '''
        with open(self.Input_File, 'r', encoding ='utf8') as f1:
            lines = f1.readlines()
            raw = ' '.join(' '.join(lines).split()) #one long string
        #creating unigrams and bigrams
        unigrams, bigrams = self.split_into_grams(raw, 'uni_grams'), self.split_into_grams(raw, 'bi_grams')
        del raw #erase from memory
        #geting seperate vocavularies
        unigrams_vocab, bigrams_vocab = set(unigrams), set(bigrams) 
        #print(len(unigrams_vocab), len(bigrams_vocab))
        #initializing sepeate dictionaries
        uni_word_to_idx, bi_word_to_idx = dict(), dict()
        #Handling OOV
        uni_word_to_idx["<UNK>"], bi_word_to_idx["<UNK>"] = 0, 0
        #creating the rest of the word to index dict
        uni_word_to_idx.update({value:key+1 for key,value in enumerate(unigrams_vocab)})
        bi_word_to_idx.update({value:key+1 for key,value in enumerate(bigrams_vocab)})
        
        return uni_word_to_idx, bi_word_to_idx
        
    
    @staticmethod
    def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
        """
        :param sentence Sentence as str
        :type_: uni_grams or _bigrams
        :return bigrams List of unigrams or bigrams
        """
        n = 1 if type_ == 'uni_grams' else 2
        grams = []
        for i in range(len(sentence)-1):
            gram = sentence[i:i+n]
            grams.append(gram)
        return grams

    

In [22]:
def data_feed(subset='pku',padding=50):
    '''function that creates a dataset -- training, dev, and test
    args: subset: any subset, padding: padding size
    returns: (X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev
    '''
    
    type_ = "training"
    print("*****{}*****".format(type_))
    Label_file, Input_file = ChooseDataset(type_, subset)
    A = CreateDataset(Label_file, Input_file, padding, type_, None)
    X_train_uni, X_train_bi, y_train, info_train, uni_word_to_idx, bi_word_to_idx = A.DateGen()
    print("X uni-bi shape: {}{}\ny shape: {}".format(X_train_uni.shape, X_train_bi.shape, y_train.shape))
    print(info_train)
    
    type_ = 'dev'
    Label_file, Input_file = ChooseDataset(type_, subset)
    print(Label_file, Input_file)
    A = CreateDataset(Label_file, Input_file, padding, type_, [uni_word_to_idx, bi_word_to_idx])
    X_dev_uni, X_dev_bi, y_dev, info_dev, _, _ = A.DateGen()
    print("*****{}*****".format(type_))
    print("X uni-bi shape: {}{}\ny shape: {}".format(X_dev_uni.shape, X_dev_bi.shape, y_train.shape))
    print(info_dev)
    
    type_ = 'testing'
    Label_file, Input_file = ChooseDataset(type_, subset)
    A = CreateDataset(Label_file, Input_file, padding, type_, [uni_word_to_idx, bi_word_to_idx])
    X_test_uni, X_test_bi, y_test, info_test, _, _ = A.DateGen()
    print("*****{}*****".format(type_))
    print("X uni-bi shape: {}{}\ny shape: {}".format(X_test_uni.shape, X_test_bi.shape, y_train.shape))
    print(info_test)
    
    return {"train": {"X": [X_train_uni, X_train_bi],
                      "y": y_train}, 
           "dev": {"X": [X_dev_uni, X_dev_bi],
                   "y": y_dev},
           "test": {"X": [X_test_uni, X_test_bi],
                    "y": y_test},
           "info": info_dev}



In [23]:
data = data_feed(subset='pku',padding=10)

*****training*****
X uni-bi shape: (19056, 10)(19056, 10)
y shape: (19056, 10, 4)
{'MAXLEN': 10, 'uni_VocabSize': 19056, 'bi_VocabSize': 19056}
../icwb2-data/gold/pku_test_gold_simplified_LabelFile.utf8 ../icwb2-data/gold/pku_test_gold_simplified_InputFile.utf8
*****dev*****
X uni-bi shape: (1945, 10)(1945, 10)
y shape: (19056, 10, 4)
{'MAXLEN': 10, 'uni_VocabSize': 1945, 'bi_VocabSize': 1945}
*****testing*****
X uni-bi shape: (1945, 10)(1945, 10)
y shape: (19056, 10, 4)
{'MAXLEN': 10, 'uni_VocabSize': 1945, 'bi_VocabSize': 1945}


# model

In [34]:
#DEFINE SOME COSTANTS
VOCAB_SIZE = 5000#info_dev['uni_VocabSize']
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256
PADDING_SIZE = data['info']['MAXLEN']

https://github.com/keras-team/keras/issues/1029 

Explains Timedistributed in many-to-many models

In [38]:
def create_keras_model(vocab_size, embedding_size, hidden_size, PADDING_SIZE):
    print("Creating KERAS model")
    model = K.models.Sequential()
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = PADDING_SIZE))
    
    #concatenate
    model.add(K.layers.Bidirectional(
              K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), merge_mode='concat'))
    model.add(K.layers.TimeDistributed(
              K.layers.Dense(4, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [39]:
batch_size = 128
epochs = 10
model = create_keras_model(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, PADDING_SIZE)
# Let's print a summary of the model
model.summary()

Creating KERAS model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 32)            160000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 512)           591872    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 4)             2052      
Total params: 753,924
Trainable params: 753,924
Non-trainable params: 0
_________________________________________________________________


In [37]:
cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")


Starting training...


In [44]:
model.fit(data["train"]["X"], data["train"]["y"], epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(data["dev"]["X"], data["dev"]["y"]), callbacks=[cbk]) 
print("Training complete.\n")



Train on 19056 samples, validate on 1945 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training complete.



In [None]:
model._get_callback_model()

In [None]:
model.save_weights('my_model_weights.h5') #saving weights for further analysis
model.save('my_model.h5')

In [46]:
print("\nEvaluating test...")
loss_acc = model.evaluate(data["test"]["X"], data["test"]["y"], verbose=3)
print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))


Evaluating test...
Test data: loss = 4.975396  accuracy = 3.67% 


In [None]:
!tensorboard --logdir logging

In [None]:
model.save_weights('my_model_weights.h5') #saving weights for further analysis
model.save('my_model.h5')

- original file $\rightarrow$ simplified Chinese
- Input file $\rightarrow$ used to feed Bi-LSTM model
- Label file $\rightarrow$ used to test the predictions

TO DO: probably need a decoder

In [None]:
1448/60

In [None]:
''