# Allowed libraries
- Tensorflow (compatible with 1.12.x)
- Numpy
- Sklearn
- nltk
- Maplotlib
- gensim
- All the standard libraries
 

https://medium.com/the-artificial-impostor/nlp-four-ways-to-tokenize-chinese-documents-f349eb6ba3c3

https://stanfordnlp.github.io/CoreNLP/download.html

In [1]:
import os
from sklearn.preprocessing import OneHotEncoder

In [2]:
class Make_Feed(object):
    '''makes feed files of combined unigrams and bigrams'''
    def __init__(self):
        pass

In [3]:
datasets = {"training":'../icwb2-data/training',
             "dev":'../icwb2-data/gold',
             "testing":'../icwb2-data/testing'}

In [4]:
def get_file_names(path, type_='LabelFile'):
    x = []
    for i in os.listdir(path):
        if os.path.splitext(i)[0].split("_")[-1] == type_:
            x.append(os.path.join(path, i))
    return x

Label_files = get_file_names(path = datasets['training'], type_ = 'LabelFile')
Input_files = get_file_names(path = datasets['training'], type_ = 'InputFile')

In [8]:
Input_files

['../icwb2-data/training/msr_training_simplified_InputFile.utf8',
 '../icwb2-data/training/cityu_training_simplified_InputFile.utf8',
 '../icwb2-data/training/as_training_simplified_InputFile.utf8',
 '../icwb2-data/training/pku_training_simplified_InputFile.utf8']

In [6]:
from typing import Tuple, List, Dict

def split_into_grams(sentence: str, type_ = 'uni_grams') -> List[str]:
    """
    :param sentence Sentence as str
    :type_: uni_grams or _bigrams
    :return bigrams List of unigrams or bigrams
    """
    n = 1 if type_ == 'uni_grams' else 2
    grams = []
    for i in range(len(sentence)-1):
        gram = sentence[i:i+n]
        grams.append(gram)
    return grams


In [9]:
big_line = ''

with open(Input_files[-1], 'r', encoding ='utf8') as f1:
    for line in f1:
        big_line+=line.rstrip()
        
final = split_into_grams(big_line, type_ = 'bi_grams') + split_into_grams(big_line, type_ = 'uni_grams')


In [None]:
file_ = "../icwb2-data/training/pku_training_simplified_InputFile_FEED.utf8"
with open(file_, 'w') as t:
    for item in final:
        t.write("%s\n" % item)

In [None]:
final_1 = []
with open(file_, 'r', encoding ='utf8') as f1:
    for line in f1:
        final_1.append(line.rstrip())

In [10]:
vocab = set(final)

In [11]:
word_to_index = {value:key for key,value in enumerate(vocab)}
word_to_index['UNK'] = 0

In [54]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator


  from ._conv import register_converters as _register_converters


# creating feature vectors

In [13]:
features_vectors = []
with open(Input_files[-1], 'r', encoding ='utf8') as f1:
    for line in f1:
        l = line.rstrip()
        grams = split_into_grams(l, 'uni_grams') + split_into_grams(l,'bi_grams')
        features_vectors.append([word_to_index[i] for i in grams])

In [59]:
def OHE(label):
    label = label.reshape(-1, 1)
    enc = OneHotEncoder(categories='auto')
    enc.fit(label)
    return enc.transform(label).toarray()
BIES = {'B' : 0, 'I' : 1, 'E' : 2, 'S' : 3}

In [104]:
labels = []
with open(Label_files[-1], 'r', encoding ='utf8') as f1:
    count = 0
    for line in f1:
        l = line.rstrip()
        labels.append([BIES[i] for i in l])

In [107]:
TO_BE_FOUND = 50 #length of longest line
padded = pad_sequences(labels, maxlen=TO_BE_FOUND)

In [119]:
y_train = OHE(padded)

In [127]:
X_train = pad_sequences(features_vectors, maxlen=TO_BE_FOUND)

In [122]:
y_train.shape


(952800, 4)

In [132]:
X_train = X_train.reshape(-1)

# model

In [113]:
#DEFINE SOME COSTANTS
MAX_LENGTH = 80
VOCAB_SIZE = 20000
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 100

import tensorflow as tf
import tensorflow.keras as K
import numpy as np 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator

In [112]:
def create_keras_model(vocab_size, embedding_size, hidden_size):
    print("Creating KERAS model")
    
    model = K.models.Sequential()
    # remember to set mask_zero=True or the model consider the padding as a valid timestep!
    model.add(K.layers.Embedding(vocab_size, embedding_size, mask_zero=True))
    #add a LSTM layer with some dropout in it
    model.add(K.layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
    # add a dense layer with sigmoid to get a probability value from 0.0 to 1.0
    model.add(K.layers.Dense(1, activation='sigmoid'))
    
    # we are going to use the Adam optimizer which is a really powerful optimizer.
    optimizer = K.optimizers.Adam()
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model

In [114]:
batch_size = 32
epochs = 3
model = create_keras_model(VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE)
# Let's print a summary of the model
model.summary()

Creating KERAS model
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          640000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 693,301
Trainable params: 693,301
Non-trainable params: 0
_________________________________________________________________


In [133]:
cbk = K.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")


Starting training...


In [139]:
percent = 10
size = int(len(X_train)/percent)
X_train = X_train[:size]
y_train = y_train[:size]
dev_x = X_train[size:]
dev_y = y_train[size:]

In [147]:
y_train.shape

(95280, 4)

In [144]:
X_train.shape

(95280,)

In [145]:

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(dev_x, dev_y), callbacks=[cbk]) 
print("Training complete.\n")

#print("\nEvaluating test...")
#loss_acc = model.evaluate(test_x, test_y, verbose=0)
#print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

ValueError: A target array with shape (95280, 4) was passed for an output of shape (None, 1) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

- original file $\rightarrow$ simplified Chinese
- Input file $\rightarrow$ used to feed Bi-LSTM model
- Label file $\rightarrow$ used to test the predictions

TO DO: probably need a decoder

In [None]:
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

In [None]:
from nltk.corpus import stopwords
eng_stword = stopwords.words("german")
eng_stword

In [None]:
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> segmenter = StanfordSegmenter(path_to_jar=”stanford-segmenter-3.4.1.jar”, path_to_sihan_corpora_dict=”./data”, path_to_model=”./data/pku.gz”, path_to_dict=”./data/dict-chris6.ser.gz”)
>>> sentence = u”这是斯坦福中文分词器测试”
>>> segmenter.segment(sentence)
>>> u’\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n’
>>> segmenter.segment_file(“test.simp.utf8”)
>>> u’\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd .
>>> outfile = open(‘outfile’, ‘w’)
>>> result = segmenter.segment(sentence)
>>> outfile.write(result.encode(‘UTF-8′))
>>> outfile.close()

In [None]:
sentence = '这是斯坦福中文分词器测试'

In [None]:
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter

In [None]:
#segmenter = StanfordSegmenter()
p = '../resources/stanford-chinese-corenlp-2018-10-05-models.jar'

In [None]:
segmenter = StanfordSegmenter(path_to_jar=p)

In [None]:
segmenter.segment(sentence)