## Author: Isaac Coffie
## Course: NLP
## Date: April 26, 2020
## Assignment: POS Tagging Using LSTM

In [None]:
##Import libraries
import numpy as np
import pandas as pd

##preprocessing utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


## Model fitting utils
from keras.models import Sequential
from keras.layers import InputLayer, Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Activation
from keras.optimizers import Adam

In [None]:
from pickle import load
from pickle import dump


## Some helper functions

In [None]:
## define helper functions
## maps predicted class indices to their POS tag strings
def convert_logits_to_class_labels(model_pred_tags, index_to_tag_dict, limit):
    """Translate the leading predicted class indices into POS tag labels.

    Args:
        model_pred_tags: iterable of class indices; every converted index
            must be a key of ``index_to_tag_dict``.
        index_to_tag_dict: mapping from class index to POS tag.
        limit: number of leading predictions to convert. Conversion stops
            once this many labels were collected; if the input is shorter,
            every prediction is converted.

    Returns:
        A numpy array with the converted tags, in input order.

    Raises:
        KeyError: if a converted index is missing from ``index_to_tag_dict``.
    """
    labels = []
    for position, class_index in enumerate(model_pred_tags):
        # `position` equals the number of labels gathered so far
        if position == limit:
            break
        labels.append(index_to_tag_dict[class_index])
    return np.asarray(labels)

## encodes a sentence (sequence of words) as a batch holding one integer sequence
def convert_sentence_to_integer_sequence(sentence_to_index_dict, sentence):
    """Encode ``sentence`` with the word-to-index dictionary.

    Args:
        sentence_to_index_dict: mapping from word to integer index; it must
            contain an ``'OOV'`` entry whenever the sentence holds a word
            that is not in the mapping.
        sentence: iterable of words.

    Returns:
        A numpy array of shape ``(1, len(sentence))`` -- the encoded
        sentence wrapped in an outer list.

    Raises:
        KeyError: if a word is unknown and the mapping has no ``'OOV'`` key.
    """
    encoded = [
        sentence_to_index_dict[word]
        if word in sentence_to_index_dict
        else sentence_to_index_dict['OOV']  # unknown words share one index
        for word in sentence
    ]
    return np.asarray([encoded])

# save a picklable object (e.g. a list of clean sentences) to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` into ``filename`` and print a confirmation.

    Args:
        sentences: any picklable object to store.
        filename: path of the file to (over)write in binary mode.
    """
    # The context manager closes the handle even if pickling fails, so the
    # bytes are flushed to disk before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)
 
# load a clean dataset
def load_clean_sentences(filename):
    """Return the object stored in the pickle file ``filename``.

    Args:
        filename: path of a pickle file, e.g. one written by
            ``save_clean_data``.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # were produced by this notebook or another trusted source.
    # The context manager guarantees the file handle is closed after reading.
    with open(filename, 'rb') as handle:
        return load(handle)

## Load Dataset

In [None]:
## Mount Google Drive at /content/drive (needs the google.colab package;
## the run recorded below asked for an authorization code)
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
## change directory to point to the Google drive folder that contains the dataset
## (%cd and !pwd are IPython/Colab commands, so this cell only runs inside a notebook;
## !pwd prints the resulting working directory as a check)
%cd "/content/drive/My Drive/NLP_Class"
!pwd

/content/drive/My Drive/NLP_Class
/content/drive/My Drive/NLP_Class


In [None]:

## folder on the mounted Google Drive and the training file inside it
## (the file name carries the leading "/" that joins the two parts)
input_path = "/content/drive/My Drive/NLP_Class"
training_data = "/sentence_pos_training.csv"

## read the training CSV into a DataFrame
training_df = pd.read_csv(f"{input_path}{training_data}")


In [None]:
## extract useful columns: drop the first column and keep the rest
## (sentence_id, token_id and pos_tag are the columns used further down)
training_df = training_df.iloc[:, 1:]

## maintain only sentences with more than two tokens (rows).
## "size" is pandas' built-in group-length aggregation: it counts rows
## exactly like len() did, without calling a Python function per sentence.
tokens_per_sentence = training_df.groupby('sentence_id').token_id.transform('size')
local_train = training_df[tokens_per_sentence > 2]

## last expression of the cell, so the notebook displays the final rows
local_train.tail(10)


Unnamed: 0,sentence_id,token_id,pos_tag
4107505,748062,in,IN
4107506,748062,the,DT
4107507,748062,Morbihan,NN
4107508,748062,department,NN
4107509,748062,of,IN
4107510,748062,Brittany,NNP
4107511,748062,in,IN
4107512,748062,north,RB
4107513,748062,western,JJ
4107514,748062,France,NNP


## Creating Sequences of Sentences

In [None]:
## sorted ids of the sentences that survived the length filter
unique_sentences_train = np.unique(local_train.sentence_id.values)

## one array of tokens / POS tags per sentence, aligned by position
sentences_sequence = list()
sentences_tagged_sequence = list()

## A single groupby pass replaces one boolean mask over the whole frame per
## sentence (quadratic in the corpus size). sort=True visits the sentence ids
## in ascending order, i.e. the same order as np.unique above, and rows keep
## their original order inside each group.
for _, sentence_rows in local_train.groupby("sentence_id", sort=True):
    sentences_sequence.append(sentence_rows["token_id"].values.flatten())
    sentences_tagged_sequence.append(sentence_rows["pos_tag"].values.flatten())

## one summary line instead of printing every sentence id
print("number of sentences:", len(sentences_sequence))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
740255
740256
740257
740258
740259
740260
740261
740263
740265
740267
740269
740270
740271
740272
740273
740275
740276
740277
740278
740279
740280
740282
740283
740284
740288
740289
740290
740292
740293
740295
740296
740298
740299
740301
740302
740303
740304
740305
740306
740310
740311
740313
740314
740315
740316
740317
740319
740323
740327
740329
740331
740332
740333
740334
740335
740336
740338
740342
740344
740346
740348
740349
740351
740352
740353
740359
740361
740362
740363
740364
740365
740367
740368
740370
740373
740374
740379
740382
740384
740387
740388
740389
740390
740391
740393
740396
740397
740400
740404
740405
740407
740409
740410
740411
740414
740415
740416
740417
740420
740422
740424
740427
740428
740429
740431
740432
740433
740434
740435
740436
740438
740439
740440
740441
740442
740443
740444
740445
740446
740448
740450
740452
740453
740454
740455
740456
740458
740460
740462
740464
740465
740467
740470
7404

### Flatten the sentence array above

In [None]:

## Collapse the per-sentence arrays into one flat Python list of tokens and
## one flat Python list of tags
flattend_sentence, flattend_tags = (
    np.concatenate(per_sentence_arrays).ravel().tolist()
    for per_sentence_arrays in (sentences_sequence, sentences_tagged_sequence)
)

## Reserve a vocabulary entry for words that only occur in the test data
## ("OOV" = out of vocabulary); the tag list gets no such placeholder
flattend_sentence += ["OOV"]

### Convert the words into integer values (required by the embedding layer)

In [None]:
## Unique tags, computed once (np.unique returns them sorted).
## The "nan" entry is dropped BEFORE numbering so that both tag dictionaries
## describe the same contiguous set of classes, and nothing fails when the
## data contains no "nan" at all.
## NOTE(review): "nan" is presumably a missing pos_tag value that np.unique
## stringified -- confirm against the raw CSV.
unique_tags = [tag for tag in np.unique(flattend_tags) if tag != "nan"]

tags_to_index_encoded = {tag: index for index, tag in enumerate(unique_tags)}

## exact inverse of the mapping above
index_to_tag_encoded = {index: tag for tag, index in tags_to_index_encoded.items()}

sentence_to_index_encoded = {token: index for index, token in enumerate(np.unique(flattend_sentence))}

## NOTE(review): both numberings start at 0, and the pad_sequences calls
## further down are made without a `value` argument -- if its default fill
## value is 0, padding is indistinguishable from the first word / first tag.
## Confirm whether index 0 should be reserved for padding.

print("size of vocabularly", len(sentence_to_index_encoded))


size of vocabularly 239087


In [None]:
## Persist the three lookup dictionaries so the test cell can reload them
for mapping, pickle_name in (
    (sentence_to_index_encoded, 'sentence_to_index_encoded.pkl'),
    (tags_to_index_encoded, 'tags_to_index_encoded.pkl'),
    (index_to_tag_encoded, 'index_to_tag_encoded.pkl'),
):
    save_clean_data(mapping, pickle_name)



Saved: sentence_to_index_encoded.pkl
Saved: tags_to_index_encoded.pkl
Saved: index_to_tag_encoded.pkl


## Vectorize the sequence of words in a sentence using their corresponding index number in the above dictionary

In [None]:
## Start with the sentences: replace every token by its vocabulary index;
## tokens that are not in the vocabulary fall back to the 'OOV' index
sentences_vectorized = [
    [
        sentence_to_index_encoded[token]
        if token in sentence_to_index_encoded
        else sentence_to_index_encoded['OOV']
        for token in entry
    ]
    for entry in sentences_sequence
]

## spot-check one encoded sentence (displayed as the cell output)
sentences_vectorized[15]

[167425,
 179940,
 178860,
 167686,
 126373,
 2725,
 50567,
 209995,
 227640,
 197079,
 158849,
 232074,
 186676,
 195137,
 226580]

In [None]:
## vectorize the sequence of tags as well
import random

tags_vectorized = []

## fallback tags used when a tag has no entry in tags_to_index_encoded
arr = ["NN", "VBG", "JJ", "SYM"]

## NOTE(review): an unknown tag is replaced by a tag drawn at random from
## `arr`, so the training labels differ from run to run (no seed is set) --
## confirm this is intended.
for tags in sentences_tagged_sequence:
    this_tag = [
        tags_to_index_encoded[tag]
        if tag in tags_to_index_encoded
        else tags_to_index_encoded[random.choice(arr)]
        for tag in tags
    ]
    tags_vectorized.append(this_tag)

### Pad sequences to max length

In [None]:
## every sentence is brought to exactly MAX_SEQUENCE positions
MAX_SEQUENCE = 25

## shared options: fixed length, padding added at the end of short sentences.
## NOTE(review): no `truncating` argument is given, so sentences longer than
## MAX_SEQUENCE are cut according to pad_sequences' default -- confirm which
## end is dropped and that it is applied the same way at prediction time.
pad_options = {"maxlen": MAX_SEQUENCE, "padding": 'post'}

sentences_vectorized_padded = pad_sequences(sentences_vectorized, **pad_options)
tags_vectorized_padded = pad_sequences(tags_vectorized, **pad_options)

In [None]:
## Save the padded sentence and tag arrays
for padded_array, pickle_name in (
    (sentences_vectorized_padded, 'sentences_vectorized_padded.pkl'),
    (tags_vectorized_padded, 'tags_vectorized_padded.pkl'),
):
    save_clean_data(padded_array, pickle_name)

NameError: ignored

## Building the model

In [None]:
## Assemble the tagger as an ordered stack of layers
model = Sequential([
    ## input: one integer word index per position, MAX_SEQUENCE positions
    InputLayer(input_shape=(MAX_SEQUENCE, )),
    ## embedding: a 200-dimensional vector per vocabulary entry
    Embedding(input_dim=len(sentence_to_index_encoded), output_dim=200, input_length=MAX_SEQUENCE),
    ## bidirectional LSTM with 128 units per direction, one output per position
    Bidirectional(LSTM(128, return_sequences=True)),
    ## dense layer applied at every position, one unit per tag
    TimeDistributed(Dense(len(tags_to_index_encoded))),
    ## softmax turns the per-position scores into tag probabilities
    Activation('softmax'),
])

## compile the model: cross-entropy loss on one-hot targets, Adam optimizer
## (0.001 passed as its first argument), accuracy as the reported metric
model.compile(
    optimizer=Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 200)           47817400  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 256)           336896    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 25, 88)            22616     
_________________________________________________________________
activation_1 (Activation)    (None, 25, 88)            0         
Total params: 48,176,912
Trainable params: 48,176,912
Non-trainable params: 0
_________________________________________________________________


### Converting pos tags to categorical for the dense layer

In [None]:
## convert the POS tags to one hot categorical encoding.
## num_classes is pinned to the width of the model's output layer: without
## it to_categorical infers the class count from the largest index present
## in the data, which need not match the number of units in the dense layer.
encoded = to_categorical(tags_vectorized_padded, num_classes=model.output_shape[-1])

## train for 40 epochs in batches of 200, validating on a 20% split
model.fit(sentences_vectorized_padded, encoded, batch_size=200, epochs=40, validation_split=0.2)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 308758 samples, validate on 77190 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x7fcb875ca710>

## Evaluate and save model

In [None]:
## Evaluate on the arrays the model was fitted on and keep the result
## (previously the returned values were computed and silently discarded).
## The model is compiled with a single metric, so evaluate() returns the
## pair [loss, accuracy].
train_loss, train_accuracy = model.evaluate(sentences_vectorized_padded, encoded)
print("loss: %.4f - accuracy: %.4f" % (train_loss, train_accuracy))

# save model and architecture to single file
model.save("coffie_model_latest.h5")
print("Successfully saved model to file")


Successfully saved model to file


In [None]:
## Evaluate as the cell's last expression so the returned values are displayed.
## NOTE(review): these are the same arrays that were passed to model.fit, so
## this is not a score on held-out data.
model.evaluate(sentences_vectorized_padded, encoded)




[0.03857001174768763, 0.9952389597892761]

## Test the model

In [None]:
from keras.models import load_model

MAX_SEQUENCE = 25

## load model
model = load_model('coffie_model_latest.h5')

# summarize the model to ensure it is still the one that we saved
# (summary() prints on its own and returns None, so it is not wrapped in print)
model.summary()

## get test file names... please change the file names to the correct test file
test_file_name = "test_sentences_hw02.csv" ## change here
test_file_output_name = "coffie_pos_2.csv"

## read test file. It is parsed with read_csv's default (comma) separator;
## pass the `sep` parameter if the test file uses another delimiter
testing_df = pd.read_csv(test_file_name)
unique_sentences_test = np.unique(testing_df.sentence_id.values)

sentence_to_index_encoded = load_clean_sentences('sentence_to_index_encoded.pkl')
index_to_tag_encoded = load_clean_sentences('index_to_tag_encoded.pkl')

## one small DataFrame per sentence; they are concatenated once at the end
## instead of growing a DataFrame row block by row block inside the loop
per_sentence_frames = list()

## A single groupby pass visits the sentences in ascending sentence_id order
## (same order as np.unique) without re-scanning the whole frame each time.
for sentence, this_sentence_record_new in testing_df.groupby("sentence_id", sort=True):
    sent_flattened = this_sentence_record_new["before"].values.flatten()
    sent_tokens = this_sentence_record_new["token_id"].values.flatten()

    ## convert_sentence_to_integer_sequence
    sequence_data = convert_sentence_to_integer_sequence(sentence_to_index_encoded, sent_flattened)

    ## pad sequence to MAX_SEQUENCE. truncating='post' keeps the FIRST
    ## MAX_SEQUENCE words of a longer sentence, so position i of the
    ## prediction belongs to sent_tokens[i] below.
    sequence_data_padded = pad_sequences(sequence_data, maxlen=MAX_SEQUENCE, padding='post', truncating='post')

    ## only the first MAX_SEQUENCE tokens of a sentence receive a tag
    sent_len = min(len(sent_flattened), MAX_SEQUENCE)
    sent_tokens = sent_tokens[:sent_len]

    ## Make prediction: most probable class per position
    ## (argmax over the class axis of the predicted probabilities)
    predicted_class_ids = np.argmax(model.predict(sequence_data_padded), axis=-1)
    predicted_labels = convert_logits_to_class_labels(predicted_class_ids[0], index_to_tag_encoded, sent_len)

    ## format the predicted labels correctly
    per_sentence_frames.append(
        pd.DataFrame({"sentence_id": np.repeat(sentence, sent_len), "token_id": sent_tokens, "pos": predicted_labels})
    )

if per_sentence_frames:
    out_pos_test = pd.concat(per_sentence_frames, ignore_index=True)
else:
    ## empty test file: still produce a CSV with the expected header
    out_pos_test = pd.DataFrame(columns=["sentence_id", "token_id", "pos"])

## save to a comma-separated csv file
out_pos_test["sentence_id"] = out_pos_test["sentence_id"].astype(int)
out_pos_test["token_id"] = out_pos_test["token_id"].astype(int)
out_pos_test.to_csv(test_file_output_name, index=False)
## the file is written relative to the current working directory
print("Tagged %d sentences -> %s" % (len(per_sentence_frames), test_file_output_name))


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['Brigaglia' 'Storia' 'della' 'Sardegna' '(' '1995' ')' 'p' '.43' '.']
740685
['Nigeria' 'has' 'joined' 'the' 'space' 'race' 'Others' 'should' 'join'
 'too' '"' '.']
740695
['Code' 'of' 'Honor' 'consists' 'of' 'an' 'oath' 'and' 'five' 'tenets' '.']
740696
['Barksdale' 'and' 'Stringer' 'Bell' 'meet' 'with' 'Wee' 'Bey' 'Brice'
 'and' 'tell' 'him' 'to' 'contact' 'Omar' 'Little' 'to' 'negotiate' 'a'
 'truce' '.']
740697
['the' '1995' 'Nunavut' 'capital' 'plebiscite' ',' 'Iqaluit' 'defeated'
 'Rankin' 'Inlet' 'to' 'become' 'territorial' 'capital' 'of' 'Nunavut' '.']
740699
['the' 'same' 'time' ',' 'the' 'League' 'was' 'incorporated' 'and'
 'engaged' 'its' 'first' 'Assistant' 'Secretary' 'to' 'increase' 'its'
 'services' 'to' 'members' '.']
740703
['$153,000' 'in' 'college' 'scholarships' 'are' 'awarded' 'at' 'each'
 'national' 'tournament' ',' 'making' 'it' 'possible' 'for' 'students'
 'to' 'pursue' 'post' 'secondary' 'educati

In [None]:
## joining the 2 dataframes... Because of computing and internet issues I had to divide the test dataset into 2
df_2 = pd.read_csv("coffie_pos_2.csv")
df_1 = pd.read_csv("coffie_pos_1.csv")

## drop the last 13 rows of the first part before joining
## NOTE(review): presumably these rows are repeated at the start of the
## second part -- confirm the count whenever the split point changes
df_1 = df_1.iloc[0:-13, :]

## pd.concat replaces DataFrame.append, which newer pandas releases no longer provide
final_output = pd.concat([df_1, df_2], ignore_index=True)
final_output.to_csv("coffie_pos.csv", index=False)

Unnamed: 0,sentence_id,token_id,pos
0,1,0,NN
1,1,1,NN
2,1,2,NNP
3,1,3,NN
4,1,4,IN
