**Xing Yi Chan**

**R00183768**

### **Part 3**

The third task asks the machine to generate the reasons, and we use BLEU to evaluate them.

In [None]:
import pandas as pd
import numpy as np
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding
from pickle import dump, load

In [None]:
# import both training and testing data
# Every CSV is read with pandas and converted to a NumPy array via `.values`.
train_data = pd.read_csv('/content/drive/My Drive/NLP/dataset2/traindata/subtaskC_data_all.csv').values
# `names=` is passed here, so pandas treats the first row of this file as data
# (not as a header) and labels the columns id, s1, s2, s3.
train_label = pd.read_csv('/content/drive/My Drive/NLP/dataset2/traindata/subtaskC_answers_all.csv', names=['id', 's1', 's2', 's3']).values

# NOTE(review): unlike the training answers, the test answers are read with
# pandas' default header inference, so their first row becomes column names.
# Confirm that file really has a header row; otherwise one answer row is lost.
test_data = pd.read_csv('/content/drive/My Drive/NLP/dataset2/testdata/subtaskC_trial_data.csv').values
test_label = pd.read_csv('/content/drive/My Drive/NLP/dataset2/testdata/subtaskC_answers.csv').values

In [None]:
# preparing text
# Build one long lower-cased training corpus from every statement in
# train_data (column 1) and its three reference sentences in train_label
# (columns 1-3).
corpus_parts = []
for x in range(len(train_data)):
    # Join the four sentences with a space: plain `+` concatenation fused the
    # last word of one sentence with the first word of the next whenever a
    # sentence did not end with a full stop.
    sentences = ' '.join([train_data[x][1], train_label[x][1], train_label[x][2], train_label[x][3]])
    sentences = sentences.replace('.', ' ').lower() # replace fullstops to spaces and convert every word to lower case
    corpus_parts.append(sentences)

# Assemble the corpus once at the end (repeated `text = text + ...` is
# quadratic); the leading space of the original initial value is kept.
text = ' ' + ' '.join(corpus_parts)

### **Clean and save text**


In [None]:
def clean_doc(doc):
    """Turn raw text into a list of purely alphabetic tokens.

    Double hyphens are treated as word separators, the text is split on
    whitespace, every punctuation character is stripped from each token, and
    tokens that are not entirely alphabetic afterwards are discarded.

    Args:
        doc: The text to tokenise.

    Returns:
        A list of cleaned tokens, in their original order and original case.
    """
    # translation table that deletes every character in string.punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_token in doc.replace('--', ' ').split():
        word = raw_token.translate(strip_punctuation)
        # drops numbers, mixed alphanumerics and tokens emptied by stripping
        if word.isalpha():
            cleaned.append(word)
    return cleaned

# organize into sequences of tokens
# Each training sequence holds 50 input words plus the 1 word to predict.
length = 50 + 1
tokens = clean_doc(text)
# Slide a window of `length` tokens over the corpus and store each window as
# one space-joined line. The upper bound is len(tokens) + 1 so that the final
# window (ending at the last token) is included; the previous bound of
# len(tokens) silently dropped it.
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]

# write sequences into a file
def save_doc(lines, filename):
    """Write ``lines`` to ``filename`` separated by newlines.

    Any existing file is overwritten. No trailing newline is written.

    Args:
        lines: Iterable of strings to store, one per line.
        filename: Path of the file to create.
    """
    data = '\n'.join(lines)
    # the context manager closes the file even if write() raises
    with open(filename, 'w') as datafile:
        datafile.write(data)

# Destination file for the training sequences; save_doc writes them
# newline-separated, i.e. one space-joined sequence per line.
filename ='/content/drive/My Drive/NLP/dataset2/Part3_data'
save_doc(sequences, filename)

### **Read and encode sequences**

In [None]:
# load sequences
def load_doc(filename):
    """Read the whole text file ``filename`` and return its content.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()

# load file
doc = load_doc(filename)
lines = doc.split('\n')
# keep only the first 150000 sequences for training
lines = lines[:150000]

# encode sequences
# The tokenizer learns the vocabulary from the lines and then maps every word
# to its integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# vocab size
# +1 because Keras word indices start at 1; index 0 is reserved
vocab_size = len(tokenizer.word_index) + 1

# separate input and output
# NOTE(review): slicing below needs a rectangular array, i.e. every line must
# encode to the same number of indices.
sequences = np.array(sequences)

# all but the last index of each row are the inputs, the last index is the target
X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode the target word over the whole vocabulary
# NOTE(review): this matrix has len(lines) x vocab_size entries and may be very
# large in memory -- integer targets with a sparse loss would avoid it.
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

### **Create and train model**

In [None]:
# define model
# Next-word language model: an embedding of the seq_length input indices, two
# stacked LSTM layers, a dense hidden layer and a softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True so the second LSTM receives the full sequence
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            773200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 15464)             1561864   
Total params: 2,485,964
Trainable params: 2,485,964
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# compile model
# categorical cross-entropy matches the one-hot targets built with to_categorical
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
# trains for 5 epochs in mini-batches of 128 sequences
model.fit(X, y, batch_size=128, epochs=5)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f5a7aed1b00>

In [None]:
# save model to file
model.save('/content/drive/My Drive/NLP/dataset2/model.h5')

# save tokenizer
# The context manager guarantees the pickle file is flushed and closed; the
# bare open() passed to dump() left the handle open.
with open('/content/drive/My Drive/NLP/dataset2/tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

### **Generate text**

In [None]:
# generate sequence using language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by ``n_words`` words using the language model.

    At every step the running text is encoded, cut or left-padded to
    ``seq_length`` indices, and the most probable next word is appended.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: Number of input indices the model expects.
        seed_text: Text the generation starts from.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text

    # generate a fixed number of words:
    for _ in range(n_words):
        # encode text into integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes() no longer exists in recent Keras releases; taking
        # the argmax of predict() yields the same class for a softmax output
        # and gives a plain int rather than a one-element array.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted index to word; an index with no word (e.g. 0) maps to ''
        out_word = index_to_word.get(predicted_index, '')
        # append into input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# load the model
model = load_model('/content/drive/My Drive/NLP/dataset2/model.h5')

# load the tokenizer
# NOTE: unpickling can execute arbitrary code -- only load a tokenizer file
# that this notebook produced. The context manager closes the file, which the
# bare open() passed to load() never did.
with open('/content/drive/My Drive/NLP/dataset2/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# generate new text
# For every test item, seed the model with its sentence (column 1) and
# generate 10 words; each result row is [first column of the item, generated text].
results = []
for test_item in test_data:
    i = test_item[0]
    sentence = test_item[1]
    response = generate_seq(model, tokenizer, seq_length, sentence, 10)
    results.append([i, response])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# save generated results into file
# Each row of `results` is [first column of the test item, generated text];
# to_csv is called with its defaults, so the DataFrame index and header are written too.
pd.DataFrame(results).to_csv('/content/drive/My Drive/NLP/PartC(results).csv')

### **Evaluation**

In [None]:
# calculate the bleu score of the generated text
from nltk.translate.bleu_score import sentence_bleu

# clean the sentences
def clean_sen(input_sen):
    """Tokenise one sentence into lower-case alphabetic words.

    Punctuation is stripped from each token before the alphabetic check, the
    same way clean_doc does it. Checking ``isalpha()`` first discarded any
    word with punctuation attached, e.g. the final word "fridge." of a sentence.

    Args:
        input_sen: The sentence to clean.

    Returns:
        A list of lower-case tokens that consist of letters only.
    """
    # split into tokens by white space and remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in input_sen.split()]

    # remove remaining tokens that are not alphabetic and make lower case
    return [word.lower() for word in tokens if word.isalpha()]

# Score the generated sentences against the reference sentences with
# corpus-level BLEU. corpus_bleu lives in the same nltk module as the
# sentence_bleu imported above.
from nltk.translate.bleu_score import corpus_bleu

# Map each test id to its tokenised reference sentences. clean_sen works on a
# single string, so it is applied per sentence rather than to the whole array.
# Assumes test_label rows are laid out like the training answers (id first,
# then the reference sentences) -- TODO confirm.
references_by_id = {
    row[0]: [clean_sen(ref) for ref in row[1:] if isinstance(ref, str)]
    for row in test_label
}

reference = []   # per test item: list of tokenised reference sentences
hypothesis = []  # per test item: the tokenised generated sentence
for item_id, generated in results:
    item_references = references_by_id.get(item_id)
    if not item_references:
        # no reference available for this id, so it cannot be scored
        continue
    reference.append(item_references)
    # the generated text is the hypothesis; the gold answers are the references
    hypothesis.append(clean_sen(generated))

if not hypothesis:
    raise ValueError('No generated sentence could be matched to a reference by id')

# weights=(0.5, 0.5) averages unigram and bigram precision
bleuscore = corpus_bleu(reference, hypothesis, weights=(0.5, 0.5))
print('Bleu score of generated text is :', bleuscore)

Bleu score of generated text is : 71.63
