In [0]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import nltk
nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
train_file = pd.read_csv("/content/drive/Shared drives/EMNLP/full_prepped_training.csv")
test_file = pd.read_csv("/content/drive/Shared drives/EMNLP/prepped_test.csv")

In [0]:
# original; edited_sentence; word_position; cosine_similarity
def dataset_gathering(dataset):
    # origin = dataset["original"]
    edited = dataset["edited_sentence"]
    labels = dataset["meanGrade"]
    # position = dataset["word_position"]
    # cosine = dataset["cosine_similarity"]

    res = []
    for sample in zip(edited, labels):
        res.append(sample)
    return res

In [0]:
train_file = dataset_gathering(train_file)
test_file = dataset_gathering(test_file)

In [0]:
print(len(train_file))
print(len(test_file))

17900
3024


In [0]:
print(train_file[0])
print(test_file[0])

('France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq', 0.2)
('The Latest : Election tally shows Cars turning right', 1.2)


In [0]:
print('Tokenizeing...')
print('\n')

def tokenize(data):
    res = []
    for samples in data:
        # nltk.word_tokenize用于取tokens
        temp_t = nltk.word_tokenize(samples[0])
        res.append([temp_t, samples[1]])
    return res


train_file = tokenize(train_file)
test_file = tokenize(test_file)
print(train_file[0])
print(test_file[0])
print('\n')
print('Tokenization completed!')

Tokenizeing...


[['France', 'is', '‘', 'hunting', 'down', 'its', 'citizens', 'who', 'joined', 'twins', '’', 'without', 'trial', 'in', 'Iraq'], 0.2]
[['The', 'Latest', ':', 'Election', 'tally', 'shows', 'Cars', 'turning', 'right'], 1.2]


Tokenization completed!


In [0]:
def lower_case(data):
  print('-----Converting to lowercases-----')
  res = []
  temp_words = []
  for samples in data:
    for words in samples[0]:
      temp_words.append(words.lower())
    res.append([temp_words, samples[1]])
    temp_words = []
  print('-----Complete!-----')
  return res

train_file = lower_case(train_file)
test_file = lower_case(test_file)
print(train_file[0])
print(test_file[0])

-----Converting to lowercases-----
-----Complete!-----
-----Converting to lowercases-----
-----Complete!-----
[['france', 'is', '‘', 'hunting', 'down', 'its', 'citizens', 'who', 'joined', 'twins', '’', 'without', 'trial', 'in', 'iraq'], 0.2]
[['the', 'latest', ':', 'election', 'tally', 'shows', 'cars', 'turning', 'right'], 1.2]


In [0]:
# cacluating maxlen
def maxlen_cal(data):
  print('-----Cacluating maxlen-----')
  maxlen = 0
  for samples in data:
    temp = len(samples[0])
    if temp >= maxlen:
      maxlen = temp
  print('-----Complete!-----')
  return maxlen

In [0]:
print(maxlen_cal(train_file))
print(maxlen_cal(test_file))

-----Cacluating maxlen-----
-----Complete!-----
29
-----Cacluating maxlen-----
-----Complete!-----
27


In [0]:
maxlen = 30

In [0]:
def padding_len(data, maxlen):
  print('-----Padding for dataset-----')
  maxlen = maxlen
  res = []
  for samples in data:
    temp_len = len(samples[0])
    for k in range(maxlen - temp_len):
      samples[0].append('.')
    res.append([samples[0], samples[1]])
  print('-----Complete!-----')
  return res

In [0]:
train_file = padding_len(train_file, maxlen)
test_file = padding_len(test_file, maxlen)

print(maxlen_cal(train_file))
print(maxlen_cal(test_file))

-----Padding for dataset-----
-----Complete!-----
-----Padding for dataset-----
-----Complete!-----
-----Cacluating maxlen-----
-----Complete!-----
30
-----Cacluating maxlen-----
-----Complete!-----
30


In [0]:
print(train_file[0])
print(test_file[0])

[['france', 'is', '‘', 'hunting', 'down', 'its', 'citizens', 'who', 'joined', 'twins', '’', 'without', 'trial', 'in', 'iraq', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'], 0.2]
[['the', 'latest', ':', 'election', 'tally', 'shows', 'cars', 'turning', 'right', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'], 1.2]


In [0]:
def corpus_labels_gathering(data):
  corpus_res = []
  labels_res = []
  for samples in data:
    corpus_res.append(samples[0])
    labels_res.append(samples[1])
    
  labels_res = np.array(labels_res)
  return corpus_res, labels_res

In [0]:
x_train, y_train = corpus_labels_gathering(train_file)
x_test, y_test = corpus_labels_gathering(test_file)

print(y_train.shape)
print(y_test.shape)
print(x_train[0])
print(x_test[0])

(17900,)
(3024,)
['france', 'is', '‘', 'hunting', 'down', 'its', 'citizens', 'who', 'joined', 'twins', '’', 'without', 'trial', 'in', 'iraq', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
['the', 'latest', ':', 'election', 'tally', 'shows', 'cars', 'turning', 'right', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']


In [0]:
glove_dir = os.path.join('/content/drive/', 'My Drive')
embedding_dim = 300
# Dictionary where we store the word:vector_embedding map
embeddings_index = {}
word_index = {}
count=0

# Setting up embedding array
f = open(os.path.join(glove_dir, 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float64')
    if coefs.shape[0] != embedding_dim:
      continue
    # Embeddings is a dictionary of words:word_vector_embeddings
    embeddings_index[word] = coefs
    word_index[word] = count
    count+=1
f.close()
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400000 word vectors.


In [0]:
print('Our word index dictionary is given by: (word, index), a sample 10 entries are:')
list(word_index.items())[:10]

Our word index dictionary is given by: (word, index), a sample 10 entries are:


[('the', 0),
 (',', 1),
 ('.', 2),
 ('of', 3),
 ('to', 4),
 ('and', 5),
 ('in', 6),
 ('a', 7),
 ('"', 8),
 ("'s", 9)]

In [0]:
oov = "OOV"

def oov_vec_gathering(embedding_dim):
  res = np.random.rand(1, embedding_dim)
  return res

oov_vec = oov_vec_gathering(embedding_dim = embedding_dim)

In [0]:
embeddings_index[oov] = oov_vec
word_index[oov] = count + 1
count += 1
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 400001 word vectors.


In [0]:
# --- Preprocessing the GloVe word-embeddings matrix --
max_words = count
# Instantiating a 10000 x 100 matrix 
# embedding_matrix = (number_words, dim_embedding)
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    # Make sure that we are not exceeding the max token size
    if i < max_words:
        # Get the embedded vector for the word
        embedding_vector = embeddings_index.get(word)
        # Provided that a word is known store it in the 
        # embeddig matrix at position i
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [0]:
print("The size of the word embedding matrix is:" + str(embedding_matrix.shape))

The size of the word embedding matrix is:(400001, 300)


In [0]:
def data2vec(data):
  res = []
  for samples in data:
    for items in samples:
      if items not in embeddings_index.keys():
        word_index[items] = count
      res.append(word_index[items])

  res = np.array(res)
  return res

In [0]:
x_train = data2vec(x_train)
x_test = data2vec(x_test)

In [0]:
x_train = x_train.reshape(-1, maxlen)
x_test = x_test.reshape(-1, maxlen)
print(x_train.shape)
print(x_test.shape)

(17900, 30)
(3024, 30)


In [0]:
# model = keras.models.Sequential([
#         keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=True),
#         keras.layers.Bidirectional(keras.layers.LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True)),
#         keras.layers.Dropout(0.5),
#         keras.layers.Dense(1024),
#         keras.layers.LeakyReLU(alpha=0.3),
#         keras.layers.Dropout(0.5),
#         keras.layers.Dense(512),
#         keras.layers.LeakyReLU(alpha=0.3),
#         keras.layers.Dropout(0.5),
#         keras.layers.Dense(128),
#         keras.layers.LeakyReLU(alpha=0.2),
#         keras.layers.Dropout(0.5),
#         keras.layers.Dense(32), 
#         keras.layers.LeakyReLU(alpha=0.2),
#         keras.layers.Dropout(0.5),
#         # keras.layers.Dense(1, activation='relu')     
#         keras.layers.Dense(1, activation="relu")                
# ])
model = keras.models.Sequential([
        keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False),
        # keras.layers.Bidirectional(keras.layers.LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=True)),
        # keras.layers.Bidirectional(keras.layers.LSTM(1024, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=False)),
        keras.layers.LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=True),
        keras.layers.Bidirectional(keras.layers.LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=False)),
        keras.layers.Dense(1024, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'), 
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='relu')                     
])

In [0]:
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           120000300 
_________________________________________________________________
lstm_4 (LSTM)                (None, 30, 2048)          19243008  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 4096)              67125248  
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              4195328   
_________________________________________________________________
dropout_10 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)              

In [0]:
opt = keras.optimizers.adam(learning_rate=4e-5)
# opt = keras.optimizers.sgd(learning_rate=1e-3, momentum=0.9)
# opt = keras.optimizers.rmsprop(learning_rate=1e-3)

In [0]:
model.compile(optimizer=opt,
              loss = "mean_squared_error",
              metrics = ['mse'])

In [0]:
history = model.fit(x_train, y_train,
            epochs = 50,
            batch_size = 64,
            validation_data = [x_test, y_test])

Train on 17900 samples, validate on 3024 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50