In [None]:
# Mount Google Drive at /content/drive; the file paths used throughout this notebook
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pin the numpy and TensorFlow versions. Later cells use the TF 1.x graph/session API
# (tf.Session, tf.train.import_meta_graph), so a 1.x release is required.
!pip install numpy==1.16.1
!pip install tensorflow==1.15

In [None]:
import tensorflow as tf
from tensorflow.contrib import learn
import pandas as pd
import numpy as np
import datetime
import pickle as pkl
import time
import json
import os
import sys
import csv
os.chdir('/content/drive/My Drive')

Import hand-crafted features

In [None]:
# Read columns 1 through 11 of the tab-separated feature table, skipping its header row.
handcrafted_features = np.loadtxt(
    'handcrafted.txt',
    delimiter='\t',
    skiprows=1,
    usecols=tuple(range(1, 12)),
)

In [None]:
# Drop the column at index 6 from the feature matrix.
# NOTE(review): this cell rebinds its own input, so running it a second time removes
# another column — run it exactly once after loading.
handcrafted_features = np.delete(handcrafted_features, 6, axis = 1)

In [None]:
# Min-max normalize every feature column to the [0, 1] range.
# `hf` is an alias of `handcrafted_features`, so the matrix is rescaled in place.
hf = handcrafted_features
col_min = hf.min(axis=0)
col_span = hf.max(axis=0) - col_min
# A constant column has zero span; dividing by it would fill the whole column with NaN.
# Use a span of 1 there instead, which maps such a column to all zeros.
col_span[col_span == 0] = 1
hf[:] = (hf - col_min) / col_span

Import sentence embedding

In [None]:
# Load the inputs and labels of the semantic model.
semantic_contents = np.load('/content/drive/My Drive/Graduate Thesis/Semantic Model/word embedding/set1_data.npy')
semantic_labels = np.load('/content/drive/My Drive/Graduate Thesis/Semantic Model/word embedding/set1_label.npy')

# Record the length of every entry before any padding, and the longest of them.
semantic_lengths = np.array([len(entry) for entry in semantic_contents])
semantic_max_length = max(semantic_lengths)

In [None]:
# Pad every entry with all-zero rows (768 values per row) until it is as long as the
# longest entry; entries that already have the maximum length are left untouched.
tmp = [0] * 768
for idx, entry in enumerate(semantic_contents):
  missing = semantic_max_length - len(entry)
  if missing > 0:
    tmp_ = np.tile(tmp, (missing, 1))
    semantic_contents[idx] = np.append(entry, tmp_, axis=0)

In [None]:
# Stack the equally shaped (already padded) entries into one dense array with a new
# leading axis. A single np.stack call replaces the loop that re-concatenated the
# growing array once per entry: that loop copied all accumulated data on every
# iteration and left a plain Python list behind when there was only one entry.
semantic_data = np.stack(list(semantic_contents), axis=0)
semantic_contents = semantic_data

In [None]:
# Cache the stacked inputs, the labels and the pre-padding lengths as .npy files
# (np.save appends the .npy suffix); the next cell reads these files back.
np.save('/content/drive/My Drive/XGBoost Model/data/semantic_data', semantic_contents)
np.save('/content/drive/My Drive/XGBoost Model/data/semantic_label', semantic_labels)
np.save('/content/drive/My Drive/XGBoost Model/data/semantic_length', semantic_lengths)

In [None]:
# Reload the cached semantic arrays written by the np.save cell above, so the
# loading/padding cells do not have to be re-run in a fresh session.
semantic_contents = np.load('/content/drive/My Drive/XGBoost Model/data/semantic_data.npy')
semantic_labels = np.load('/content/drive/My Drive/XGBoost Model/data/semantic_label.npy')
semantic_lengths = np.load('/content/drive/My Drive/XGBoost Model/data/semantic_length.npy')

In [None]:
# Fraction of samples that carry each integer label value 0..12.
label_counts = np.array([np.sum(semantic_labels == value) for value in range(0, 13)])
label_rate = label_counts / (len(semantic_labels))

# Per-sample lookup: for every sample, the rate of that sample's own label.
label_rates = [label_rate[int(label)] for label in semantic_labels]

In [None]:
# Load the inputs and labels of the prompt-related model.
prompt_contents = np.load('/content/drive/My Drive/Prompt-related Model/word embedding/set1_gold_data.npy')
prompt_labels = np.load('/content/drive/My Drive/Prompt-related Model/word embedding/set1_gold_label.npy')

# Record the length of every entry before any padding, and the longest of them.
prompt_lengths = np.array([len(entry) for entry in prompt_contents])
prompt_max_length = max(prompt_lengths)

In [None]:
# Pad every entry with all-zero rows (768 values per row) until it is as long as the
# longest entry; entries that already have the maximum length are left untouched.
tmp = [0] * 768
for idx, entry in enumerate(prompt_contents):
    missing = prompt_max_length - len(entry)
    if missing > 0:
        tmp_ = np.tile(tmp, (missing, 1))
        prompt_contents[idx] = np.append(entry, tmp_, axis=0)

In [None]:
# Stack the equally shaped (already padded) entries into one dense array with a new
# leading axis. A single np.stack call replaces the loop that re-concatenated the
# growing array once per entry: that loop copied all accumulated data on every
# iteration and left a plain Python list behind when there was only one entry.
prompt_data = np.stack(list(prompt_contents), axis=0)
prompt_contents = prompt_data

In [None]:
# Cache the stacked inputs, the labels and the pre-padding lengths as .npy files
# (np.save appends the .npy suffix); the next cell reads these files back.
np.save('/content/drive/My Drive/XGBoost Model/data/prompt_data', prompt_contents)
np.save('/content/drive/My Drive/XGBoost Model/data/prompt_label', prompt_labels)
np.save('/content/drive/My Drive/XGBoost Model/data/prompt_length', prompt_lengths)

In [None]:
# Reload the cached prompt arrays written by the np.save cell above, so the
# loading/padding cells do not have to be re-run in a fresh session.
prompt_contents = np.load('/content/drive/My Drive/XGBoost Model/data/prompt_data.npy')
prompt_labels = np.load('/content/drive/My Drive/XGBoost Model/data/prompt_label.npy')
prompt_lengths = np.load('/content/drive/My Drive/XGBoost Model/data/prompt_length.npy')

In [None]:
def batch_iter(gold_data, gold_labels, gold_lengths,
               gold_rate, batch_size, num_epochs):
    """Yield aligned, consecutive, full-size batches from four parallel sequences.

    Args:
        gold_data: Sliceable sequence of inputs.
        gold_labels: Sliceable sequence of labels, same length as ``gold_data``.
        gold_lengths: Sliceable sequence of per-sample lengths, same length.
        gold_rate: Sliceable sequence of per-sample rates, same length.
        batch_size: Number of samples per batch.
        num_epochs: How many times to walk over the data.

    Yields:
        ``(xdata, ydata, sequence_length, rate)`` — the same slice taken from each of
        the four inputs. Only ``len(gold_data) // batch_size`` batches are produced per
        epoch, so a trailing partial batch is never emitted.

    Raises:
        AssertionError: If the four inputs do not all have the same length.
    """
    assert len(gold_data) == len(gold_labels) == len(gold_lengths) == len(gold_rate)

    batches_per_epoch = len(gold_data) // batch_size

    for _epoch in range(num_epochs):
        for lower in range(0, batches_per_epoch * batch_size, batch_size):
            window = slice(lower, lower + batch_size)
            yield (gold_data[window], gold_labels[window],
                   gold_lengths[window], gold_rate[window])

Import pre-trained LSTM model

In [None]:
# Import the QWK-related package.
# The directory holding the `score` module is added to sys.path first; later cells call
# score.quadratic_weighted_kappa and score.mean_quadratic_weighted_kappa.
import sys
sys.path.append('/content/drive/My Drive/ASAPAES')
import score

In [None]:
# Semantic Model
# Restore a trained LSTM checkpoint and run it over the whole semantic set as one batch.
# Labels are min-max scaled with (min_val, max_val) before being fed; the network output
# is mapped back to the original score range before QWK and accuracy are computed.
min_val = 2
max_val = 12
score_span = max_val - min_val
inference_graph = tf.Graph()
with tf.Session(graph = inference_graph) as sess:

    graph = tf.get_default_graph()
    path = '/content/drive/My Drive/Semantic Model/runs/LSTM/'

    for i in range(3,4):
        print("No." + str(i) + " Model\n")
        model_dir = os.path.join(path, "Model_"+str(i), "model")
        lstm_model = tf.train.import_meta_graph(os.path.join(model_dir, "clf-300.meta"))
        lstm_model.restore(sess, tf.train.latest_checkpoint(model_dir))

        x_dev = semantic_contents
        y_dev = (semantic_labels - min_val) / score_span
        dev_lengths = semantic_lengths
        dev_rates = label_rates
        # Feed every sample in a single batch. Deriving the size from the data (instead
        # of a literal 1783) keeps the iterator and the batch_size placeholder in sync.
        eval_batch_size = len(x_dev)

        # The graph tensors do not change between batches, so look them up once per
        # restored model rather than once per batch.
        x_ = inference_graph.get_tensor_by_name('input_x:0')
        y_ = inference_graph.get_tensor_by_name('input_y:0')
        rate_ = inference_graph.get_tensor_by_name('rate:0')
        prediction_ = inference_graph.get_tensor_by_name('sigmoid/predictions:0')
        keep_prob_ = inference_graph.get_tensor_by_name('keep_prob:0')
        loss_ = inference_graph.get_tensor_by_name('loss/loss:0')
        accuracy_ = inference_graph.get_tensor_by_name('accuracy/accuracy:0')
        sequence_length_ = inference_graph.get_tensor_by_name('sequence_length:0')
        batch_size_ = inference_graph.get_tensor_by_name('batch_size:0')

        print('\nDevlopment Set Validation ' + str(i))
        dev_data = batch_iter(x_dev, y_dev, dev_lengths, dev_rates, eval_batch_size, 1)
        for dev_input in dev_data:
            # `vars` shadows the builtin, but the name is kept because the following
            # cells read vars[2] (the fetched predictions).
            vars = sess.run([accuracy_, loss_, prediction_],
                      feed_dict={x_: dev_input[0],
                                 y_: dev_input[1],
                                 keep_prob_: 1.0,  # keep probability 1.0: no dropout at inference
                                 sequence_length_: dev_input[2],
                                 rate_: dev_input[3],
                                 batch_size_ : eval_batch_size})
            time_str = datetime.datetime.now().isoformat()

            # Undo the min-max scaling. The gold scores are rounded as well, so that
            # floating-point error from the round trip cannot be truncated to the
            # wrong integer by astype(int).
            predicted_scores = np.round(vars[2] * score_span + min_val)
            gold_scores = np.round(dev_input[1] * score_span + min_val)
            print(predicted_scores)
            print(gold_scores)
            qwks = score.quadratic_weighted_kappa(predicted_scores.astype(int), gold_scores.astype(int), min_val, max_val)
            print("qwks",qwks," ", score.mean_quadratic_weighted_kappa([qwks]))
            # Exact-match accuracy on the rounded scores.
            acc = np.sum(predicted_scores == gold_scores) / len(vars[2])
            print("{}: loss: {:g}, accuracy: {:g}".format(time_str, vars[1], acc))
            print('End Development Set Validation ' + str(i) +'\n')

In [None]:
# Keep the predictions fetched by the most recent model run (vars[2] is the value of the
# `prediction_` tensor returned by sess.run).
# NOTE(review): the next cell stores the same `vars[2]` under the Bi-LSTM name —
# presumably the model cell is meant to be re-run with a different checkpoint in
# between; on a straight Run All both names hold identical values. Confirm.
semantic_prediction_LSTM = vars[2]

In [None]:
# Keep the predictions fetched by the most recent model run under the Bi-LSTM name.
# NOTE(review): the model cell above restores a checkpoint from the 'runs/LSTM/' path, so
# this presumably expects that cell to have been re-run against a Bi-LSTM checkpoint
# first — confirm before relying on this array.
semantic_prediction_Bi_LSTM = vars[2]

In [None]:
def batch_iter_coherence(gold_data, gold_labels, gold_lengths,
               gold_rate, batch_size, num_epochs):
    """Yield full-size batches that cover every sample, wrapping around to the start.

    Each epoch produces ``len(gold_data) // batch_size + 1`` batches of exactly
    ``batch_size`` samples. Positions past the end of the data are filled by cycling
    back to the first samples, so the tail of the last batch repeats earlier samples
    (callers are expected to discard those extra predictions).

    Wrap-around is done with modulo indexing, so every batch has exactly
    ``batch_size`` samples even when the data set is much smaller than one batch
    (padding with only ``gold_data[0:batch_size]`` falls short when
    ``2 * len(gold_data) < batch_size``).

    Args:
        gold_data: Array-like of inputs.
        gold_labels: Array-like of labels, same length as ``gold_data``.
        gold_lengths: Array-like of per-sample lengths, same length.
        gold_rate: Array-like of per-sample rates, same length.
        batch_size: Number of samples per batch.
        num_epochs: How many times to walk over the data.

    Yields:
        ``(xdata, ydata, sequence_length, rate)`` as NumPy arrays, all selected with
        the same indices. Nothing is yielded when the inputs are empty.

    Raises:
        AssertionError: If the four inputs do not all have the same length.
    """
    assert len(gold_data) == len(gold_labels) == len(gold_lengths) == len(gold_rate)
    data_size = len(gold_data)
    if data_size == 0:
        # Nothing to cycle through (and the modulo below would divide by zero).
        return

    gold_data = np.asarray(gold_data)
    gold_labels = np.asarray(gold_labels)
    gold_lengths = np.asarray(gold_lengths)
    gold_rate = np.asarray(gold_rate)

    epoch_length = data_size // batch_size + 1
    # Sample index for every slot of every batch in one epoch, cycling over the data.
    wrapped_index = np.arange(epoch_length * batch_size) % data_size

    for _ in range(num_epochs):
        for i in range(epoch_length):
            batch_index = wrapped_index[i * batch_size: (i + 1) * batch_size]

            xdata = gold_data[batch_index]
            ydata = gold_labels[batch_index]
            sequence_length = gold_lengths[batch_index]
            rate = gold_rate[batch_index]

            yield xdata, ydata, sequence_length, rate

In [None]:
# Coherence Model
# Restore a trained Bi-LSTM checkpoint and run it over the semantic set in fixed-size
# batches. The predictions of every batch are appended to `res`; because
# batch_iter_coherence wraps around, `res` ends up longer than the data set.
min_val = 0
max_val = 12
# One constant for both the batch iterator and the graph's batch_size placeholder,
# so the two values cannot drift apart.
coherence_batch_size = 256
inference_graph = tf.Graph()
res = np.array([])
with tf.Session(graph = inference_graph) as sess:

    graph = tf.get_default_graph()
    path = '/content/drive/My Drive/Coherence Model/runs/Bi-LSTM/'

    for i in range(2,3):
        print("No." + str(i) + " Model\n")
        model_dir = os.path.join(path, "Model_"+str(i), "model")
        lstm_model = tf.train.import_meta_graph(os.path.join(model_dir, "clf-500.meta"))
        lstm_model.restore(sess, tf.train.latest_checkpoint(model_dir))

        x_dev = semantic_contents
        y_dev = semantic_labels
        dev_lengths = semantic_lengths
        dev_rates = label_rates

        # The graph tensors do not change between batches, so look them up once per
        # restored model rather than once per batch.
        x_ = inference_graph.get_tensor_by_name('input_x:0')
        y_ = inference_graph.get_tensor_by_name('input_y:0')
        rate_ = inference_graph.get_tensor_by_name('rate:0')
        prediction_ = inference_graph.get_tensor_by_name('sigmoid/predictions:0')
        keep_prob_ = inference_graph.get_tensor_by_name('keep_prob:0')
        loss_ = inference_graph.get_tensor_by_name('loss/loss:0')
        accuracy_ = inference_graph.get_tensor_by_name('accuracy/accuracy:0')
        sequence_length_ = inference_graph.get_tensor_by_name('sequence_length:0')
        batch_size_ = inference_graph.get_tensor_by_name('batch_size:0')

        print('\nDevlopment Set Validation ' + str(i))
        dev_data = batch_iter_coherence(x_dev, y_dev, dev_lengths, dev_rates, coherence_batch_size, 1)
        for dev_input in dev_data:
            # `vars` shadows the builtin; the name is kept because other cells of this
            # notebook read vars[2] after a model run.
            vars = sess.run([accuracy_, loss_, prediction_],
                      feed_dict={x_: dev_input[0],
                                 y_: dev_input[1],
                                 keep_prob_: 1.0,  # keep probability 1.0: no dropout at inference
                                 sequence_length_: dev_input[2],
                                 rate_: dev_input[3],
                                 batch_size_ : coherence_batch_size})
            time_str = datetime.datetime.now().isoformat()
            res = np.concatenate((res, vars[2]), axis = 0)
            print(vars[2])
            print (dev_input[1])
            # Per-batch metrics; the wrapped-around final batch contains repeated samples.
            qwks = score.quadratic_weighted_kappa((vars[2]).astype(int), dev_input[1], 2, 12)
            print("qwks",qwks," ", score.mean_quadratic_weighted_kappa([qwks]))
            acc = np.sum((vars[2] - dev_input[1]) == 0) / len(vars[2])
            print("{}: loss: {:g}, accuracy: {:g}".format(time_str, vars[1], acc))
            print('End Development Set Validation ' + str(i) +'\n')

In [None]:
# `res` holds the predictions of every batch produced by batch_iter_coherence, including
# the wrapped-around samples at the end. Keep only one prediction per real sample; the
# count is taken from the labels instead of a hard-coded 1783 so it follows the data.
coherence_prediction_Bi_LSTM = res[0:len(semantic_labels)]
qwks = score.quadratic_weighted_kappa(coherence_prediction_Bi_LSTM.astype(int), semantic_labels.astype(int), 2, 12)
print("qwks", qwks, " ", score.mean_quadratic_weighted_kappa([qwks]))
# Exact-match accuracy over the whole set.
acc = np.sum((coherence_prediction_Bi_LSTM - semantic_labels) == 0) / len(semantic_labels)
print("acc ", acc)

In [None]:
# `res` holds the predictions of every batch produced by batch_iter_coherence, including
# the wrapped-around samples at the end. Keep only one prediction per real sample; the
# count is taken from the labels instead of a hard-coded 1783 so it follows the data.
# NOTE(review): this reads the same `res` as the Bi-LSTM cell above — presumably the
# coherence model cell is meant to be re-run with an LSTM checkpoint first; confirm.
coherence_prediction_LSTM = res[0:len(semantic_labels)]
qwks = score.quadratic_weighted_kappa(coherence_prediction_LSTM.astype(int), semantic_labels.astype(int), 2, 12)
print("qwks", qwks, " ", score.mean_quadratic_weighted_kappa([qwks]))
# Exact-match accuracy over the whole set.
acc = np.sum((coherence_prediction_LSTM - semantic_labels) == 0) / len(semantic_labels)
print("acc ", acc)

In [None]:
def batch_iter_prompt(gold_data, gold_labels, gold_lengths, batch_size, num_epochs):
    """Yield aligned, consecutive, full-size batches from three parallel sequences.

    Args:
        gold_data: Sliceable sequence of inputs.
        gold_labels: Sliceable sequence of labels, same length as ``gold_data``.
        gold_lengths: Sliceable sequence of per-sample lengths, same length.
        batch_size: Number of samples per batch.
        num_epochs: How many times to walk over the data.

    Yields:
        ``(xdata, ydata, sequence_length)`` — the same slice taken from each of the
        three inputs. Only ``len(gold_data) // batch_size`` batches are produced per
        epoch, so a trailing partial batch is never emitted.

    Raises:
        AssertionError: If the three inputs do not all have the same length.
    """
    assert len(gold_data) == len(gold_labels) == len(gold_lengths)

    batches_per_epoch = len(gold_data) // batch_size

    for _epoch in range(num_epochs):
        for lower in range(0, batches_per_epoch * batch_size, batch_size):
            window = slice(lower, lower + batch_size)
            yield gold_data[window], gold_labels[window], gold_lengths[window]

In [None]:
# Prompt-related Model
# Restore a trained Bi-LSTM checkpoint and run it over the whole prompt set as one batch.
# The predictions are appended to `res`.
min_val = 0
max_val = 12
inference_graph = tf.Graph()
res = np.array([])
with tf.Session(graph = inference_graph) as sess:

    graph = tf.get_default_graph()
    path = '/content/drive/My Drive/Prompt-related Model/runs/Bi-LSTM/'

    for i in range(3,4):
        print("No." + str(i) + " Model\n")
        model_dir = os.path.join(path, "Model_"+str(i), "model")
        lstm_model = tf.train.import_meta_graph(os.path.join(model_dir, "clf-500.meta"))
        lstm_model.restore(sess, tf.train.latest_checkpoint(model_dir))

        x_dev = prompt_contents
        y_dev = prompt_labels
        # Sequence lengths must describe the inputs actually being fed, i.e. the prompt
        # data. (The semantic lengths were used here before, which only matches when
        # both data sets happen to have identical per-sample lengths.)
        dev_lengths = prompt_lengths
        # Feed every sample in a single batch. Deriving the size from the data (instead
        # of a literal 1783) keeps the iterator and the batch_size placeholder in sync.
        eval_batch_size = len(x_dev)

        # The graph tensors do not change between batches, so look them up once per
        # restored model rather than once per batch.
        x_ = inference_graph.get_tensor_by_name('input_x:0')
        y_ = inference_graph.get_tensor_by_name('input_y:0')
        prediction_ = inference_graph.get_tensor_by_name('sigmoid/predictions:0')
        keep_prob_ = inference_graph.get_tensor_by_name('keep_prob:0')
        loss_ = inference_graph.get_tensor_by_name('loss/loss:0')
        accuracy_ = inference_graph.get_tensor_by_name('accuracy/accuracy:0')
        sequence_length_ = inference_graph.get_tensor_by_name('sequence_length:0')
        batch_size_ = inference_graph.get_tensor_by_name('batch_size:0')

        print('\nDevlopment Set Validation ' + str(i))
        dev_data = batch_iter_prompt(x_dev, y_dev, dev_lengths, eval_batch_size, 1)
        # The whole loop body is indented one level below the `for`; the previous mixed
        # indentation did not match any enclosing block and could not be parsed.
        for dev_input in dev_data:
            # `vars` shadows the builtin; the name is kept because other cells of this
            # notebook read vars[2] after a model run.
            vars = sess.run([accuracy_, loss_, prediction_],
                      feed_dict={x_: dev_input[0],
                                 y_: dev_input[1],
                                 keep_prob_: 1.0,  # keep probability 1.0: no dropout at inference
                                 sequence_length_: dev_input[2],
                                 batch_size_ : eval_batch_size})
            time_str = datetime.datetime.now().isoformat()
            res = np.concatenate((res, vars[2]), axis = 0)
            print(vars[2])
            print (dev_input[1])
            qwks = score.quadratic_weighted_kappa((vars[2]).astype(int), dev_input[1], 2, 12)
            print("qwks",qwks," ", score.mean_quadratic_weighted_kappa([qwks]))
            # Exact-match accuracy for this batch.
            acc = np.sum((vars[2] - dev_input[1]) == 0) / len(vars[2])
            print("{}: loss: {:g}, accuracy: {:g}".format(time_str, vars[1], acc))
            print('End Development Set Validation ' + str(i) +'\n')

In [None]:
# Keep the predictions accumulated in `res` by the most recent prompt-model run.
# NOTE(review): the model cell above restores a checkpoint from the 'runs/Bi-LSTM/' path,
# and the next cell stores the same `res` under the Bi-LSTM name — presumably the model
# cell is meant to be re-run with an LSTM checkpoint before this line; confirm.
prompt_prediction_LSTM = res

In [None]:
# Keep the predictions accumulated in `res` by the most recent prompt-model run under
# the Bi-LSTM name.
# NOTE(review): `res` is whatever the last executed model cell produced; make sure that
# was the prompt-related Bi-LSTM run and not the coherence cell, which also writes `res`.
prompt_prediction_Bi_LSTM = res

In [None]:
# Persist the six prediction arrays (prompt / semantic / coherence, each for the LSTM and
# the Bi-LSTM variant) as .npy files so the XGBoost part can run without TensorFlow.
# NOTE(review): all six names must have been assigned in this session; several of them
# are only distinct if the model cells were re-run with different checkpoints.
np.save('/content/drive/My Drive/XGBoost Model/data/prompt_prediction_Bi_LSTM', prompt_prediction_Bi_LSTM)
np.save('/content/drive/My Drive/XGBoost Model/data/prompt_prediction_LSTM', prompt_prediction_LSTM )
np.save('/content/drive/My Drive/XGBoost Model/data/semantic_prediction_Bi_LSTM', semantic_prediction_Bi_LSTM )
np.save('/content/drive/My Drive/XGBoost Model/data/semantic_prediction_LSTM', semantic_prediction_LSTM )
np.save('/content/drive/My Drive/XGBoost Model/data/coherence_prediction_Bi_LSTM', coherence_prediction_Bi_LSTM )
np.save('/content/drive/My Drive/XGBoost Model/data/coherence_prediction_LSTM', coherence_prediction_LSTM )

In [None]:
# Reload the six cached prediction arrays written by the np.save cell above.
prompt_prediction_Bi_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/prompt_prediction_Bi_LSTM.npy')
prompt_prediction_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/prompt_prediction_LSTM.npy')
semantic_prediction_Bi_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/semantic_prediction_Bi_LSTM.npy')
semantic_prediction_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/semantic_prediction_LSTM.npy')
coherence_prediction_Bi_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/coherence_prediction_Bi_LSTM.npy')
coherence_prediction_LSTM = np.load('/content/drive/My Drive/XGBoost Model/data/coherence_prediction_LSTM.npy')

In [None]:
# Load the gold labels and subtract 2 from each — presumably to map the 2..12 score range
# onto the 0..10 class indices expected by the XGBoost objective (num_class = 11).
# NOTE(review): this path has no 'Graduate Thesis/' component, unlike the path that
# `semantic_labels` was loaded from earlier — confirm both point at the same file.
labels = np.load('/content/drive/My Drive/Semantic Model/word embedding/set1_label.npy')
labels = labels - 2

In [None]:
# prompt_prediction_Bi_LSTM = (prompt_prediction_Bi_LSTM - min(prompt_prediction_Bi_LSTM)) / (max(prompt_prediction_Bi_LSTM) - min(prompt_prediction_Bi_LSTM))
# semantic_prediction_Bi_LSTM = (semantic_prediction_Bi_LSTM - min(semantic_prediction_Bi_LSTM)) / (max(semantic_prediction_Bi_LSTM) - min(semantic_prediction_Bi_LSTM))
# coherence_prediction_Bi_LSTM = (coherence_prediction_Bi_LSTM - min(coherence_prediction_Bi_LSTM)) / (max(coherence_prediction_Bi_LSTM) - min(coherence_prediction_Bi_LSTM))

In [None]:
# prompt_prediction_Bi_LSTM = (prompt_prediction_Bi_LSTM - 2) / (12 - 2)
# semantic_prediction_Bi_LSTM = semantic_prediction_Bi_LSTM * 10 + 2
# coherence_prediction_Bi_LSTM = (coherence_prediction_Bi_LSTM - 2) / (12 - 2)
# Map the semantic LSTM output with x * 10 + 2, the inverse of the (label - 2) / (12 - 2)
# scaling applied to the labels fed to the semantic model.
# NOTE(review): this rebinds its own input, so running the cell twice rescales an
# already rescaled array — run it exactly once after (re)loading the predictions.
semantic_prediction_LSTM = semantic_prediction_LSTM * 10 + 2

In [None]:
# Turn each Bi-LSTM prediction vector into a single-column matrix (one row per sample)
# so it can be concatenated with the feature matrix along axis 1.
prompt_prediction_Bi_LSTM = np.reshape(prompt_prediction_Bi_LSTM, (prompt_prediction_Bi_LSTM.shape[0], 1))
semantic_prediction_Bi_LSTM = np.reshape(semantic_prediction_Bi_LSTM, (semantic_prediction_Bi_LSTM.shape[0], 1))
coherence_prediction_Bi_LSTM = np.reshape(coherence_prediction_Bi_LSTM, (coherence_prediction_Bi_LSTM.shape[0], 1))

In [None]:
# Turn each LSTM prediction vector into a single-column matrix (one row per sample)
# so it can be concatenated with the feature matrix along axis 1.
prompt_prediction_LSTM = np.reshape(prompt_prediction_LSTM, (prompt_prediction_LSTM.shape[0], 1))
semantic_prediction_LSTM = np.reshape(semantic_prediction_LSTM, (semantic_prediction_LSTM.shape[0], 1))
coherence_prediction_LSTM = np.reshape(coherence_prediction_LSTM, (coherence_prediction_LSTM.shape[0], 1))

In [None]:
# Build the feature matrix: the normalized hand-crafted features followed by three
# model-prediction columns, each shifted by -2.
# NOTE(review): the result is named *_Bi_LSTM but is built from the *_LSTM prediction
# arrays, and the next cell overwrites it with `hf` alone — confirm which feature set
# is intended before training.
final_data_Bi_LSTM = np.concatenate((hf, prompt_prediction_LSTM - 2, semantic_prediction_LSTM - 2, coherence_prediction_LSTM - 2), axis = 1)

In [None]:
# Use only the hand-crafted features as the feature matrix.
# NOTE(review): this replaces the matrix built in the previous cell, so on a straight
# Run All the model predictions are not part of the XGBoost input — presumably an
# ablation run; skip this cell to train on the combined features.
final_data_Bi_LSTM = hf

In [None]:
import xgboost as xgb
# One fifth of the sample count, rounded down; the train/test split is expressed in
# multiples of this unit.
unit = len(final_data_Bi_LSTM) // 5

Use XGBoost

In [None]:
# 80% training set, 20% test set
# No development split is made here; dev_dataset / dev_label stay empty placeholders.
dev_dataset = []
dev_label = []

# The first four fifths of the samples are used for training.
split_at = 4 * unit
train_dataset = final_data_Bi_LSTM[:split_at]
train_label = labels[:split_at]

# Everything after the split point is the test set. The slice runs to the end of the
# arrays: an upper bound of -1 would silently drop the last sample.
test_dataset = final_data_Bi_LSTM[split_at:]
test_label = labels[split_at:]

In [None]:
# Display the shape of the first training row, i.e. the number of features per sample.
train_dataset[0].shape

In [None]:
# Wrap the splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(train_dataset, train_label)
dtest = xgb.DMatrix(test_dataset, test_label)
# With early stopping, XGBoost monitors the LAST entry of the evaluation list, so the
# held-out set must come last; otherwise training would stop based on the training error.
eval_list = [(dtrain, 'train'), (dtest, 'eval')]
# Parameter names are given without the 'bst:' prefix: 'bst: max_depth' and 'bst:eta'
# are not recognised parameter names, so those settings would not be applied.
# NOTE(review): eta = 0.001 now actually takes effect; with only 200 rounds this is a
# very small learning rate — confirm it is the intended value.
param = {'booster': 'gbtree', 'max_depth': 6, 'eta': 0.001, 'objective': 'multi:softmax', 'num_class': 11}
# Maximum number of boosting rounds.
epoch_nums = 200

In [None]:
# Train with early stopping after 10 rounds without improvement on the monitored set.
# The parameter dict is passed as-is: xgb.train expects a dict (or list of pairs), and a
# dict_items view is neither, which hides keys such as 'num_class' from key lookups and
# is rejected by newer XGBoost releases.
bst = xgb.train(param, dtrain, epoch_nums, eval_list, early_stopping_rounds=10)

In [None]:
# Predict the class of every test sample with the best model found by early stopping.
test = xgb.DMatrix(test_dataset)
# best_iteration is a 0-based round index, whereas ntree_limit is a tree COUNT, so using
# it directly is off by one — and a value of 0 would mean "use all trees".
# best_ntree_limit is the matching count set by xgb.train.
ypred = bst.predict(test, ntree_limit=bst.best_ntree_limit)

In [None]:
# Exact-match accuracy on the test set, with predictions and labels both shifted back
# by +2 before they are compared.
predicted_scores = ypred + 2
gold_scores = test_label + 2
np.sum(predicted_scores == gold_scores) / len(test_label)

In [None]:
# Quadratic weighted kappa between the predicted and the gold test scores, both shifted
# back by +2. The trailing 2 and 12 are presumably the minimum and maximum rating —
# confirm against score.quadratic_weighted_kappa.
score.quadratic_weighted_kappa((ypred+2).astype(int), test_label+2, 2, 12)

In [None]:
import matplotlib.pyplot as plt
# Scatter the semantic Bi-LSTM predictions (x) against the gold scores (y) as small
# red dots.
fig, ax = plt.subplots()
ax.plot(semantic_prediction_Bi_LSTM, labels+2, 'ro', markersize=1)
plt.show()

In [None]:
# Display the shape of the prediction array returned by bst.predict.
ypred.shape

In [None]:
# Save the trained booster. The path is relative, so the file is written to the current
# working directory (set with os.chdir in the import cell).
bst.save_model('xgboost_01.model')