In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the data
# files stored on Drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the Drive root: the later cells open their data files with paths
# relative to this directory (e.g. "Coherence Model/F_text.txt").
import os
os.chdir('/content/drive/My Drive')

In [None]:
import tensorflow as tf
from bert import bert_tokenization as tokenization
import os
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
import numpy as np
import re

# Load data
# Character files: one record per line, fields separated by " +++$+++ ".
# Only the first field of each record is kept (used below as the speaker key).
f_chars_file = "Coherence Model/F_text.txt"
with open(f_chars_file, "r") as f:
    f_chars_id = [record.split(" +++$+++ ")[0] for record in f]

m_chars_file = "Coherence Model/M_text.txt"
with open(m_chars_file, "r") as f:
    m_chars_id = [record.split(" +++$+++ ")[0] for record in f]

# Membership is tested once per dialogue line, so use sets: a list lookup
# would rescan every id for every line.
f_chars_lookup = set(f_chars_id)
m_chars_lookup = set(m_chars_id)

f_lines = []
m_lines = []
lines_file = "Colab Notebooks/lines.txt"

with open(lines_file, "r") as f:
    for record in f:
        fields = record.split(" +++$+++ ")
        if len(fields) < 5:
            # Blank or truncated record: there is no text field to read.
            continue
        k = fields[1]          # speaker key, matched against the id lists
        v = fields[4].strip()  # utterance text
        # Drop utterances longer than 128 whitespace-separated words.
        if len(v.split()) > 128:
            continue
        # The female list takes precedence when a key appears in both files.
        if k in f_chars_lookup:
            f_lines.append(v.lower())
        elif k in m_chars_lookup:
            m_lines.append(v.lower())

# Load Pre-Trained BERT Model via TF 2.0
# Prepare Data
# Male lines are placed first, female lines after them.
lines = m_lines + f_lines
size = len(lines)
max_seq_length = 130

# The three int32 inputs fed to the hub layer, each of length max_seq_length.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1",
    trainable=True,
)
pooled_output, sequence_output = bert_layer(
    [input_word_ids, input_mask, segment_ids]
)

# Expose both outputs of the hub layer so a single predict() returns the
# pooled vector and the per-token sequence output.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)

# Build the tokenizer from the vocabulary and casing flag shipped with the
# hub module.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)



def get_masks(tokens, max_seq_length):
    """Return the padding mask: 1 for every token, 0 for every padding slot.

    The result always has ``max_seq_length`` entries.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * (max_seq_length - n_tokens))
    return mask


def get_segments(tokens, max_seq_length):
    """Return segment ids padded with 0 to ``max_seq_length``.

    Every token up to and including the first "[SEP]" gets id 0; every token
    after it gets id 1.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segment_ids = []
    past_first_sep = False
    for tok in tokens:
        segment_ids.append(1 if past_first_sep else 0)
        # The separator itself still belongs to the first segment.
        if tok == "[SEP]":
            past_first_sep = True
    padding = [0] * (max_seq_length - n_tokens)
    return segment_ids + padding


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and right-pad with 0 to max_seq_length.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This
            mirrors ``get_masks`` / ``get_segments``; without the check an
            over-long list was returned silently.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

# Embed every line with BERT, then save the features and gender labels.
#
# `lines` is m_lines + f_lines, so the position in `lines` identifies the
# source list. Labelling by text membership mislabels any male line whose
# text also occurs in f_lines, and rescans the lists on every iteration.
n_male = len(m_lines)

# Collect per-line outputs in lists and stack once at the end: np.append
# copies the whole array on every call, and without `axis` it also flattens
# the result, losing the per-line boundaries.
pooled_batches = []
sequence_batches = []
labels = []

# NOTE(review): the per-token output is max_seq_length x hidden-size floats
# per line; holding it for the whole corpus may exceed memory. Confirm.
for i in range(size):
    line = lines[i]
    labels.append(0 if i < n_male else 1)  # 0 = male, 1 = female

    # The 128-word filter applied at load time does not bound the token
    # count, so truncate to leave room for [CLS] and [SEP]; the helpers
    # raise IndexError otherwise.
    token = tokenizer.tokenize(line)[:max_seq_length - 2]
    token = ["[CLS]"] + token + ["[SEP]"]

    input_id = get_ids(token, tokenizer, max_seq_length)
    # Named token_mask so the Keras `input_mask` tensor is not overwritten.
    token_mask = get_masks(token, max_seq_length)
    input_segment = get_segments(token, max_seq_length)

    # The model outputs are [pooled_output, sequence_output]: seq_data is the
    # pooled vector, word_data the per-token output.
    seq_data, word_data = model.predict([[input_id], [token_mask], [input_segment]])
    pooled_batches.append(seq_data)
    sequence_batches.append(word_data)

if size:
    train_data_seq = np.concatenate(pooled_batches, axis=0)
    train_data_word = np.concatenate(sequence_batches, axis=0)
else:
    train_data_seq = np.array([])
    train_data_word = np.array([])
# float64 keeps the dtype that the previous np.append-based labels had.
train_label = np.array(labels, dtype=np.float64)

train_data_seq = np.array(train_data_seq)
train_data_word = np.array(train_data_word)
train_labels = np.array(train_label)
np.save("data_seq.npy", train_data_seq)
np.save("data_word.npy", train_data_word)
np.save("label.npy", train_labels)

In [None]:
# Pin TensorFlow to the 2.0 release.
# NOTE(review): this cell sits below cells that already import tensorflow; on
# a fresh kernel it has to run before them.
!pip install tensorflow==2.0

In [None]:
# NOTE(review): version is not pinned — consider pinning for reproducibility.
!pip install sentencepiece

In [None]:
# Presumably provides the `bert` package imported by the other cells — confirm.
# NOTE(review): version is not pinned — consider pinning for reproducibility.
!pip install bert-for-tf2

In [None]:
# import bert
# from bert import modeling
import tensorflow as tf
from bert import bert_tokenization as tokenization
import os
import numpy as np
import nltk
import nltk.tokenize as tk
import pandas as pd
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
# Fetch the NLTK 'punkt' resource; presumably needed by the tk.sent_tokenize
# calls further down — confirm.
nltk.download('punkt')

In [None]:
# Character files: one record per line, fields separated by " +++$+++ ".
# Only the first field of each record is kept (used below as the speaker key).
f_chars_file = "Coherence Model/F_text.txt"
with open(f_chars_file, "r") as f:
    f_chars_id = [record.split(" +++$+++ ")[0] for record in f]

m_chars_file = "Coherence Model/M_text.txt"
with open(m_chars_file, "r") as f:
    m_chars_id = [record.split(" +++$+++ ")[0] for record in f]

# Set lookups avoid rescanning the whole id list for every dialogue line.
f_chars_lookup = set(f_chars_id)
m_chars_lookup = set(m_chars_id)

f_lines = []
m_lines = []
lines_file = "Colab Notebooks/lines.txt"
with open(lines_file, "r") as f:
    for record in f:
        fields = record.split(" +++$+++ ")
        if len(fields) < 5:
            # Blank or truncated record: there is no text field to read.
            continue
        k = fields[1]          # speaker key, matched against the id lists
        v = fields[4].strip()  # utterance text
        # Drop utterances longer than 128 whitespace-separated words.
        if len(v.split()) > 128:
            continue
        # The female list takes precedence when a key appears in both files.
        if k in f_chars_lookup:
            f_lines.append(v.lower())
        elif k in m_chars_lookup:
            m_lines.append(v.lower())

In [None]:
# Fixed input length for the BERT layer.
max_seq_length = 128

# The three int32 inputs fed to the hub layer, each of length max_seq_length.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1",
    trainable=True,
)
pooled_output, sequence_output = bert_layer(
    [input_word_ids, input_mask, segment_ids]
)

In [None]:
from tensorflow.keras.models import Model 

In [None]:
# Wrap the hub layer so one predict() call returns both the pooled vector and
# the per-token sequence output.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)

In [None]:
# Build the tokenizer from the vocabulary and casing flag shipped with the
# hub module.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Full corpus: male lines first, female lines after them.
lines = [*m_lines, *f_lines]

In [None]:
# Number of lines in the combined corpus; the extraction loops iterate over
# range(size).
size = len(lines)

In [None]:
# Reset the accumulators and derive the split sizes: `unit` is one fifth of
# the corpus (rounded down) and `test_size` is what remains after four units.
train_data, train_label = [], []
max_seq_length = 128
unit = size // 5
test_size = size - 4 * unit

In [None]:
def get_masks(tokens, max_seq_length):
    """Padding mask of length ``max_seq_length``: ones for tokens, zeros after.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    pad_count = max_seq_length - len(tokens)
    if pad_count < 0:
        raise IndexError("Token length more than max seq length!")
    return [1 for _ in range(len(tokens))] + [0 for _ in range(pad_count)]


def get_segments(tokens, max_seq_length):
    """Segment ids padded with 0 to ``max_seq_length``.

    Tokens up to and including the first "[SEP]" are segment 0; tokens after
    it are segment 1.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    # Position of the first separator; without one everything is segment 0.
    boundary = next(
        (pos for pos, tok in enumerate(tokens) if tok == "[SEP]"),
        n_tokens - 1,
    )
    segments = [0 if pos <= boundary else 1 for pos in range(n_tokens)]
    return segments + [0] * (max_seq_length - n_tokens)


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and right-pad with 0 to max_seq_length.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This
            mirrors ``get_masks`` / ``get_segments``; without the check an
            over-long list was returned silently.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [None]:
# Smoke test: run one hand-written sentence through the tokenizer, the three
# input builders and the model.
s = "This is a nice sentence."
stokens = ["[CLS]", *tokenizer.tokenize(s), "[SEP]"]

input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

# Each feature list is wrapped once more to form a batch of one.
pool_embs, all_embs = model.predict(
    [[input_ids], [input_masks], [input_segments]]
)


In [None]:
# Scratch check of np.append: it returns a new array (displayed as the cell
# output) and leaves `a` unchanged.
a = np.array([1])
np.append(a, [2])

In [None]:
# Embed every line with BERT and collect the features and gender labels.
#
# `lines` is m_lines + f_lines, so the position in `lines` identifies the
# source list. Labelling by text membership mislabels any male line whose
# text also occurs in f_lines, and rescans the lists on every iteration.
n_male = len(m_lines)

# Collect per-line outputs in lists and stack once at the end: np.append
# copies the whole accumulated array on every call.
pooled_batches = []
sequence_batches = []
labels = []

# NOTE(review): the per-token output is max_seq_length x hidden-size floats
# per line; holding it for the whole corpus may exceed memory. Confirm.
for i in range(size):
    if i % 1000 == 0:
        print("No.", i / 1000)
    line = lines[i]
    labels.append(0 if i < n_male else 1)  # 0 = male, 1 = female

    # The 128-word filter applied at load time does not bound the token
    # count, so truncate to leave room for [CLS] and [SEP]; the helpers
    # raise IndexError otherwise.
    token = tokenizer.tokenize(line)[:max_seq_length - 2]
    token = ["[CLS]"] + token + ["[SEP]"]

    input_id = get_ids(token, tokenizer, max_seq_length)
    # Named token_mask so the Keras `input_mask` tensor is not overwritten.
    token_mask = get_masks(token, max_seq_length)
    input_segment = get_segments(token, max_seq_length)

    # The model outputs are [pooled_output, sequence_output]: seq_data is the
    # pooled vector, word_data the per-token output.
    seq_data, word_data = model.predict([[input_id], [token_mask], [input_segment]])
    pooled_batches.append(seq_data)
    sequence_batches.append(word_data)

if size:
    train_data_seq = np.concatenate(pooled_batches, axis=0)
    train_data_word = np.concatenate(sequence_batches, axis=0)
else:
    train_data_seq = np.array([])
    train_data_word = np.array([])
# float64 keeps the dtype that the previous np.append-based labels had.
train_label = np.array(labels, dtype=np.float64)

In [None]:
# Sanity check: length of the first row of the collected pooled features.
len(train_data_seq[0])

In [None]:
# Shape of the second model output from the single-sentence example cell.
all_embs.shape

In [None]:
# NOTE(review): this cell uses the TF1 graph API (tf.placeholder, tf.Session)
# and the `modeling` module, whose import is commented out in the imports
# cell, while the notebook installs tensorflow==2.0. Confirm the intended
# environment before running.

# BERT configuration
bert_config = modeling.BertConfig.from_json_file("/content/drive/My Drive/BERT_uncased_model/bert_config.json")

# Create BERT's inputs
input_ids = tf.placeholder(shape=[None, max_seq_length], dtype=tf.int32, name="input_ids")
input_mask = tf.placeholder(shape=[None, max_seq_length], dtype=tf.int32, name="input_mask")
segment_ids = tf.placeholder(shape=[None, max_seq_length], dtype=tf.int32, name="segment_ids")

# Build the BERT model. This cell only extracts features, so is_training must
# be False: with True, dropout stays active and the embeddings are noisy and
# not reproducible.
model = modeling.BertModel(
    config = bert_config,
    is_training = False,
    input_ids = input_ids,
    input_mask = input_mask,
    token_type_ids = segment_ids,
    use_one_hot_embeddings = False
)

init_checkpoint = "/content/drive/My Drive/BERT_uncased_model/bert_model.ckpt"
use_tpu = False

tvars = tf.trainable_variables()

(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
                                                                                       init_checkpoint)

# Make the variable initializers load their values from the checkpoint.
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

encoder_last2_layer = model.all_encoder_layers[-2]
encoder_last3_layer = model.all_encoder_layers[-3]

# `lines` is m_lines + f_lines, so the position in `lines` identifies the
# source list. A text-membership test mislabels male lines whose text also
# occurs in f_lines.
n_male = len(m_lines)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_data = []
    train_label = []
    for i in range(0, size):
        text = lines[i]
        queries = tk.sent_tokenize(text)
        embeddings = []
        # Padding and the forward pass belong inside this loop. Outside it,
        # only the last sentence of each text was embedded, and a text with
        # no sentences reused the previous text's ids.
        for query in queries:
            # Truncate so [CLS] + tokens + [SEP] fits the fixed-length
            # placeholders.
            split_tokens = tokenizer.tokenize(query)[:max_seq_length - 2]
            tokens = ["[CLS]"] + split_tokens + ["[SEP]"]
            word_ids = tokenizer.convert_tokens_to_ids(tokens)
            word_mask = [1] * len(word_ids)
            word_segment_ids = [0] * len(word_ids)

            # Right-pad all three features with zeros to max_seq_length.
            padding = [0] * (max_seq_length - len(word_ids))
            word_ids = word_ids + padding
            word_mask = word_mask + padding
            word_segment_ids = word_segment_ids + padding
            fd = {input_ids: [word_ids], input_mask: [word_mask], segment_ids: [word_segment_ids]}

            last2, last3 = sess.run([encoder_last2_layer, encoder_last3_layer], feed_dict=fd)

            # Sum the two encoder layers, then average over all
            # max_seq_length positions (padding positions included).
            embedding = last2[0] + last3[0]
            embedding = np.sum(embedding, axis=0)
            embedding = embedding / max_seq_length
            embeddings.append(embedding)

        # train_data[i] holds one vector per sentence of line i.
        # The vector width is the hidden size of the loaded checkpoint; the
        # original note said 768 (12-layer, 768-hidden, 12-heads, 110M
        # parameters). TODO confirm against bert_config.json.
        train_data.append(embeddings)

        # NOTE(review): female = 0 / male = 1 here is the opposite of the
        # hub-based extraction cells (female = 1), yet both write label.npy.
        # Confirm which convention downstream code expects.
        train_label.append(0 if i >= n_male else 1)

In [None]:
# Convert the collected features and labels to arrays and write them to the
# working directory.
train_dataset = np.array(train_data)
train_labels = np.array(train_label)
for npy_path, payload in (("data.npy", train_dataset), ("label.npy", train_labels)):
    np.save(npy_path, payload)

In [None]:
# Print each sentence of one text followed by its tokenizer token count.
# NOTE(review): `set_data` is not defined in any visible cell; presumably
# left over from another notebook. Confirm before running.
text = set_data['essay'][1114]
query = tk.sent_tokenize(text)
for q in query:
    print (q)
    print (len(tokenizer.tokenize(q)))