In [0]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn

In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (a fresh mount is forced on every run).
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point, force_remount=True)

In [0]:
!unzip -qq "/content/gdrive/My Drive/20182_DOAN/universal_embed/data_labels.zip"

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
# Path of the TF-Hub module on the mounted Drive. Presumably a local copy of
# "https://tfhub.dev/google/universal-sentence-encoder-large/3" (the URL the
# original author left next to this path) -- confirm.
module_url = '/content/gdrive/My Drive/20182_DOAN/universal_embed/model_tf_hub'

# Instantiate the Universal Sentence Encoder's TF Hub module.
embed = hub.Module(module_url)

In [0]:
import os
import re 
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras import layers
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

# Checkpoint callback handed to model.fit in the training cell. With
# save_best_only=True Keras only rewrites the file when the monitored
# quantity improves.
checkpoint_path = '/content/gdrive/My Drive/20182_DOAN/universal_embed/weights_20_5_v1.hdf5'
checkpointer = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_best_only=True,
)

def get_dataframe(filedir):
    """Load every file in `filedir` into a (label, text) DataFrame.

    Each file holds one or more documents separated by a line containing
    only "####". Inside a document every non-blank line has the form
    "<digit> <text>": the first character is parsed as the integer label and
    everything from index 2 onwards is the sentence. A detached " 's" is
    re-attached to the previous token ("it 's" -> "it's").

    Parameters
    ----------
    filedir : str
        Directory whose entries are all read as text files.

    Returns
    -------
    pandas.DataFrame
        Columns 'label' (categorical) and 'text' (str), one row per line.

    Raises
    ------
    ValueError
        If a non-blank line does not start with a digit.
    """
    # List the directory once and reuse it for both the count and the loop.
    listfilenames = os.listdir(filedir)
    print(len(listfilenames))

    data = []
    for filename in listfilenames:
        # The context manager closes the handle even if parsing fails later.
        with open(os.path.join(filedir, filename), 'r') as handle:
            content = handle.read()

        for doc in content.strip().split('\n####\n'):
            for line in doc.strip().split('\n'):
                stripped = line.strip()
                # Skip blank / whitespace-only lines and a stray separator
                # (e.g. a trailing "####" at the end of the file); neither
                # carries a label, and int() on them would raise.
                if not stripped or stripped == '####':
                    continue
                label = int(line[0])
                text = line[2:].replace(" 's", "'s")
                data.append([label, text])

    df = pd.DataFrame(data, columns=['label', 'text'])
    df.label = df.label.astype('category')
    return df

# Parse all training chunks into one frame and preview the first rows.
train_dir = '/content/train_chunk'
df_train = get_dataframe(train_dir)
df_train.head()

In [0]:
# One sentence per row: a 1-D object array of strings gets a trailing axis so
# it becomes (n_samples, 1).
train_sentences = np.array(df_train['text'].tolist(), dtype=object)
train_text = np.expand_dims(train_sentences, axis=1)

# Number of output classes; used as the width of the softmax layer.
category_counts = 2

# One-hot encode the categorical labels as int8.
train_label = np.asarray(pd.get_dummies(df_train['label']), dtype=np.int8)

In [0]:
def UniversalEmbedding(x):
    """Embed a batch of sentences with the module-level TF-Hub `embed` module.

    Intended to be wrapped in a Keras Lambda layer.

    Parameters
    ----------
    x : tensor
        Batch of sentences; assumed to have shape (batch, 1), which is what
        the Input(shape=(1,), dtype="string") layer feeding this function
        produces.

    Returns
    -------
    tensor
        The "default" output of the module's "default" signature.
    """
    # Squeeze ONLY axis 1. A bare tf.squeeze also drops the batch axis when
    # the batch holds a single sentence (e.g. the last, partial batch of an
    # epoch), handing the encoder a scalar instead of a 1-D batch.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(sentences, signature="default", as_dict=True)["default"]


In [0]:
# Classifier graph: raw string -> sentence embedding -> 256-unit ReLU layer
# -> softmax over `category_counts` classes.
input_text = layers.Input(shape=(1,), dtype="string")

embedding_layer = layers.Lambda(UniversalEmbedding, output_shape=(512,))
embedding = embedding_layer(input_text)

hidden_layer = layers.Dense(256, activation='relu')
dense = hidden_layer(embedding)

output_layer = layers.Dense(category_counts, activation='softmax')
pred = output_layer(dense)

model = Model(inputs=[input_text], outputs=pred)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [0]:
def get_dataframe_valid(filedir):
    """Load every file in `filedir` into a (label, text) DataFrame.

    Every non-blank line has the form "<digit> <text>": the first character
    is parsed as the integer label and everything from index 2 onwards is the
    sentence. A detached " 's" is re-attached to the previous token.

    Parameters
    ----------
    filedir : str
        Directory whose entries are all read as text files.

    Returns
    -------
    pandas.DataFrame
        Columns 'label' (categorical) and 'text' (str), one row per line.

    Raises
    ------
    ValueError
        If a non-blank line does not start with a digit.
    """
    data = []
    for filename in os.listdir(filedir):
        # The context manager closes the handle even if parsing fails later.
        with open(os.path.join(filedir, filename), 'r') as handle:
            lines = handle.read().strip().splitlines()

        for line in lines:
            # Blank lines inside the file carry no label; indexing line[0]
            # on them would raise, so skip them like get_dataframe does.
            if not line.strip():
                continue
            label = int(line[0])
            text = line[2:].replace(" 's", "'s")
            data.append([label, text])

    df = pd.DataFrame(data, columns=['label', 'text'])
    df.label = df.label.astype('category')
    return df

# Validation split, shaped like the training arrays: an (n_samples, 1) object
# array of sentences plus int8 one-hot labels.
valid_dir = '/content/valid'
df_test = get_dataframe_valid(valid_dir)

valid_sentences = np.array(df_test['text'].tolist(), dtype=object)
test_text = np.expand_dims(valid_sentences, axis=1)
test_label = np.asarray(pd.get_dummies(df_test['label']), dtype=np.int8)

In [0]:
# Train inside an explicit TF1 session: register it with Keras, then run the
# variable and lookup-table initializers before fitting.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(train_text,
                        train_label,
                        validation_data=(test_text, test_label),
                        epochs=12,
                        batch_size=50,
                        callbacks=[checkpointer])

# list all data in history
print(history.history.keys())

# Keras < 2.3 logs the metric as 'acc' / 'val_acc'; later releases log it as
# 'accuracy' / 'val_accuracy'. Use whichever key this run produced so the
# plots do not fail with a KeyError after training has already finished.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# summarize history for accuracy
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['training loss', 'valid loss'], loc='upper left')
plt.show()

In [0]:
filedir = '/content/test'
def get_dataframe_test(filename, directory=None):
    """Load one test file into a cleaned (label, text) DataFrame.

    Every non-blank line has the form "<digit> <text>": the first character
    is parsed as the integer label and everything from index 2 onwards is the
    sentence. The sentence is stripped of characters outside the allowed set,
    runs of whitespace are collapsed to one space, and detached contractions
    (" 's", " 'd", " 'm", " n't") are re-attached to the previous token.

    Parameters
    ----------
    filename : str
        Name of the file inside the directory.
    directory : str, optional
        Directory containing the file. Defaults to the module-level
        `filedir`, which is what the original one-argument call used.

    Returns
    -------
    pandas.DataFrame
        Columns 'label' (categorical) and 'text' (str).

    Raises
    ------
    ValueError
        If a non-blank line does not start with a digit.
    """
    if directory is None:
        directory = filedir

    # Raw string: the same pattern as before, without relying on Python
    # passing unknown escapes such as "\?" or "\+" through unchanged.
    # NOTE(review): inside the class, `"-.` is a character RANGE (0x22-0x2E),
    # so # $ % & ( ) * are kept as well. Left unchanged to preserve the
    # existing preprocessing -- confirm whether that was intended.
    disallowed = re.compile(r'[^A-Za-z0-9 ,\?\'"-._\+\!/\`@=;:]+')

    # The context manager closes the handle even if parsing fails later.
    with open(directory + '/' + filename, 'r') as handle:
        lines = handle.read().strip().split('\n')

    data = []
    for line in lines:
        # Blank / whitespace-only lines carry no label; skip them.
        if not line.strip():
            continue
        label = int(line[0])
        text = disallowed.sub('', line[2:])
        text = re.sub(r'\s+', ' ', text)
        text = text.replace(" 's", "'s")
        text = text.replace(" 'd", "'d")
        text = text.replace(" 'm", "'m")
        text = text.replace(" n't", "n't")
        # NOTE(review): only a lone space is dropped; a sentence reduced to
        # '' is still kept. Unchanged on purpose, since downstream code
        # presumably aligns predictions with these rows -- confirm.
        if text != ' ':
            data.append([label, text])

    df = pd.DataFrame(data, columns=['label', 'text'])
    df.label = df.label.astype('category')
    return df


list_test_filenames = os.listdir(filedir)

# Create the destination up front: np.save does not create missing
# directories, and a missing folder would otherwise only show up as one
# swallowed error per file after the prediction work had been done.
output_dir = '/content/gdrive/My Drive/20182_DOAN/universal_embed/test_prob/'
os.makedirs(output_dir, exist_ok=True)

with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    # Rebuild the classifier on the existing graph tensors and restore the
    # trained weights after the initializers have run.
    model1 = Model(inputs=[input_text], outputs=pred)
    # NOTE(review): this loads weights_24_4.hdf5, not the weights_20_5_v1.hdf5
    # file written by the training checkpoint -- confirm this is intended.
    model1.load_weights('/content/gdrive/My Drive/20182_DOAN/universal_embed/weights_24_4.hdf5')
    model1.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])

    for file in list_test_filenames:
        # Best effort per file: a failure is reported and the loop moves on.
        try:
            print(file)
            df_test = get_dataframe_test(file)
            new_text = df_test['text'].tolist()
            new_text = np.array(new_text, dtype=object)[:, np.newaxis]

            predicts = model1.predict(new_text, batch_size=80)
            print(len(new_text), len(predicts))
            np.save(output_dir + file + '.npy', predicts)
        except Exception as e:
            # Name the file and show the exception type so a skipped file
            # can be traced from the cell output.
            print('Failed on {}: {!r}'.format(file, e))


In [0]:
# Bundle the per-file prediction arrays saved under test_prob/ on Drive into a
# single archive at /content/test_prob_24.zip.
!zip -r '/content/test_prob_24.zip' '/content/gdrive/My Drive/20182_DOAN/universal_embed/test_prob'