<a href="https://colab.research.google.com/github/SilverQ/Multi-Label-Text-Classification-master/blob/master/MultiLabelCNN(TL_to_Section).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [0]:
base_path = './gdrive/My Drive/Colab Notebooks/multilabelcnn/'

In [0]:
data_in_path = base_path + 'input_data/'
test_data_path = base_path + 'test_data/'
data_out_path = base_path + 'result_tl_section/'
meta_data_path = base_path + 'meta_data/'
vocab_file = meta_data_path + 'vocab.voc'
label_file = meta_data_path + 'labels.pickle'
freq_file = meta_data_path + 'word_freq.pickle'

In [0]:
import os
if not os.path.exists(data_in_path):
    os.makedirs(data_in_path)
if not os.path.exists(test_data_path):
    os.makedirs(test_data_path)
if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)
if not os.path.exists(meta_data_path):
    os.makedirs(meta_data_path)

In [0]:
# import zipfile
# train_zip_ref = zipfile.ZipFile(data_in_path + 'input_data.zip', 'r')
# train_zip_ref.extractall(data_in_path)
# train_zip_ref.close()

# test_zip_ref = zipfile.ZipFile(test_data_path + 'test_data.zip', 'r')
# test_zip_ref.extractall(test_data_path)
# test_zip_ref.close()

In [0]:
import tensorflow as tf
import pickle
import json
from tqdm import tqdm
from random import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K
from collections import Counter
import re
import numpy as np
from tensorflow.keras import preprocessing
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend

In [0]:
tf.enable_eager_execution()

In [121]:
tr_file_list = os.listdir(data_in_path)
tr_file_list = [file for file in tr_file_list if file.endswith(".txt")]
# tr_file_list = [file for file in tr_file_list if file.startswith("cpc")]

test_file_list = os.listdir(test_data_path)
test_file_list = [file for file in test_file_list if file.endswith(".txt")]
print(tr_file_list, '\n', test_file_list)

['cpc_json_0_3docs_01.txt', 'cpc_json_0_3docs_02.txt', 'w_dhhan_1007_00.txt', 'w_dhhan_1007_01.txt', 'w_dhhan_1007_02.txt', 'w_dhhan_1007_03.txt', 'w_dhhan_1007_04.txt', 'w_dhhan_1007_05.txt', 'w_dhhan_1007_06.txt', 'w_dhhan_1007_07.txt', 'w_dhhan_1007_08.txt'] 
 ['w_dhhan_1007_09.txt', 'w_dhhan_1007_10.txt', 'w_dhhan_1007_11.txt', 'w_dhhan_1007_12.txt', 'w_dhhan_1007_13.txt', 'w_dhhan_1007_15.txt', 'w_dhhan_1007_14.txt']


In [0]:
class Dataset:

    def __init__(self, train_path, test_path, is_shuffle, train_bs, test_bs, epoch, max_length):
        self.train_path = train_path
        self.test_path = test_path
        self.is_shuffle = is_shuffle
        self.train_bs = train_bs
        self.test_bs = test_bs
        self.epoch = epoch
        self.max_length = max_length
        self.special_tokens = ['<PAD>', '<BOS>', '<EOS>', '<UNK>']
        # self.label_path = label_path
        # self.vocab_path = vocab_path

        if not os.path.exists(vocab_file):
            print('No vocabulary.')
            print('Making vocabulary.')
            self.build_vocab_by_patent(vocab_file)
            print('Complete build vocabulary!')

        # print('Loading vocabulary...')
        self.idx2word, self.word2idx = pickle.load(open(vocab_file, 'rb'))
        print('Successfully load vocabulary!')
        self.idx2label, self.label2idx = pickle.load(open(label_file, 'rb'))
        print('Successfully load labels')

    def build_freq(self, word_list):
        word_counts = Counter(word_list)
        # print('word_list: ', len(word_list), word_list)
        # print('word_counts_1: ', len(word_counts), word_counts)
        # print('word_counts_2: ', len(word_counts.most_common()), word_counts.most_common())
        freq = Counter()
        # freq_file = raw_path + '/word_freq.pickle'
        # print(freq_file)
        if os.path.exists(freq_file):
            with open(freq_file, 'rb') as freq_dist_f:
                freq = pickle.load(freq_dist_f)
                print('frequency distribution loaded', len(freq))
        for word, cnt in word_counts.items():
            # print(word, freq[word])
            freq[word] += cnt
            # print(word, freq[word])
        print('freq len: ', len(freq))
        with open(freq_file, 'wb') as freq_dist_f:
            pickle.dump(freq, freq_dist_f)
        return freq

    def build_vocab_by_patent(self, vocab_file):
        error_cnt = 0
        label_list = []
        for file in self.train_path:
            word_list = []
            with open(data_in_path + file, encoding='utf-8') as f:
                for line in tqdm(f):
                    try:
                        # print('line: ', line)
                        patent = json.loads(line)
                        text = re.sub('[-=.#/?:$}(){,]', ' ', patent['title'] + patent['ab'])
                        token = text.split()
                        # token = tokenizer(patent['title'])
                        # print('token: ', token)
                        # doc = en.tokenizer(patent['title']+patent['ab']+patent['cl'])
                        labels = patent['cpc'].split('|')
                        for tok in token:
                            word_list.append(tok.lower())
                        labels = [label[0] for label in labels]
                        for label in labels:
                            if label not in label_list:
                                label_list.append(label)
                    except:
                        error_cnt += 1
                        # print('error: ', line)
            print('\nIn "%s" word_list: %d, error_cnt: %d\n' % (file, len(word_list), error_cnt))
            idx2word = self.build_freq(word_list)
        idx2word = self.special_tokens + [word for word, _ in idx2word.most_common()]
        print('idx2word: ', len(idx2word), idx2word[:10])
        print('idx2label: ', len(label_list), label_list)
        word2idx = {word: idx for idx, word in enumerate(idx2word)}
        label2idx = {label: idx for idx, label in enumerate(label_list)}
        vocab = (idx2word, word2idx)
        label = (label_list, label2idx)
        pickle.dump(vocab, open(vocab_file, 'wb'))
        pickle.dump(label, open(label_file, 'wb'))

    def text_to_sequence(self, text_list):
        sequences = []
        for text in text_list:
            sequences.append([self.word2idx[word] for word in text if word in self.word2idx.keys()])
        return sequences

    def sequence_to_text(self, sequence):
        return [self.idx2word[idx] for idx in sequence if idx != 0]

    def read_lines(self, indices, path):
        line_count = 0
        texts = []
        labels = []
        # print('indices: ', indices)
        with open(path, encoding='utf-8') as f:
            for line in f:
                if line_count in indices:
                    try:
                        patent = json.loads(line)
                        # text = re.sub('[-=.#/?:$}(){,]', ' ', patent['title'] + patent['ab'])
                        text = re.sub('[-=.#/?:$}(){,]', ' ', patent['title'])
                        label = patent['cpc'].split('|')
                        texts.append(text.lower().split())
                        labels.append(list(set([cpc[0] for cpc in label])))
                    except:
                        print(line)
                        print(line_count)
                line_count += 1
        return texts, labels

    def create_multiplehot_labels(self, labels_index):
        labels = []
        # print(len(label))
        for batch in labels_index:
            label = [0] * len(self.label2idx)
            # print(item)
            for cpc in batch:
                label[self.label2idx[cpc]] = 1
            labels.append(label)
        # print('label_repr: ', labels)
        return labels

    def data_generator(self, is_train):
        if is_train:
            batch_size = self.train_bs
            is_shuffle = self.is_shuffle  # 셔플을 여기서 해줘야해. 밖에서는 느려
            file_list = tr_file_list
            path = data_in_path
        else:
            batch_size = self.test_bs
            is_shuffle = False
            file_list = test_file_list
            path = test_data_path
        # print(file_list)
        for file in tqdm(file_list):
            cur_file = path + file
            # print(file)
            with open(cur_file, encoding='utf-8') as f:  # 일단 읽어서 길이는 알아둔다.
                data_length = len(f.readlines())
                # print('Num of pat: ', data_length)

            indices = list(range(data_length))  # 인덱스를 미리 만들어주는게 제너레이터 사용의 핵심.
            if is_shuffle:
                shuffle(indices)  # 셔플할꺼라면 이걸... 내장 라이브러리 random에 있는 함수.
                # print('suffled indices: ', indices)
            current_count = 0
            # while True:
            #     if current_count >= data_length:
            #         return
            #     else:
            while current_count < data_length:
                target_indices = indices[current_count:current_count + batch_size]
                texts, labels = self.read_lines(target_indices, cur_file)
                tokenized_title = texts
                labels = self.create_multiplehot_labels(labels)
                indexed_encoder_inputs = self.text_to_sequence(tokenized_title)
                padded_encoder_inputs = pad_sequences(indexed_encoder_inputs,
                                                      maxlen=self.max_length,
                                                      padding='pre')
                # print(padded_encoder_inputs, labels)
                current_count += batch_size
                yield padded_encoder_inputs, labels

    def mapping_fn(self, x, y=None):
        inputs, label = {'x': x}, y
        return inputs, label

    def train_input_fn(self):
        dataset = tf.data.Dataset.from_generator(generator=lambda: self.data_generator(is_train=True),
                                                 output_types=(tf.int64, tf.int64),
                                                 output_shapes=(
                                                     (None, self.max_length),  # 넣어주면 graph그릴때 잘못 들어온 입력을 잡아줄 수 있다.
                                                     (None, None)))  # labels count: unknown
        dataset = dataset.map(self.mapping_fn)
        dataset = dataset.repeat(count=self.epoch)
        return dataset

    def test_input_fn(self):
        dataset = tf.data.Dataset.from_generator(generator=lambda: self.data_generator(is_train=False),
                                                 output_types=(tf.int64, tf.int64),
                                                 output_shapes=((None, self.max_length),
                                                                (None, None)))
        dataset = dataset.map(self.mapping_fn)
        return dataset

    def eval_input_fn(self):
        dataset = tf.data.Dataset.from_generator(
            generator=lambda: self.data_generator(is_train=False),
            output_types=(tf.int64, tf.int64),
            output_shapes=((None, self.max_length), (None, None)))
        dataset = dataset.map(self.mapping_fn)
        return dataset

In [123]:
dataset = Dataset(train_path=tr_file_list,
                  test_path=test_file_list,
                  is_shuffle=True,
                  train_bs=500,
                  test_bs=100,
                  epoch=10,
                  max_length=10)

Successfully load vocabulary!
Successfully load labels


In [0]:
hyper_params = {'vocab_size': len(dataset.word2idx),     # or 50,000 or 445,694
                'label_size': len(dataset.label2idx),
                'embedding_dimension': 256,
                'teacher_forcing_rate': 0.5,
                'use_attention': True}

In [0]:
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    # feature['x'] => (bs, 20)

    train_op = features
    loss = features
    predicted_token = features
    embedding_layer = tf.keras.layers.Embedding(params['vocab_size'],
                                                params['embedding_dimension'])(features['x'])  # (bs, 20, EMD_SIZE)

    dropout_emb = tf.keras.layers.Dropout(rate=0.5)(embedding_layer)  # (bs, 20, EMD_SIZE)

    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for filter_size in filter_sizes:
        conv = tf.keras.layers.Conv1D(
            filters=100,
            kernel_size=filter_size,
            padding='valid',
            activation=tf.nn.relu,
            kernel_constraint=tf.keras.constraints.max_norm(3.))(dropout_emb)  # (bs, 20, 100)
        # 최대 norm 지정, weight clipping이 바로 이 부분

        pool = tf.keras.layers.GlobalMaxPool1D()(conv)  # [(bs, 100), (bs, 100), (bs, 100)]
        pooled_outputs.append(pool)

    h_pool = tf.concat(pooled_outputs, axis=1)  # (bs, 300)

    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu,
                                   kernel_constraint=tf.keras.constraints.max_norm(3.))(h_pool)  # (bs, 200)
    dropout_hidden = tf.keras.layers.Dropout(rate=0.5)(hidden, training=TRAIN)
    # logits = tf.keras.layers.Dense(units=1)(dropout_hidden)  # sigmoid를 해주겠다  # (bs, 1)
    logits = tf.keras.layers.Dense(units=params['label_size'])(dropout_hidden)  # 이렇게하면 one-hot 필요

    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits
                                               ,
                                               weights=1.0, label_smoothing=0.1
                                               )
        # loss = tf.losses.softmax_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        precision = tf.metrics.precision(labels, tf.round(pred))
        recall = tf.metrics.recall(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          train_op=train_op,
                                          loss=loss,
                                          eval_metric_ops={'acc': accuracy, 
                                                           'prec': precision,
                                                           'recall': recall})

    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
                                              #  ,
                                              #  weights=1.0, label_smoothing=0.01)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        precision = tf.metrics.precision(labels, tf.round(pred))
        recall = tf.metrics.recall(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          loss=loss, 
                                          eval_metric_ops={'acc': accuracy, 
                                                           'prec': precision,
                                                           'recall': recall})

    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )
    plot_model(model, to_file=data_out_path + 'model.png')

    return tf.estimator.EstimatorSpec(
        mode=mode,
        train_op=train_op,
        loss=loss,
        predictions={'prediction': predicted_token})


In [0]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

In [127]:
est = tf.estimator.Estimator(model_fn=model_fn,
                             params=hyper_params,
                             model_dir=data_out_path)
# tf.estimator.train_and_evaluate(model_fn, train_spec, eval_spec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f943e67edd8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
# train_spec = tf.estimator.TrainSpec(input_fn=dataset.train_input_fn, max_steps=1000)
# eval_spec = tf.estimator.EvalSpec(input_fn=dataset.eval_input_fn, steps=10)

In [0]:
est.train(dataset.train_input_fn)
# tf.estimator.train_and_evaluate(model_fn, train_spec, eval_spec)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt.





  0%|          | 0/11 [00:00<?, ?it/s][A[A[A

INFO:tensorflow:loss = 0.6917918, step = 0





  9%|▉         | 1/11 [00:00<00:03,  3.01it/s][A[A[A


 18%|█▊        | 2/11 [00:00<00:02,  3.74it/s][A[A[A

INFO:tensorflow:global_step/sec: 0.283072
INFO:tensorflow:loss = 0.39744103, step = 100 (353.284 sec)
INFO:tensorflow:Saving checkpoints for 172 into ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt.
INFO:tensorflow:global_step/sec: 0.278485
INFO:tensorflow:loss = 0.35965917, step = 200 (359.072 sec)





 27%|██▋       | 3/11 [11:56<28:38, 214.86s/it][A[A[A

INFO:tensorflow:global_step/sec: 0.269111
INFO:tensorflow:loss = 0.36394724, step = 300 (371.593 sec)
INFO:tensorflow:Saving checkpoints for 334 into ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt.
INFO:tensorflow:global_step/sec: 0.26683
INFO:tensorflow:loss = 0.35004145, step = 400 (374.774 sec)





 36%|███▋      | 4/11 [24:22<43:40, 374.32s/it][A[A[A

INFO:tensorflow:Saving checkpoints for 493 into ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt.
INFO:tensorflow:global_step/sec: 0.257985
INFO:tensorflow:loss = 0.34016272, step = 500 (387.619 sec)
INFO:tensorflow:global_step/sec: 0.274342
INFO:tensorflow:loss = 0.34362793, step = 600 (364.507 sec)





 45%|████▌     | 5/11 [36:54<48:46, 487.67s/it][A[A[A

In [15]:
valid = est.evaluate(dataset.eval_input_fn, steps=10)
# acc = 0.7144097, global_step = 96876, loss = 6.5086164, prec = 0.10945274, recall = 0.12790698

label smoothing 0.1 적용 후
# acc = 0.7065972, global_step = 121052, loss = 0.7922439, prec = 0.13913043, recall = 0.18604651


INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-22T16:08:29Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt-121052
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


  0%|          | 0/7 [00:00<?, ?it/s]

INFO:tensorflow:Evaluation [1/10]
INFO:tensorflow:Evaluation [2/10]
INFO:tensorflow:Evaluation [3/10]
INFO:tensorflow:Evaluation [4/10]
INFO:tensorflow:Evaluation [5/10]
INFO:tensorflow:Evaluation [6/10]
INFO:tensorflow:Evaluation [7/10]
INFO:tensorflow:Evaluation [8/10]
INFO:tensorflow:Evaluation [9/10]
INFO:tensorflow:Evaluation [10/10]
INFO:tensorflow:Finished evaluation at 2019-10-22-16:08:57
INFO:tensorflow:Saving dict for global step 121052: acc = 0.7065972, global_step = 121052, loss = 0.7922439, prec = 0.13913043, recall = 0.18604651
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 121052: ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt-121052


In [15]:
pred_results = est.predict(input_fn=dataset.eval_input_fn)
print(pred_results)

<generator object Estimator.predict at 0x7fc703909360>


In [0]:
# test_output = [pred['prob'] for item in list(pred_results)]
test_output = [item for item in list(pred_results)]
# test_output = np.array(test_output)
print(test_output)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./gdrive/My Drive/Colab Notebooks/multilabelcnn/result_tl_section/model.ckpt-121052
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


  0%|          | 0/7 [00:00<?, ?it/s]

https://www.tensorflow.org/guide/estimator

https://colab.research.google.com/drive/130zRZLtZu8ceWfHmRqQfai09MuAW9fAY#scrollTo=5HeTOvCYbjZb
이거 보면서 잘 공부해보쟈~