<a href="https://colab.research.google.com/github/SooWanKim/practice_nlp/blob/master/IMDB%20Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `tokenizers` package, which provides BertWordPieceTokenizer imported below.
# NOTE(review): the version is not pinned; the recorded run installed tokenizers 0.8.1.
!pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/e9/ee/fedc3509145ad60fe5b418783f4a4c1b5462a4f0e8c7bbdbda52bdcda486/tokenizers-0.8.1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 8.5MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.8.1


In [None]:
# Install the `transformers` package, which provides BertTokenizer imported below.
# NOTE(review): the version is not pinned; the recorded run downloaded transformers 3.0.2.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 29.0MB/s eta 0:00:01[K     |▉                               | 20kB 6.0MB/s eta 0:00:01[K     |█▎                              | 30kB 7.1MB/s eta 0:00:01[K     |█▊                              | 40kB 7.7MB/s eta 0:00:01[K     |██▏                             | 51kB 7.1MB/s eta 0:00:01[K     |██▋                             | 61kB 8.0MB/s eta 0:00:01[K     |███                             | 71kB 7.8MB/s eta 0:00:01[K     |███▍                            | 81kB 8.8MB/s eta 0:00:01[K     |███▉                            | 92kB 8.0MB/s eta 0:00:01[K     |████▎                           | 102kB 8.2MB/s eta 0:00:01[K     |████▊                           | 112kB 8.2MB/s eta 0:00:01[K     |█████▏                          | 122kB 8.2M

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import re
import os
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from google.colab import drive


In [None]:
MAX_SEQ_LEN = 100  # fixed length that every encoded sequence is padded to


def get_masks(tokens):
    """Build the attention mask for one tokenized sequence.

    Returns a list holding a 1 for every entry of ``tokens`` followed by
    enough 0s to reach ``MAX_SEQ_LEN``. Nothing is truncated: when ``tokens``
    is longer than ``MAX_SEQ_LEN`` the result is all 1s and has the same
    length as ``tokens``.
    """
    n_real = len(tokens)
    n_pad = MAX_SEQ_LEN - n_real  # a negative count yields no padding at all
    mask = [1] * n_real
    mask.extend([0] * n_pad)
    return mask


def get_segments(tokens):
    """Build the segment (token type) ids for one tokenized sequence.

    Every token up to and including the first "[SEP]" gets 0; every token
    after it gets 1. The result is then right-padded with 0s up to
    ``MAX_SEQ_LEN`` (no padding is added when ``tokens`` is already that
    long or longer).
    """
    segment_ids = []
    seen_separator = False
    for tok in tokens:
        segment_ids.append(1 if seen_separator else 0)
        if tok == "[SEP]":
            # Everything after the first separator belongs to the second segment.
            seen_separator = True
    padding = [0] * (MAX_SEQ_LEN - len(tokens))
    return segment_ids + padding


def get_ids(tokens, ids):
    """Right-pad a list of vocabulary ids with 0s up to ``MAX_SEQ_LEN``.

    ``tokens`` is accepted but not used; only ``ids`` contributes to the
    result. No truncation happens: if ``ids`` is already ``MAX_SEQ_LEN`` long
    or longer, it comes back with no padding added.
    """
    pad_count = MAX_SEQ_LEN - len(ids)
    return ids + [0] * pad_count


def create_single_input(sentence, tokenizer, max_len):
    """Encode one sentence into padded ids, attention mask and segment ids.

    ``tokenizer.encode(sentence)`` must return an object exposing ``tokens``
    and ``ids``. ``max_len`` is accepted but not used by this function; any
    length limit has to be configured on ``tokenizer`` itself.

    Returns a tuple ``(ids, masks, segments)`` as produced by ``get_ids``,
    ``get_masks`` and ``get_segments``.
    """
    encoding = tokenizer.encode(sentence)
    tokens = encoding.tokens
    return (
        get_ids(tokens, encoding.ids),
        get_masks(tokens),
        get_segments(tokens),
    )


def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments.

    Args:
        sentences: An iterable of strings. A single bare string is treated as
            one sentence instead of being iterated character by character.
        tokenizer: Object forwarded to ``create_single_input``.

    Returns:
        A list of three ``np.int32`` arrays ``[input_ids, input_masks,
        input_segments]``, each of shape ``(number of sentences, MAX_SEQ_LEN)``;
        empty input yields arrays of shape ``(0, MAX_SEQ_LEN)``.

    Raises:
        AssertionError: If any feature of a sentence is not exactly
            ``MAX_SEQ_LEN`` entries long.
    """
    if isinstance(sentences, str):
        # Iterating a str would silently encode each character as its own "sentence".
        sentences = [sentences]

    input_ids, input_masks, input_segments = [], [], []

    for sentence in tqdm(sentences, position=0, leave=True):
        ids, masks, segments = create_single_input(sentence, tokenizer, MAX_SEQ_LEN - 2)
        # The padding helpers only pad and never truncate, so an over-long
        # encoding from the tokenizer surfaces here.
        assert len(ids) == MAX_SEQ_LEN, (
            "input_ids has length %d, expected %d" % (len(ids), MAX_SEQ_LEN)
        )
        assert len(masks) == MAX_SEQ_LEN, (
            "input_masks has length %d, expected %d" % (len(masks), MAX_SEQ_LEN)
        )
        assert len(segments) == MAX_SEQ_LEN, (
            "input_segments has length %d, expected %d" % (len(segments), MAX_SEQ_LEN)
        )
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    # reshape is a no-op for non-empty input and keeps the 2-D shape for empty input.
    return [
        np.asarray(rows, dtype=np.int32).reshape(-1, MAX_SEQ_LEN)
        for rows in (input_ids, input_masks, input_segments)
    ]


def nlp_model(callable_object):
    """Build a two-class Keras classifier on top of a TF-Hub layer.

    Args:
        callable_object: Handle forwarded to ``hub.KerasLayer`` (this notebook
            passes a TF-Hub BERT URL). The layer is created with
            ``trainable=True``.

    Returns:
        An uncompiled ``Model`` taking three ``int32`` inputs of shape
        ``(MAX_SEQ_LEN,)`` named ``input_ids``, ``input_masks`` and
        ``segment_ids`` (in that order) and producing a 2-unit softmax output.
    """
    encoder = hub.KerasLayer(handle=callable_object, trainable=True)

    def token_input(name):
        # All three encoder inputs share the same shape and dtype.
        return Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=name)

    inputs = [token_input(name) for name in ("input_ids", "input_masks", "segment_ids")]

    # The layer's result is unpacked as (pooled_output, sequence_output);
    # only the pooled output feeds the classification head.
    pooled_output, _ = encoder(inputs)

    # Classification head: hidden layer -> dropout -> softmax over two classes.
    hidden = Dense(units=768, activation="relu")(pooled_output)
    hidden = Dropout(0.1)(hidden)
    outputs = Dense(2, activation="softmax")(hidden)

    return Model(inputs=inputs, outputs=outputs)

In [None]:
drive.mount('/content/drive')
model = nlp_model("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2")
model.summary()

# https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# google drive 에 올려서 씀 ㅋ
!ls '/content/drive/My Drive/Colab Notebooks/'

movie_reviews = pd.read_csv("/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv")
movie_reviews.head(5)
movie_reviews = movie_reviews.sample(frac=1) 

print(movie_reviews.isnull().values.any())
print(movie_reviews.shape)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········


INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


Mounted at /content/drive


INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [None]:
# Matches anything shaped like an HTML tag: "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r"<[^>]+>")


def remove_tags(text):
    """Return ``text`` with every ``TAG_RE`` match deleted."""
    return TAG_RE.sub("", text)


def preprocess_text(sen):
    """Reduce a raw review to space-separated alphabetic words.

    Steps, in order:
      1. delete HTML tags;
      2. replace every character that is not an ASCII letter with a space;
      3. delete single letters that stand between whitespace;
      4. collapse each run of whitespace into one space.

    A single letter at the very start or very end of the text is kept, and a
    leading or trailing space may remain in the result.

    Args:
        sen: The raw review text.

    Returns:
        The cleaned string.
    """
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub("[^a-zA-Z]", " ", sentence)

    # Single character removal. The trailing whitespace is only looked at, not
    # consumed, so it can serve as the leading whitespace of the next match;
    # consuming it let every second adjacent single letter survive
    # ("it s a b movie" became "it a movie").
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", "", sentence)

    # Removing multiple spaces
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


# Clean every review text, keeping the row order of the frame.
sentences = list(movie_reviews["review"])
reviews = [preprocess_text(raw_review) for raw_review in sentences]

print(movie_reviews.columns.values)
print(movie_reviews.sentiment.unique())

# Encode the labels as integers: "positive" -> 1, any other value -> 0.
y = np.array([1 if label == "positive" else 0 for label in movie_reviews["sentiment"]])
print(y[:10])

['review' 'sentiment']
['negative' 'positive']
[0 0 0 1 0 1 1 0 0 1]


In [None]:
# Fetch the pretrained tokenizer and write its files to disk so that the fast
# WordPiece tokenizer can be built from the saved vocabulary file.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
os.makedirs(save_path, exist_ok=True)  # safe to re-run when the directory already exists
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file (path derived from save_path so the two cannot drift apart)
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)
# NOTE(review): the recorded output of a long review ends in [..., 102, 0, 0],
# which suggests the truncation limit already counts the special tokens and the
# "- 2" leaves two positions unused. Confirm against the tokenizers docs before changing.
tokenizer.enable_truncation(MAX_SEQ_LEN - 2)

# Only a subset of the reviews is used: the first train_count rows for training
# and the next test_count rows for evaluation.
train_count = 20000
test_count = 2000
test_end = train_count + test_count

X_train = convert_sentences_to_features(reviews[:train_count], tokenizer)
X_test = convert_sentences_to_features(reviews[train_count:test_end], tokenizer)

# One-hot encode all labels, then slice them with the same bounds as the features.
one_hot_encoded = to_categorical(y)

Y_train = one_hot_encoded[:train_count]
Y_test = one_hot_encoded[train_count:test_end]
print(one_hot_encoded[:10])
print(len(one_hot_encoded))

100%|██████████| 20000/20000 [00:10<00:00, 1976.01it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1997.76it/s]


[[1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]
50000


In [None]:
BATCH_SIZE = 8
EPOCHS = 1

# Compile with Adam and categorical cross-entropy, which matches the one-hot labels.
opt = Adam(learning_rate=2e-5)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split; the held-out split is used for validation.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

# Persist the trained model so it can be reloaded without retraining.
model.save('nlp_model.h5')

# Predicted class = index of the larger of the two softmax outputs.
test_probabilities = model.predict(X_test)
pred_test = np.argmax(test_probabilities, axis=1)
print(pred_test[:10])

[1 1 0 0 1 1 0 1 1 0]


In [None]:
from tensorflow.keras.models import load_model

# Reload the saved model. hub.KerasLayer is registered under the name
# "KerasLayer" so Keras can resolve it while deserializing.
new_model = load_model(
    "nlp_model.h5",
    custom_objects={"KerasLayer": hub.KerasLayer},
)

In [None]:
from sklearn.metrics import classification_report

# Show the first held-out review next to its encoded features and the predictions.
print(reviews[train_count:train_count+1])
pred_test = np.argmax(new_model.predict(X_test), axis=1)
# X_test is a list of three arrays (ids, masks, segments). Slicing the list
# itself with [:1] would print the entire ids array, so slice each array to
# show only the first sample's features.
print([feature[:1] for feature in X_test])
print(pred_test[:10])
print(Y_test[:10])

['This movie even though is about one of the most favorite topics of Mexican producers producers the extreme life in our cities has funny way to put it on the screen Four of the more important Mexican directors of the last times approach histories of our city framed in diverse literary sorts as it can be the farce or the satire which gives us film with over exposed topic in our country but narrated in very different way which gives freshness tone him With actors little known but that interprets of excellent way their paper each one of the directors reflect in the stories the capacity by we have been identified anywhere in the world that capacity of laugh the pains and to make celebration of the sadness Perhaps to many people in our country the film not have pleased but consider that people of other countries could find attractive and share the surrealism of the Mexican ']
[array([[  101,  2023,  3185, ...,   102,     0,     0],
       [  101,  2023,  2003, ...,     0,     0,     0],
  

In [None]:
# Y_test is one-hot encoded, so argmax recovers the class index for each row.
true_labels = np.argmax(Y_test, axis=1)
print(classification_report(true_labels, pred_test))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1000
           1       0.87      0.91      0.89      1000

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000



In [None]:
def sentiment_predict(sentence, tokenizer):
    """Print the predicted sentiment for each given sentence.

    ``sentence`` is passed as-is to ``convert_sentences_to_features`` and the
    resulting features are scored with the module-level ``model``. The array
    of predicted class indices is printed first, followed by one message per
    prediction: the positive-review message for any index greater than 0,
    the negative-review message otherwise. Nothing is returned.
    """
    features = convert_sentences_to_features(sentence, tokenizer)
    predicted_classes = np.argmax(model.predict(features), axis=1)
    print('\n')
    print(predicted_classes)

    for label in predicted_classes:
        is_positive = float(label) > 0.0
        print("긍정 리뷰입니다.\n" if is_positive else "부정 리뷰입니다.\n")


# Run the predictor on the first ten held-out reviews.
sample_reviews = reviews[train_count:train_count + 10]
sentiment_predict(sample_reviews, tokenizer)

100%|██████████| 10/10 [00:00<00:00, 1608.18it/s]



[1 1 0 0 1 1 0 1 1 0]
긍정 리뷰입니다.

긍정 리뷰입니다.

부정 리뷰입니다.

부정 리뷰입니다.

긍정 리뷰입니다.

긍정 리뷰입니다.

부정 리뷰입니다.

긍정 리뷰입니다.

긍정 리뷰입니다.

부정 리뷰입니다.




