# 映画レビューをネガティブ/ポジティブ分類する

/content/drive/MyDrive/Dataset/にIMDB.csvを配置する

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import shutil
shutil.copyfile("/content/drive/MyDrive/Dataset/IMDB.csv",
              "/content/IMDB.csv")

'/content/IMDB.csv'

In [3]:
import re
import csv
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, metrics

データを読み込む

In [4]:
# 文章の最大の長さ
max_sentence_length = 256

In [5]:
datas = list()
category_to_index = {
    "negative" : 0,
    "positive" : 1
}

with open("/content/IMDB.csv", "r") as f:
    r = csv.reader(f)

    items = next(iter(r))

    for R in r:
        datas.append( {
            "review" : R[0],
            "sentiment" : category_to_index[R[1]]
        } )

#
len(datas), datas[0]

(50000,
 {'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the s

文章を正規化する

*   大文字を小文字にする

*   記号を取り除く

*   "."や","を" ."や" ,"のように記号の前に空白を配置する



In [6]:
for D in tqdm(datas):
    aligned_review = D["review"].lower()
    aligned_review = re.sub(r"<br />", "", aligned_review)
    aligned_review = re.sub(r"[.,-?\!\(\)\{\}\[\]\"\'&%$#_\x85\x97]", lambda match: " " + match.group(0) + " ", aligned_review)
    D["aligned_review"] = aligned_review

datas[0]

100%|██████████| 50000/50000 [00:01<00:00, 27911.08it/s]


{'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is du

単語をIDに変換する(Tokenize)

この際，文章の最大の長さを超過するデータは使用しない

In [7]:
words = list()
tmp_datas = list()

for D in tqdm(datas):
    parsed_review = D["aligned_review"].split(" ")
    tmp = list()
    for W in parsed_review:
        if len(W) != 0:
            tmp.append(W)
    parsed_review = tmp

    if len(parsed_review) <= max_sentence_length - 2:
        tmp_datas.append(D)
        tmp_datas[-1]["parsed_review"] = parsed_review
        words += parsed_review
words = set(words)

word_to_index = {
    W : i + 3 for i, W in enumerate(words)
}
word_to_index["<PADDING>"] = 0
word_to_index["<START>"] = 1
word_to_index["<END>"] = 2

for D in tqdm(tmp_datas):
    tmp = list()
    for W in D["parsed_review"]:
        tmp.append( word_to_index[W] )
    tmp = [1] + tmp + [2]
    D["tokenized_review"] = tmp

len(tmp_datas), len(words)

100%|██████████| 50000/50000 [00:07<00:00, 6760.55it/s] 
100%|██████████| 31072/31072 [00:01<00:00, 26372.78it/s]


(31072, 65094)

In [8]:
inputs = list()
teacher_signals = list()

for D in tqdm(tmp_datas):
    tmp = D["tokenized_review"] + [0] * (max_sentence_length - len(D["tokenized_review"]))
    inputs.append(tmp)
    teacher_signals.append(D["sentiment"])

tmp = list(zip(inputs, teacher_signals))
train_tmp, test_tmp = train_test_split(tmp, test_size = 0.1)
train_inputs, train_teacher_signals = zip(*train_tmp)
test_inputs, test_teacher_signals = zip(*test_tmp)

train_inputs = tf.constant(train_inputs)
train_teacher_signals = tf.constant(train_teacher_signals)
test_inputs = tf.constant(test_inputs)
test_teacher_signals = tf.constant(test_teacher_signals)

#
train_inputs.shape, train_teacher_signals.shape, test_inputs.shape, test_teacher_signals.shape

100%|██████████| 31072/31072 [00:00<00:00, 164331.11it/s]


(TensorShape([27964, 256]),
 TensorShape([27964]),
 TensorShape([3108, 256]),
 TensorShape([3108]))

In [10]:
def build_model():
    input = layers.Input(shape = train_inputs.shape[1:])
    x = layers.Embedding(input_dim = len(word_to_index.keys()),
                         output_dim = 256,
                         mask_zero = True)(input)
    x = layers.LSTM(units = 512)(x)
    x = layers.Dense(units = 512, activation = "relu")(x)
    x = layers.Dense(units = 256, activation = "relu")(x)
    output = layers.Dense(units = 1, activation = "sigmoid")(x)

    return models.Model(input, output)

model = build_model()
model.summary()
model.compile(loss = losses.BinaryCrossentropy(),
              optimizer = optimizers.Adam(learning_rate = 0.001),
              metrics = metrics.BinaryAccuracy())
model.fit(x = train_inputs, y = train_teacher_signals, batch_size = 256, epochs = 5,
          validation_data = (test_inputs, test_teacher_signals))

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 256, 256)          16664832  
                                                                 
 lstm_1 (LSTM)               (None, 512)               1574912   
                                                                 
 dense_2 (Dense)             (None, 512)               262656    
                                                                 
 dense_3 (Dense)             (None, 256)               131328    
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 18633985 (71.08 MB)
Trainable params: 1863398

<keras.src.callbacks.History at 0x785dd83bde40>

In [11]:
model.evaluate(test_inputs, test_teacher_signals)



[0.42361435294151306, 0.8606821298599243]