In [54]:
import os

# IMDbデータセットが置かれているディレクトリ
imdb_dir = '/Users/Takanori/Downloads/aclImdb'

train_dir = os.path.join(imdb_dir, 'train')
labels = list() # 感情が0 or 1で追加される
texts = list() # テキストが追加される

# neg, posディレクトリからテキストの読み込みを行なっている
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

### テキストをトークン化

In [55]:
# ライブラリをload
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [56]:
max_len = 10000 # 映画レビューを10000ワードでカット
training_samples = 20000 # 20,000個のサンプルで訓練
validation_samples = 5000 # 5,000個のサンプルで検証
max_words = 10000  #　データセットの最初から10,000ワードのみを考慮

In [57]:
len(texts), len(labels)

(25000, 25000)

In [58]:
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
# シーケンスに変換
sequences = tokenizer.texts_to_sequences(texts)

In [59]:
data = pad_sequences(sequences, maxlen=max_len)

In [60]:
# 単語に割り当てられた数字の表示
word_index = tokenizer.word_index
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'has': 44,
 'if': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [61]:
# どれくらいの単語が存在しているか
len(word_index)

88582

In [82]:
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# データを訓練データセットと検証データセットに分割:
# ただし、ここでランダムに並び替えることに注意
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples : training_samples + validation_samples]
y_val  = labels[training_samples : training_samples + validation_samples]

Shape of data tensor: (25000, 10000)
Shape of label tensor: (25000,)


In [83]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((20000, 10000), (20000,), (5000, 10000), (5000,))

### 学習済みの単語埋め込みを使用しない

In [84]:
embedding_dim = 100

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer = 'rmsprop',
            loss = 'binary_crossentropy',
             metrics = ['acc'])

history = model.fit(x_train, y_train,
                   epochs = 10,
                   batch_size = 32,
                   validation_data = (x_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 10000, 100)        1000000   
_________________________________________________________________
flatten_10 (Flatten)         (None, 1000000)           0         
_________________________________________________________________
dense_19 (Dense)             (None, 32)                32000032  
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 33        
Total params: 33,000,065
Trainable params: 33,000,065
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10