In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers, losses, preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
print(tf.__version__)

2.4.1


In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Скачиваем файл по ссылке.
dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                 untar=True, cache_dir='.',
                                 cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb') # Создаем ссылку на директорию датасета.

In [4]:
os.listdir(dataset_dir) # Смотрим содержимое в директории.

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
train_dir = os.path.join(dataset_dir, 'train') # Создаем ссылку на дирректорию тренировочнойпапки.
os.listdir(train_dir) # Смотрим содержимое в директории.

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [6]:
# Смотрим один из примеров позитивного ревью.
sample_file = os.path.join(train_dir, 'pos/1181_9.txt')
with open(sample_file) as f:
    print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


In [7]:
remove_dir = os.path.join(train_dir, 'unsup') # Создаем ссылку на дирректорию для удаления.
shutil.rmtree(remove_dir) # Удаляем всю ветку

In [31]:
batch_size = 50
seed = 113
# Из дерриктории создаем датасеты train&valid.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size=batch_size,
                                                             validation_split=0.2,
                                                             subset='training',
                                                             seed=seed)
raw_valid_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size=batch_size,
                                                             validation_split=0.2,
                                                             subset='validation',
                                                             seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [9]:
# Просмотрим пару вариантов.
for text_batch, label_batch in raw_train_ds.take(1): # Берем один элемент.
    for i in range(3):
        print("Review", text_batch.numpy()[i])
        print("Label", label_batch.numpy()[i])

Review b'This is a lovely tale of guilt-driven obsession.<br /><br />Matiss, on a lonely night stroll in Riga (?) passes by a woman on the wrong side of a bridge railing. He passes by without a word. Only the splash in the water followed by a cry for help causes him to act. And then only too little and too late.<br /><br />The film chronicles his efforts at finding out more about the woman. On a troll of local bars, he finds her pocketbook. He pieces more and more of her life together. His "look" changes as his obsession grows. He has to make things right. In a marvelously filmed dialog with the "bastard ex-boyfriend" he forces Alexej to face up to the guilt that both feel.<br /><br />Haunting long takes, a gritty soundtrack to accentuate the guilt, barking dogs. Footsteps. Lovely film noir with a lovely twist. A good Indie ending.'
Label 1
Review b'This effort is based on the true story of Jim Morris, a high school science teacher/baseball coach, who is inspired by his players to try 

In [10]:
print("Label 0 corresponds to", raw_train_ds.class_names[0]) # Названия классов.
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


In [11]:
# Тестовый датасет из директории.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/test', batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [12]:
# Стандартизация, собственная функция для удаления меток HTML, пунктуации и понижение регистра.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, f'{re.escape(string.punctuation)}', '')

In [13]:
# Слой векторизации
max_features = 1000
sequence_length = 100

vectorize_layer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_features,
                                   output_mode='int',
                                   output_sequence_length=sequence_length)

In [14]:
# Выбираем только текст и адаптируем векторизацию для создания словаря.
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [15]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [16]:
# Посмотрим как работает функция векторизации.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print('Review ', first_review, '\n')
print('Label ', raw_train_ds.class_names[first_label], '\n')
print("Vectorized review", vectorize_text(first_review, first_label))

Review  tf.Tensor(b'Prominent attorney Walter Pidgeon takes a murder case pro bono, wins an acquittal and discovers that his client (Keefe Braselle) was not only guilty but part of an extortion ring reaching to the highest eschelons of the city. Panged by his own complicity, he undertakes an investigation, stumbles onto the identity of the "unknown man" who heads the syndicate, and murders him.<br /><br />The ironies engage when Braselle is charged with this second murder and Pidgeon must defend him by pointing to the existence of another "unknown man" -- himself. Though somewhat short of urban grit and long on rhetoric, the Unknown Man belongs to the noir cycle less by style or structure than by its acknowledgement of the pervasive corruption of American municipal politics that came to light in the postwar years.', shape=(), dtype=string) 

Label  pos 

Vectorized review (<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[  1,   1,   1,   1, 268,   3, 741, 525,   1,   1,   1,  31

In [17]:
# Посмотрим какому индексу соответствуют слова.
print("149 ---> ",vectorize_layer.get_vocabulary()[149])
print("478 ---> ",vectorize_layer.get_vocabulary()[478])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

149 --->  movie,
478 --->  white
Vocabulary size: 1000


In [18]:
# Применим векторизацию ко всем данным.
train_ds = raw_train_ds.map(vectorize_text)
valid_ds = raw_valid_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(AUTOTUNE)
valid_ds = valid_ds.cache().prefetch(AUTOTUNE)
test_ds = test_ds.cache().prefetch(AUTOTUNE)

In [20]:
# Создаем момдель.
embedding_dim = 32

class MyModel(tf.keras.Model):
    
    def __init__(self, embedding_dim):
        super(MyModel, self).__init__()
        self.embedding = layers.Embedding(max_features+1, embedding_dim)
        self.dropout_1 = layers.Dropout(0.2)
        self.pooling = layers.GlobalAveragePooling1D()
        self.dropout_2 = layers.Dropout(0.2)
        self.dence = layers.Dense(1)
        
    def call(self, x):
        x = self.embedding(x)
        x = self.dropout_1(x)
        x = self.pooling(x)
        x = self.dropout_2(x)
        return self.dence(x)

In [21]:
# Инициализируем модель.
model = MyModel(embedding_dim)

In [22]:
# Параметры.
optimazer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [23]:
# Создаем метрики потери и точноти для всех датасетов отдельно.
train_accuracy = tf.keras.metrics.BinaryAccuracy(threshold=0.0)
train_loss = tf.keras.metrics.Mean()

valid_accuracy = tf.keras.metrics.BinaryAccuracy(threshold=0.0)
valid_loss = tf.keras.metrics.Mean()

test_accuracy = tf.keras.metrics.BinaryAccuracy(threshold=0.0)
test_loss = tf.keras.metrics.Mean()

In [24]:
# Функция для тренировки модели.
def train(texts, labels):
    with tf.GradientTape() as tape:
        prediction = model(texts, training=True)
        loss = loss_fn(labels, prediction)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimazer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, prediction)

In [25]:
# Функция для валидации модели.
def valid(texts, labels):
    prediction =  model(texts, training=False)
    v_loss = loss_fn(labels, prediction)
    
    valid_loss(v_loss)
    valid_accuracy(labels, prediction)

In [26]:
# Функция для валидации модели.
def test(texts, labels):
    prediction = model(texts, training=False)
    t_loss = loss_fn(labels, prediction)
    
    test_loss(t_loss)
    test_accuracy(labels, prediction)

In [30]:
# Запускаем тренировку модели
EPOCHS = 10

for epoch in range(EPOCHS):
    # Стираем все данные до начал следующей эпохи
    train_loss.reset_states()
    train_accuracy.reset_states()
    valid_loss.reset_states()
    valid_accuracy.reset_states()
    
    for text, labels in train_ds:
        train(text, labels)
        
    for texts, labels in valid_ds:
        valid(texts, labels)
        
    print(f'Epoch: {epoch + 1}')
    print(f'Train loss: {train_loss.result()}')
    print(f'Train accuracy: {train_accuracy.result()}')
    print(f'Valid loss: {valid_loss.result()}')
    print(f'Valid accuracy: {valid_accuracy.result()}')

Epoch: 1
Train loss: 0.42365968227386475
Train accuracy: 0.8053996562957764
Valid loss: 0.4708956480026245
Valid accuracy: 0.767799973487854
Epoch: 2
Train loss: 0.4231114983558655
Train accuracy: 0.8051995635032654
Valid loss: 0.4716337323188782
Valid accuracy: 0.767599880695343
Epoch: 3
Train loss: 0.42381227016448975
Train accuracy: 0.8045499920845032
Valid loss: 0.471934050321579
Valid accuracy: 0.7671999335289001
Epoch: 4
Train loss: 0.42394396662712097
Train accuracy: 0.8046997785568237
Valid loss: 0.4723937213420868
Valid accuracy: 0.7669999003410339
Epoch: 5
Train loss: 0.4222526550292969
Train accuracy: 0.8049994111061096
Valid loss: 0.47280704975128174
Valid accuracy: 0.7671999335289001
Epoch: 6
Train loss: 0.4223613739013672
Train accuracy: 0.8061996698379517
Valid loss: 0.47312507033348083
Valid accuracy: 0.7683998942375183
Epoch: 7
Train loss: 0.42297253012657166
Train accuracy: 0.8039497137069702
Valid loss: 0.473359614610672
Valid accuracy: 0.7685997486114502
Epoch: 8
Tr

In [None]:
model.summary()

In [None]:
test_loss.reset_states()
test_accuracy.reset_states()

for texts, labels in test_ds:
    test(text, labels)

In [None]:
print(f'Test loss: {test_loss.result()}')
print(f'Test accuracy: {test_accuracy.result()}')