In [1]:
import warnings
# Install a blanket "ignore" filter so that no warnings show up in cell outputs.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing it to specific categories.
warnings.simplefilter(action='ignore')

In [2]:
import os
import re
import tarfile
import urllib.request

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

### 1. 下载 IMDb 数据集

#### 1.1 下载压缩文件

In [3]:
# Download the IMDb archive once; later runs reuse the local copy.
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
file_path = 'data/aclImdb_v1.tar.gz'
if not os.path.isfile(file_path):
    # urlretrieve does not create missing parent directories, so on a fresh
    # checkout (no 'data/' folder yet) the download would fail without this.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # result is the (local_filename, headers) tuple returned by urlretrieve.
    result = urllib.request.urlretrieve(url, file_path)
    print('download:', result)

#### 1.2 解压文件

In [4]:
# Unpack the archive only when the extracted tree is not already present.
if not os.path.exists('data/aclImdb'):
    # The context manager closes the archive handle even if extraction fails
    # part-way; previously the handle was never closed. extractall() returns
    # None, so its result is no longer stored.
    with tarfile.open('data/aclImdb_v1.tar.gz', 'r:gz') as tfile:
        tfile.extractall('data/')
    print('extract file success.')

### 2. 读取与查看 IMDb 数据

#### 2.1 定义函数用于删除文字中的 HTML 标签

In [5]:
def rm_tags(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    A span counts as a tag when it starts with ``<``, ends with ``>`` and has
    at least one non-``>`` character in between.
    """
    tag_pattern = r'<[^>]+>'
    return re.sub(tag_pattern, '', text)

#### 2.2 定义函数用于读取 IMDb 文件目录

In [6]:
def read_files(file_type, base_dir='data/aclImdb/'):
    """Load every review of one IMDb split together with its sentiment label.

    Args:
        file_type: Name of the split sub-directory, e.g. ``'train'`` or
            ``'test'``. It must contain ``pos`` and ``neg`` sub-directories.
        base_dir: Root directory of the extracted dataset. Defaults to the
            location used by the extraction cell.

    Returns:
        A tuple ``(all_labels, all_texts)`` of equal length. Reviews from
        ``pos`` come first (label 1), followed by reviews from ``neg``
        (label 0). Each text has its HTML tags removed via ``rm_tags``.
    """
    positive_path = os.path.join(base_dir, file_type, 'pos')
    negative_path = os.path.join(base_dir, file_type, 'neg')

    positive_files = [os.path.join(positive_path, name)
                      for name in os.listdir(positive_path)]
    negative_files = [os.path.join(negative_path, name)
                      for name in os.listdir(negative_path)]
    file_list = positive_files + negative_files

    print('read', file_type, 'files:', len(file_list))

    # Derive the labels from the actual number of files instead of assuming
    # 12,500 per class, so labels and texts stay aligned for any directory.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for file_name in file_list:
        with open(file_name, encoding='utf8') as review_file:
            all_texts.append(rm_tags(" ".join(review_file.readlines())))

    return all_labels, all_texts

#### 2.3 读取 IMDb 数据集目录

In [7]:
# Load the training split: labels go to y_train, tag-stripped review texts to train_text.
y_train, train_text = read_files('train')

read train files: 25000


In [8]:
# Load the test split the same way: labels go to y_test, review texts to test_text.
y_test, test_text = read_files('test')

read test files: 25000


#### 2.4 查看 IMDb 数据

In [9]:
# First entry of the list; read_files puts the reviews from pos/ first.
train_text[0]

'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.'

In [10]:
# Label of train_text[0]; read_files assigns 1 to pos/ reviews and 0 to neg/ reviews.
y_train[0]

1

In [11]:
# With 12,500 pos/ reviews listed first, index 12501 falls in the neg/ part of the list.
train_text[12501]

'Well...tremors I, the original started off in 1990 and i found the movie quite enjoyable to watch. however, they proceeded to make tremors II and III. Trust me, those movies started going downhill right after they finished the first one, i mean, ass blasters??? Now, only God himself is capable of answering the question "why in Gods name would they create another one of these dumpster dives of a movie?" Tremors IV cannot be considered a bad movie, in fact it cannot be even considered an epitome of a bad movie, for it lives up to more than that. As i attempted to sit though it, i noticed that my eyes started to bleed, and i hoped profusely that the little girl from the ring would crawl through the TV and kill me. did they really think that dressing the people who had stared in the other movies up as though they we\'re from the wild west would make the movie (with the exact same occurrences) any better? honestly, i would never suggest buying this movie, i mean, there are cheaper ways to 

In [12]:
# Label of train_text[12501]; expected to be 0 (negative).
y_train[12501]

0

### 3. 建立 token

#### 3.1 建立 token

In [13]:
# Build the vocabulary from the training reviews only (the test texts are not
# used for fitting). num_words=2000 limits which word indices are emitted when
# texts are later converted to sequences: only indices below 2000, i.e. the
# most frequent words, are kept (see the Keras Tokenizer docs).
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

#### 3.2 查看 token 读取文章数目

In [14]:
# Number of documents the tokenizer was fitted on; should match len(train_text).
print(token.document_count)

25000


#### 3.3 查看 token.word_index 属性

In [15]:
# word_index is the word -> integer index mapping learned by fit_on_texts.
# NOTE(review): this prints the whole dictionary, which floods the output;
# consider showing only a small slice of it instead.
print(token.word_index)



### 4. 将 "影评文字" 转换成 "数字列表"

#### 4.1 开始转换

In [16]:
# Replace each review by the list of vocabulary indices of its words. The same
# tokenizer (fitted on the training texts) is applied to both splits so that
# indices mean the same thing in train and test.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

#### 4.2 查看转换前后的差别

In [17]:
# Show one review as raw text and as its index sequence, separated by a blank line.
print('before texts_to_sequences:')
print(train_text[0])
print()
print('after texts_to_sequences:')
print(x_train_seq[0])

before texts_to_sequences:
For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.

after texts_to_sequences:
[14, 3, 16, 11, 210, 53, 1157, 46, 248, 22, 3, 172, 4, 902, 14, 10, 1524, 833, 3, 16, 117, 912, 6, 161, 158, 6, 3, 132, 1, 105, 6, 31, 1551, 102, 14, 1604, 1, 1787, 13, 3, 564]


### 5. 让转换后的数字长度相同

#### 5.1 开始转换

In [18]:
# Force every sequence to exactly 100 entries. With the defaults used here,
# shorter sequences get zeros prepended and longer ones are cut from the front
# (the last 100 indices are kept) — see the printed examples in section 5.2.
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

#### 5.2 查看转换前后的差别

In [19]:
# Example of a sequence shorter than maxlen (41 indices in the recorded run):
# after padding it has length 100 with zeros filling the front.
print('before pad sequences length:', len(x_train_seq[0]))
print(x_train_seq[0])
print()
print('after pad sequences length:', len(x_train[0]))
print(x_train[0])

before pad sequences length: 41
[14, 3, 16, 11, 210, 53, 1157, 46, 248, 22, 3, 172, 4, 902, 14, 10, 1524, 833, 3, 16, 117, 912, 6, 161, 158, 6, 3, 132, 1, 105, 6, 31, 1551, 102, 14, 1604, 1, 1787, 13, 3, 564]

after pad sequences length: 100
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   14    3   16   11  210   53 1157   46  248   22    3
  172    4  902   14   10 1524  833    3   16  117  912    6  161  158
    6    3  132    1  105    6   31 1551  102   14 1604    1 1787   13
    3  564]


In [20]:
# Example of a sequence longer than maxlen (111 indices in the recorded run):
# after padding only the last 100 indices remain.
# NOTE(review): this duplicates the previous cell apart from the index; a small
# helper taking the index would avoid the copy-paste.
print('before pad sequences length:', len(x_train_seq[1]))
print(x_train_seq[1])
print()
print('after pad sequences length:', len(x_train[1]))
print(x_train[1])

before pad sequences length: 111
[1158, 185, 16, 1058, 15, 800, 1584, 17, 30, 298, 4, 1313, 13, 3, 180, 17, 639, 15, 3, 1827, 33, 6, 5, 985, 14, 37, 30, 1, 5, 604, 1, 135, 15, 22, 51, 69, 1989, 1, 1305, 224, 6, 399, 6, 1216, 13, 17, 50, 1094, 79, 3, 943, 30, 3, 19, 1, 346, 1862, 179, 62, 376, 1, 582, 3, 2, 374, 22, 3, 172, 2, 6, 83, 249, 13, 3, 564, 1247, 1, 16, 6, 750, 3, 1652, 4, 892, 2, 1, 17, 47, 3, 444, 19, 1, 114, 30, 1, 6, 364, 4, 834, 121, 69, 30, 163, 484, 33, 3, 273, 15, 301, 237, 35]

after pad sequences length: 100
[1313   13    3  180   17  639   15    3 1827   33    6    5  985   14
   37   30    1    5  604    1  135   15   22   51   69 1989    1 1305
  224    6  399    6 1216   13   17   50 1094   79    3  943   30    3
   19    1  346 1862  179   62  376    1  582    3    2  374   22    3
  172    2    6   83  249   13    3  564 1247    1   16    6  750    3
 1652    4  892    2    1   17   47    3  444   19    1  114   30    1
    6  364    4  834  121   69   30  163 