In [1]:
# 导入需要用的库
import os
import tarfile
# 软件包的解压
import urllib.request
#网络下载的请求
import tensorflow as tf
import numpy as np

import re
# regular expressions (regex), not "regularization"
import string

from random import randint

In [2]:
# Download URL of the aclImdb sentiment dataset archive and the local path
# the archive is stored at.
url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filepath='data/aclImdb_v1.tar.gz'

In [3]:
# Make sure the local data directory exists before anything is written to it
if not os.path.exists('data'):
    os.makedirs('data')
# Fetch the archive (roughly 80 MB) only when no local copy is present
if os.path.isfile(filepath):
    print(filepath,'is existed')
else:
    print('downloading...')
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

data/aclImdb_v1.tar.gz is existed


In [4]:
# Unpack the archive unless an earlier run already extracted it
if not os.path.exists('data/aclImdb'):
    # The context manager closes the archive handle even if extraction fails
    # (the handle was previously left open).
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; on Python versions that support it, consider passing
    # filter='data' since the archive was downloaded over plain HTTP.
    with tarfile.open(filepath,'r:gz') as tfile:
        print('extracting...')
        tfile.extractall('data/')
    print('extraction completed')
else:
    print('data/aclImdb is existed!')

data/aclImdb is existed!


In [5]:
def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag (such as ``<br />``) deleted.

    A tag is a ``<``, followed by one or more characters other than ``>``,
    followed by ``>``. Each match is replaced with the empty string, so no
    whitespace is inserted where a tag used to be.
    """
    return re.sub(r'<[^>]+>','',text)

In [6]:
def read_files(filetype):
    """Load the labelled reviews of one dataset split from ``data/aclImdb/``.

    Args:
        filetype: Name of the split sub-directory, e.g. ``"train"`` or
            ``"test"``. The reviews are read from ``<filetype>/pos/`` and
            ``<filetype>/neg/`` below ``data/aclImdb/``.

    Returns:
        A tuple ``(all_labels, all_texts)`` of two equally long lists.
        Labels are one-hot: ``[1, 0]`` for files under ``pos/`` and
        ``[0, 1]`` for files under ``neg/``. Texts are the file contents
        with HTML tags removed by ``remove_tags``. All positive reviews
        come first, followed by all negative reviews.

    Raises:
        OSError: if one of the directories or files cannot be read.
    """
    path='data/aclImdb/'
    positive_path=path+filetype+'/pos/'
    negative_path=path+filetype+'/neg/'

    # os.listdir() returns names in arbitrary order; sorting makes the sample
    # order (and everything derived from it) reproducible across runs/machines.
    pos_files=[positive_path+f for f in sorted(os.listdir(positive_path))]
    neg_files=[negative_path+f for f in sorted(os.listdir(negative_path))]
    file_list=pos_files+neg_files
    pos_files_num=len(pos_files)
    neg_files_num=len(neg_files)

    print('read',filetype,'files:',len(file_list))
    print(pos_files_num,'pos files in',filetype,'files')
    print(neg_files_num,'neg files in',filetype,'files')

    # One-hot labels. Build a fresh list per sample: [[1,0]]*n would repeat a
    # single list object n times, so mutating one label would change them all.
    all_labels=([[1,0] for _ in range(pos_files_num)]
                +[[0,1] for _ in range(neg_files_num)])

    # Read every review and strip HTML tags such as <br /> from it
    all_texts=[]
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts.append(remove_tags(file_input.read()))
    return all_labels,all_texts

In [7]:
# Load the one-hot labels and tag-stripped texts of the train and test splits
train_labels,train_texts=read_files("train")
test_labels,test_texts=read_files('test')

read train files: 25000
12500 pos files in train files
12500 neg files in train files
read test files: 25000
12500 pos files in test files
12500 neg files in test files


# 建立词汇词典 Token

In [8]:
# Tokenizer restricted to the most frequent words. Per the Keras documentation
# only the (num_words - 1) most common words are kept when texts are converted
# to sequences.
token=tf.keras.preprocessing.text.Tokenizer(num_words=4000)

In [9]:
# Build the vocabulary from the training texts only
token.fit_on_texts(train_texts)

In [10]:
# Number of documents the tokenizer has been fitted on
token.document_count

25000

In [11]:
# print(token.word_index)  # words ranked by how frequently they occur

In [12]:
# Maps each word to the number of fitted documents/texts it appeared in
token.word_docs

defaultdict(int,
            {'at': 12936,
             'satire': 209,
             'me': 7329,
             "isn't": 2587,
             'pomp': 8,
             'teachers': 64,
             'scramble': 6,
             'down': 3092,
             'far': 2551,
             'right': 2772,
             'closer': 190,
             'your': 4266,
             'a': 24173,
             'line': 1613,
             'here': 4199,
             "i'm": 3655,
             'years': 3638,
             'insightful': 63,
             'repeatedly': 117,
             'cartoon': 367,
             'which': 7572,
             'one': 14096,
             'sack': 42,
             'think': 5440,
             'tried': 704,
             'knew': 822,
             'pity': 225,
             'to': 23474,
             'in': 22036,
             'comedy': 2337,
             'time': 8719,
             'through': 3992,
             'that': 20039,
             'profession': 61,
             'programs': 62,
             'my': 81

In [13]:
# Convert every review text into a list of integer word indices
train_sequences=token.texts_to_sequences(train_texts)
test_sequences=token.texts_to_sequences(test_texts)

In [14]:
# Show the raw text of the first training review
print(train_texts[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [15]:
# Show the integer index sequence produced for the first training review
print(train_sequences[0])

[308, 6, 3, 1068, 208, 8, 2160, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 2000, 6, 72, 2422, 5, 631, 70, 6, 1, 5, 2001, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 3021, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 3710, 798, 5, 3509, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 2970, 142, 129, 5, 27, 4, 125, 1470, 2372, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 2231, 11, 8, 214]


# 让转换后数字列表长度相同

In [16]:
# Force every sequence to exactly 400 entries: longer ones lose their tail
# (truncating='post'), shorter ones are padded at the end (padding='post').
pad_options=dict(padding='post',truncating='post',maxlen=400)
x_train=tf.keras.preprocessing.sequence.pad_sequences(train_sequences,**pad_options)
x_test=tf.keras.preprocessing.sequence.pad_sequences(test_sequences,**pad_options)
# Turn the lists of one-hot labels into NumPy arrays
y_train=np.asarray(train_labels)
y_test=np.asarray(test_labels)

In [17]:
# Sanity check: (number of reviews, padded sequence length)
x_train.shape

(25000, 400)

# 建立模型

In [18]:
# Start from an empty Sequential model; layers are appended one by one below
model=tf.keras.models.Sequential()

In [19]:
# Embedding layer: input_dim=4000 matches the tokenizer's num_words,
# output_dim=32 is the size of each word vector, and input_length=400 is the
# padded review length.
model.add(tf.keras.layers.Embedding(output_dim=32,
                                   input_dim=4000,
                                   input_length=400))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [22]:
# Reduce the embedded word sequence to a single vector with a recurrent layer.
# Plain-RNN alternative kept for reference:
# model.add(tf.keras.layers.SimpleRNN(units=16))
# Bidirectional LSTM with 8 units per direction
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=8)))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
# Fully connected hidden layer: 32 units with ReLU activation
model.add(tf.keras.layers.Dense(units=32,activation='relu'))

In [24]:
# Dropout with rate 0.3 applied to the hidden layer's output
model.add(tf.keras.layers.Dropout(0.3))

In [25]:
# Output layer: softmax over the two one-hot classes ([1,0]=pos, [0,1]=neg)
model.add(tf.keras.layers.Dense(units=2,activation='softmax'))

In [26]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 32)           128000    
_________________________________________________________________
bidirectional (Bidirectional (None, 16)                2624      
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 131,234
Trainable params: 131,234
Non-trainable params: 0
_________________________________________________________________


# 训练模型

In [27]:
# Adam optimizer; categorical cross-entropy pairs with the one-hot labels and
# the softmax output layer; accuracy is reported as the metric.
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [None]:
# validation_split takes the LAST 20% of the arrays, before any shuffling.
# x_train/y_train are ordered positives first, then negatives, so without an
# explicit shuffle the validation set would contain only negative reviews and
# the training set would be skewed toward positives. Shuffle once with a fixed
# seed so the split is both balanced and reproducible.
shuffle_idx=np.random.RandomState(42).permutation(len(x_train))
history=model.fit(x_train[shuffle_idx],y_train[shuffle_idx],
                 validation_split=0.2,
                 epochs=10,
                 batch_size=128,
                 verbose=1)

Train on 20000 samples, validate on 5000 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

# 可视化

# 评估准确率

In [None]:
# Compute loss and accuracy on the test split and report the accuracy
test_loss,test_acc=model.evaluate(x_test,y_test,verbose=1)
print('Test Accuracy',test_acc)

# 执行预测

In [None]:
# Run the model on every test review and show the softmax output for the first
predictions=model.predict(x_test)
predictions[0]

# 查看数据的预测结果

In [None]:
# Maps the index of the largest one-hot / softmax entry to a readable label
sentiment_dict={0:'pos',1:'neg'}
def display_test_sentiment(i):
    """Print test review ``i`` together with its true and predicted sentiment.

    Reads the notebook-level ``test_texts``, ``y_test`` and ``predictions``.

    Args:
        i: Index of the review within the test split.
    """
    print(test_texts[i])
    # argmax turns the one-hot label / softmax output into a class index
    print('label values',sentiment_dict[np.argmax(y_test[i])],
          'predict value:',sentiment_dict[np.argmax(predictions[i])])
    

In [None]:
# Show the text, true label and predicted label of the first test review
display_test_sentiment(0)


网上找评论来试验