## 加载并查看数据

In [1]:
import pandas as pd       
train = pd.read_csv("./labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [2]:
train.shape

(25000, 3)

In [3]:
train.head(5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [5]:
print (train["review"][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

### 数据预处理

- 去除HTML标签等等，使用BeautifulSoup4 （安装：pip install BeautifulSoup4）

In [6]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup             

# Initialize the BeautifulSoup object on a single movie review     
example_bs = BeautifulSoup(train["review"][0])  

# Print the raw review and then the output of get_text(), for 
# comparison with the last output
print (example_bs.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


- 正则表达式删去非字母的符号

In [7]:
import re
example_letters_only = re.sub("[^a-zA-Z]", " ", example_bs.get_text())
print (example_letters_only)

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

- 全部变小写

In [8]:
example_lower_case = example_letters_only.lower()
print (example_lower_case)

 with all this stuff going down at the moment with mj i ve started listening to his music  watching the odd documentary here and there  watched the wiz and watched moonwalker again  maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring  some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for    mi

- 分词

In [9]:
example_words = example_lower_case.split()
print (example_words)

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'all', 'about', 

- 去停用词

In [10]:
from nltk.corpus import stopwords

# In Python, searching a set is much faster than searching
#    a list, so convert the stop words to a set
stops = set(stopwords.words("english"))                  
# 
# Remove stop words
example_meaningful_words = [w for w in example_words if not w in stops]

print (example_meaningful_words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

综上，预处理步骤如下：

In [11]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [12]:
print ("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = []
num_reviews = train["review"].size
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews ))                                                                   
    clean_train_reviews.append( review_to_words( train["review"][i] ))

Cleaning and parsing the training set movie reviews...





 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



### 使用课上的CNN4Text进行情感分析，采用Conv1D

In [13]:
len(clean_train_reviews)

25000

In [14]:
import warnings
warnings.filterwarnings('ignore')
import numpy
from numpy import array
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Activation
from keras.layers import Conv1D, Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [15]:
clean_train_reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [40]:
max_features = 1000 #vocabulary size
encoded_docs = [one_hot(d, vocab_size) for d in clean_train_reviews]
print(encoded_docs[:2])

[[496, 541, 672, 967, 410, 86, 804, 653, 457, 727, 970, 151, 970, 365, 391, 961, 212, 725, 154, 202, 64, 598, 885, 3, 391, 27, 219, 391, 943, 29, 365, 898, 335, 898, 502, 559, 319, 541, 560, 143, 624, 266, 180, 657, 967, 538, 534, 108, 552, 470, 198, 942, 917, 926, 171, 77, 665, 400, 446, 208, 929, 924, 967, 458, 541, 426, 440, 390, 967, 889, 967, 10, 199, 743, 566, 967, 515, 861, 229, 46, 515, 926, 598, 149, 491, 502, 559, 596, 873, 207, 307, 291, 109, 353, 80, 490, 640, 683, 986, 604, 934, 569, 177, 967, 205, 917, 303, 967, 503, 522, 775, 490, 640, 263, 887, 490, 670, 699, 82, 942, 986, 538, 391, 146, 967, 804, 79, 885, 686, 924, 967, 785, 507, 206, 749, 65, 725, 80, 552, 631, 783, 268, 268, 437, 134, 651, 917, 80, 671, 639, 426, 739, 930, 163, 386, 364, 749, 577, 191, 839, 969, 959, 455, 51, 566, 670, 924, 967, 930, 119, 250, 276, 670, 746, 372, 467, 423, 956, 198, 580, 967, 605, 943, 566, 769, 400, 446, 495, 930, 638, 670, 299, 984, 88, 943, 405, 132, 317, 518, 184, 405, 699, 670, 

In [1]:
labels = array(train["sentiment"])

NameError: name 'array' is not defined

In [42]:
len(labels)

25000

In [43]:
import random
indexes = random.sample(range(0, len(encoded_docs)), int(len(encoded_docs)*0.7))
tindexes = [i for i in range(0, len(encoded_docs)) if i not in indexes]

In [44]:
len(tindexes)

7500

In [62]:
(X_train, y_train) = ([encoded_docs[i] for i in indexes], array([labels[i] for i in indexes]))

In [63]:
(X_test, y_test) = ([encoded_docs[i] for i in tindexes], array([labels[i] for i in tindexes]))

In [64]:
len(X_train[0]), len(X_train[1]), len(X_train[2])

(291, 188, 16)

In [65]:
# [[1,14,22,..],[....],[....]]  -> 补全长度,不够的补0,太多的砍掉
from keras.preprocessing import sequence
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) #padding
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [66]:
len(X_train[0]), len(X_train[1]), len(X_train[2])

(1600, 1600, 1600)

In [67]:
model = Sequential()

In [68]:
# word indexes -> word vectors
# [[1],[1,2,3] -> [[0.9, 0.1, 0.1], [0.9, 0.9, 0.9]]
# add Embedding Layer
embedding_vecor_length = 300
model.add(Embedding(max_features, embedding_vecor_length, input_length=max_review_length)) 
# input: an integer matrix of size (batch, input_length), i.e.,(None, 1600) 
# input (30, 3, 4) inputsize =(3,4)
# output: (None, max_review_length, embedding_vecor_length), i.e.,(None, 1600, 300) where None is the batch dimension(变长) 

In [69]:
# add Conv layer
# Input:（samples，max_review_length，embedding_vecor_length）的3D张量
# Output:（samples，new_steps，nb_filter）的3D张量，
nb_filter = 250
filter_length = 3
# 'valid': no padding, 'same': output=input, 有padding
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='valid', activation='relu'))
# now model.output_shape == (None, nb_filter), i.e., (None, 1598, 250)
# (input + 2padding -filter) / stride + 1 



In [70]:
# add MaxPooling Layer
model.add(MaxPooling1D(pool_length=model.output_shape[1]))
# output: (None, 1, 250)

In [71]:
# get many small vectors
model.add(Flatten())
# output: (None, 250)

In [72]:
# Classifier: 2 Layer Neural Network (MLP)
hidden_dims = 200
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1)) #y=0/1
model.add(Activation('sigmoid')) # model.add(Dense(1, activation='sigmoid'))

In [73]:
# choose loss function and optimizing method
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [74]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1600, 300)         300000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1598, 250)         225250    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1, 250)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 250)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 200)               50200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
activation_3 (Activation)    (None, 200)               0         
__________

In [75]:
y_train

array([1, 0, 1, ..., 1, 0, 0])

In [76]:
# train the model
batch_size = 32
nb_epoch = 2
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x12a853940>

In [77]:
# Evaluation on the test set
scores = model.evaluate(X_test, y_test) #, verbose=0
print ("Accuracy: %.2f%%" % (scores[1]*100)) # loss, accuracy

Accuracy: 79.40%
