In [1]:
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = 'aclImdb/{}/{}'.format(s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as file:
                txt = file.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

FileNotFoundError: [Errno 2] No such file or directory: 'aclImdb/test/pos'

In [2]:
import numpy as np
np.random.permutation(df.index)

NameError: name 'df' is not defined

In [3]:
# 将数据重新排序（打乱正例与反例的顺序）
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index)) # 打乱索引顺序并按照新索引排序
df.to_csv('./movie_data.csv', index=False) # 保存到本地文件

NameError: name 'df' is not defined

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('./import movie_data.csv')
df.head()

FileNotFoundError: File b'movie_data.csv' does not exist

In [2]:
# 构建词袋模型
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [3]:
print(bag)
print(count.vocabulary_) # dict形式，单词及其对应的索引
print(bag.toarray()) # 得到词向量tf(t,d) 单词t在文档d中出现的次数
print(count.get_feature_names())

  (0, 2)	1
  (0, 1)	1
  (0, 3)	1
  (0, 5)	1
  (1, 4)	1
  (1, 6)	1
  (1, 1)	1
  (1, 5)	1
  (2, 0)	1
  (2, 4)	1
  (2, 6)	1
  (2, 2)	1
  (2, 1)	2
  (2, 3)	1
  (2, 5)	2
{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]
['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']


In [6]:
# 词频-逆文档频率（tf-idf）
# tf-idf(t, d) = tf(t,d) * idf(t,d)
# idf(t,d) = log((1+nd)/(1+df(d,t)) # nd为文档总数；df(d,t)为包含t的文档数
# 计算之后需对词向量进行归一化
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray()) # 在词袋的基础上得到tf-idf

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


In [7]:
df.loc[4, 'review'][:50] # loc => index+column; iloc => index, index

'The Good:<br /><br />Effective color scheme. Good '

In [8]:
import re
def preprocessor(text):
    text = re.sub('<.*>', '', text) # 去除html标记
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [9]:
preprocessor(df.loc[4, 'review'][:50])

'the good effective color scheme good '

In [10]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [11]:
df['review'] = df['review'].apply(preprocessor)

In [12]:
df.loc[10, 'review']

'well '

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,unlike some movies which you can wonder around...,1
1,to be honest i did never read one of the comic...,0
2,the long list of big names in this flick inclu...,0
3,this movie is not your typical horror movie it...,1
4,the good not worth a watch threadbare story su...,0


In [15]:
# 标记文档                                                              
# 词干提取，标记单词原型                                   
from nltk.stem.porter import PorterStemmer                              
porter = PorterStemmer()                                                
def tokenizer_porter(text):                                             
   return [porter.stem(word) for word in text.split()]      
tokenizer_porter('runner likes running and thus they run')


['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [16]:
# 去停
from nltk.corpus import stopwords                                       
stop = stopwords.words('english')                          
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [18]:
# 不是按照比例选取吗
X_train = df.loc[:25000, 'review'].values                               
y_train = df.loc[:25000, 'sentiment'].values               
X_test = df.loc[25000:, 'review'].values                                 
y_test = df.loc[25000:, 'sentiment'].values                              
X_train.shape

(25001,)

In [19]:
from sklearn.grid_search import GridSearchCV                            
from sklearn.pipeline import Pipeline                      
from sklearn.linear_model import LogisticRegression                     
from sklearn.feature_extraction.text import TfidfVectorizer             
tfidf = TfidfVectorizer(strip_accents=None, lowercase=None, preprocessor=None) # 结合了词袋模型及
tfidf.get_params()



{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': None,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [21]:
def tokenizer(text):                                                    
    return text.split() 

In [22]:
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])

In [31]:
params_grid = [{'vect__ngram_range': [(1,1)],
      'vect__stop_words': [stop, None],
      'vect__tokenizer': [tokenizer_porter,tokenizer],
      'vect__norm': ['l1', 'l2'], # 数据标准化
      'clf__penalty': ['l1', 'l2'],
      'clf__C': [1.0, 10.0, 100.0]
     },                  
    {'vect__ngram_range': [(1,1)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer_porter,tokenizer],
     'vect__use_idf': [False], # 不使用idf
     'vect__norm': [None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [1.0, 10.0, 100.0]}]
gs_lr_tfidf = GridSearchCV(lr_tfidf, params_grid, scoring='accuracy', cv=5, verbose=-1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 73.2min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 96.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=None, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='ac

In [34]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__norm': 'l2',
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer>}

In [33]:
gs_lr_tfidf.best_score_

0.8564057437702491