In [2]:
# 在线学习及外存学习（采用随机梯度下降法，使用小型子批次文档来训练）
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text): # 文本清洗及处理
    text = re.sub('<.*>', '', text) # 去除html标记
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [3]:
# 生成器，每次从本地文件读取一个文档并返回
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # 跳过第一行（csv本身是一个生成器）
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [4]:
next(stream_docs(path='./movie_data.csv'))

('"Unlike some movies which you can wonder around and do other things, this movie kept me in front of the screen for the entire two hours. I loved every minute of it.<br /><br />However, I have to say that the story is not very believable. Especially when the foreigner was expelled by the government, and then later on, actually sent a package to the guy who helped him. Xiao Liu is a very good actor, he shows his emotions, and he shows his silliness, and his love toward that girl."',
 1)

In [5]:
def get_minibatch(doc_stream, size): # 批处理
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1) # loss表示损失函数log,perception,hinge(svm)
doc_stream = stream_docs('./movie_data.csv')

In [7]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1]) # 有什么用
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes) # 分批次学习（更新权重）
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:15


In [8]:
X_test, y_test = get_minibatch(doc_stream, 5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.839


In [9]:
clf.partial_fit(X_test, y_test) #利用测试集升级分类器



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=1,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=1, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [10]:
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=1,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=1, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [12]:
import pickle, os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)