In [6]:
import  pickle
import jieba
import pandas as pd
import numpy as np
from gensim import corpora, models
from pprint import pprint
import traceback
import sys
from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [10]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

在正式处理之前，我们想对语料中的每一个单词关联一个唯一的ID。这可以用 gensim.corpora.Dictionary 来实现。这个字典定义了我们要处理的所有单词表。

In [81]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

W2V_DIR = BASE_DIR + '/embeddings/'

TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_SEQUENCE_LENGTH = 100

MAX_NUM_WORDS = 33950
MAX_NB_WORDS = 30000

EMBEDDING_DIM = 300

VALIDATION_SPLIT = 0.2

BATCH_SIZE = 32

In [186]:
df_train_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')
df_test_dataset = pd.read_csv('./data/Preliminary-texting.csv', encoding='utf-8')

In [187]:
df_train_dataset.shape

(20000, 3)

In [188]:
df_test_dataset.shape

(65499, 2)

In [189]:
df_dataset = pd.concat([df_train_dataset['COMMCONTENT'],df_test_dataset['COMMCONTENT']], ignore_index=True)

In [190]:
df_dataset.shape

(85499,)

#### 建立评论语料库

In [195]:
reviews_corpus = []

In [196]:
for sent in df_dataset:

    # Extract Sentence
    sent = str(sent).strip()

    sent = clean_str(sent)
    
    stop_words = [" "]

    seg_list = jieba.cut(sent, cut_all=False)

    seg_list = [word for word in seg_list if word not in stop_words]
    
    reviews_corpus.append(" ".join(seg_list))

In [200]:
len(reviews_corpus)

85499

In [201]:
vocab,vocab_freqs = build_vocab(reviews_corpus)

In [204]:
len(vocab)

2554361

In [205]:
len(vocab_freqs)

84219

In [206]:
vocab_freqs.most_common()

[('的', 156880),
 ('了', 61610),
 ('是', 38290),
 ('去', 31576),
 ('很', 29660),
 ('也', 23266),
 ('都', 22816),
 ('在', 21771),
 ('不', 20508),
 ('有', 20093),
 ('就', 20074),
 ('还', 17927),
 ('可以', 15686),
 ('没有', 15332),
 ('不错', 14806),
 ('好', 14395),
 ('人', 14167),
 ('我', 12944),
 ('就是', 12870),
 ('一个', 11760),
 ('到', 11281),
 ('和', 10877),
 ('玩', 10697),
 ('景区', 10633),
 ('感觉', 10308),
 ('地方', 10232),
 ('还是', 9989),
 ('多', 9437),
 ('没', 9267),
 ('景点', 9238),
 ('看', 8854),
 ('值得', 8780),
 ('要', 8574),
 ('说', 8325),
 ('门票', 8265),
 ('里面', 7996),
 ('很多', 7786),
 ('我们', 7496),
 ('比较', 7208),
 ('一般', 6387),
 ('上', 6353),
 ('来', 6178),
 ('走', 6133),
 ('吧', 5763),
 ('元', 5762),
 ('非常', 5760),
 ('但是', 5680),
 ('什么', 5634),
 ('太', 5626),
 ('总体', 5612),
 ('这里', 5447),
 ('大', 5401),
 ('这个', 5379),
 ('不是', 5184),
 ('个', 5147),
 ('但', 5144),
 ('你', 5120),
 ('有点', 5071),
 ('还有', 5049),
 ('小时', 4995),
 ('挺', 4950),
 ('时候', 4788),
 ('孩子', 4727),
 ('买', 4681),
 ('着', 4615),
 ('景色', 4534),
 ('时间', 4469),
 ('给

生成 corpus 
raw_corpus = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [91]:
precessed_corpus = []
for sent in corpus:
    precessed_corpus.append(sent)

In [94]:
corpus[:5]

['普通 公园 一个 只是 多 了 几个 泉 而已 人不多 适合 老人 孩子 闲逛 买票 的话 还是 贵 了 人家 说 6.30 之 前进 园 不用 花钱',
 '跟 儿子 在 里面 玩 了 一天 非常 好 跟 儿子 在 里面 玩 了 一天 非常 好 真的 很 不错 哦 有空 还要 去',
 '这 已经 是 第五次 来 这里 玩 了 每次 孩子 都 很 喜欢 不 愿意 从水里 出来 有 机会 还会 再 来 还有 比 我 更 忠诚 的 客户 吗 哈哈',
 '当天 在 携程 上定 的 票 打 温泉 度假村 咨询电话 和 携程 客服 都 说 次日 生效 但 到 酒店 后 票能 用 请客 服 人员 了解 清楚 再 回答 咨询 问题 不然 听信 就 得 中途 掉头 回家 了',
 '烟台 历史 的 一部分 非常 值得 推荐 去 看看 海边 景色 也 很漂亮']

In [57]:
dictionary = corpora.Dictionary(corpus)

In [58]:
len(dictionary.dfs.items())

33950

In [59]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x12667d0f0>

In [60]:
dictionary.token2id

{'6.30': 0,
 '一个': 1,
 '不用': 2,
 '之': 3,
 '买票': 4,
 '了': 5,
 '人不多': 6,
 '人家': 7,
 '公园': 8,
 '几个': 9,
 '前进': 10,
 '只是': 11,
 '园': 12,
 '多': 13,
 '孩子': 14,
 '普通': 15,
 '泉': 16,
 '的话': 17,
 '老人': 18,
 '而已': 19,
 '花钱': 20,
 '说': 21,
 '贵': 22,
 '还是': 23,
 '适合': 24,
 '闲逛': 25,
 '一天': 26,
 '不错': 27,
 '儿子': 28,
 '去': 29,
 '哦': 30,
 '在': 31,
 '好': 32,
 '很': 33,
 '有空': 34,
 '玩': 35,
 '真的': 36,
 '跟': 37,
 '还要': 38,
 '里面': 39,
 '非常': 40,
 '不': 41,
 '从水里': 42,
 '再': 43,
 '出来': 44,
 '吗': 45,
 '哈哈': 46,
 '喜欢': 47,
 '客户': 48,
 '已经': 49,
 '忠诚': 50,
 '愿意': 51,
 '我': 52,
 '是': 53,
 '更': 54,
 '有': 55,
 '机会': 56,
 '来': 57,
 '每次': 58,
 '比': 59,
 '的': 60,
 '第五次': 61,
 '还会': 62,
 '还有': 63,
 '这': 64,
 '这里': 65,
 '都': 66,
 '上定': 67,
 '不然': 68,
 '中途': 69,
 '了解': 70,
 '人员': 71,
 '但': 72,
 '到': 73,
 '后': 74,
 '听信': 75,
 '和': 76,
 '咨询': 77,
 '咨询电话': 78,
 '回家': 79,
 '回答': 80,
 '客服': 81,
 '就': 82,
 '度假村': 83,
 '当天': 84,
 '得': 85,
 '打': 86,
 '掉头': 87,
 '携程': 88,
 '服': 89,
 '次日': 90,
 '清楚': 91,
 '温泉': 92,
 '生效': 93,
 '

1927

In [None]:
def filter_tokens(vocab,threshold_num=10):
        small_freq_ids = [
            tokenid for tokenid, docfreq in self.dictionary.dfs.items()
            if docfreq < threshold_num
        ]
        self.dictionary.filter_tokens(small_freq_ids)
        self.dictionary.compactify()

In [79]:
small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
small_freq_ids

[]

In [75]:
dictionary.filter_tokens(small_freq_ids)

In [66]:
dictionary.compactify()

In [78]:
dictionary.__len__()

1927

In [None]:
print "Beigin to Train the model"
lr_model = LogisticRegression()
lr_model.fit(self.train_set, self.train_tag)
print "End Now, and evalution the model with test dataset"
# print "mean accuracy: {0}".format(lr_model.score(self.test_set, self.test_tag))
y_pred = lr_model.predict(self.test_set)
print classification_report(self.test_tag, y_pred)
print confusion_matrix(self.test_tag, y_pred)
print "save the trained model to tfidf_lr_model.pl"
joblib.dump(lr_model, self.data_path.replace("all_title.csv","tfidf_lr_model.pl"))

In [None]:
class tfidf_text_classifier:
    """ tf_idf_text_classifier: a text classifier based on tfidf
    """
    def __init__(self, data_path):
        self.data_path = data_path
        self.dictionary = corpora.Dictionary()
        self.corpus = []
        self.labels = []
        # self.cut_doc_obj = cutDoc()
     
    
    def get_tokens_from_csv(self):
        """ get all tokens of the corpus
        """
        df_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')
        
#         fwrite = open(self.data_path.replace("all_title.csv", "all_token.csv"), 'w')
#         with open(self.data_path, "r") as fread:
#             i = 0
#             # while True:
#             for line in fread.readlines():
#                 try:
#                     line_list = line.strip().split("\t")
#                     label = line_list[0]
#                     self.labels.append(label)
#                     text = line_list[1]
#                     text_tokens = self.cut_doc_obj.run(text)
#                     self.corpus.append(' '.join(text_tokens))
#                     self.dictionary.add_documents([text_tokens])
#                     fwrite.write(label + "\t" + "\\".join(text_tokens) + "\n")
#                     i += 1
#                 except BaseException as e:
#                     msg = traceback.format_exc()
#                     print msg
#                     print "=====>Read Done<======"
#                     break
#         self.token_len = self.dictionary.__len__()
#         print "all token len " + str(self.token_len)
#         self.num_data = i
#         fwrite.close()
        
    def filter_tokens(self, threshold_num=10):
        small_freq_ids = [
            tokenid for tokenid, docfreq in self.dictionary.dfs.items()
            if docfreq < threshold_num
        ]
        self.dictionary.filter_tokens(small_freq_ids)
        self.dictionary.compactify()
        
    def train(self):
        print "Beigin to Train the model"
        lr_model = LogisticRegression()
        lr_model.fit(self.train_set, self.train_tag)
        print "End Now, and evalution the model with test dataset"
        # print "mean accuracy: {0}".format(lr_model.score(self.test_set, self.test_tag))
        y_pred = lr_model.predict(self.test_set)
        print classification_report(self.test_tag, y_pred)
        print confusion_matrix(self.test_tag, y_pred)
        print "save the trained model to tfidf_lr_model.pl"
        joblib.dump(lr_model, self.data_path.replace("all_title.csv","tfidf_lr_model.pl"))

In [None]:
bow_text_classifier_obj = tfidf_text_classifier("../data/origin_data/all_title.csv")

In [None]:
bow_text_classifier_obj.vec()
bow_text_classifier_obj.split_train_test()
bow_text_classifier_obj.train()