## eg. KNN model

In [117]:
from collections import Counter
from scipy.spatial.distance import euclidean

# def data_set(X, y):
#     return [(xi, y) for ]

def distance(x1, x2):
    return euclidean(x1, x2)

def predict(x, X, y, k=5):
    """
    x--data to be predicted
    X--origin data
    y--label
    """
    # K nearest
    nearest = sorted(zip(X, y), key=lambda xi: distance(xi[0], x))[:k]
    y_list = [i[1] for i in nearest]
    
    count = Counter(y_list)
    
    return count.most_common()[0][0]

In [30]:
import numpy as np

X = np.random.random((20, 4))
y = np.random.randint(0, 2, size=20)

x = np.random.random((1, 4))

In [31]:
predict(x, X, y, k=5)

1

## 新华社新闻抄袭自动判别

In [35]:
!ls -l ../data/

total 172285
-rw-r--r-- 1 Raclerrr 197609 23613536 Feb  4  2019 SourceHanSerifSC-Regular.otf
-rw-r--r-- 1 Raclerrr 197609     3089 Feb  4  2019 chinese_stopwords.txt
-rw-r--r-- 1 Raclerrr 197609 63561249 Feb  4  2019 comment-classification.zip
-rw-r--r-- 1 Raclerrr 197609 42396032 Feb  4  2019 export_sql_1558435.zip
-rw-r--r-- 1 Raclerrr 197609 46833745 Feb  4  2019 movie_comments.csv
-rw-r--r-- 1 Raclerrr 197609      162 Feb  4  2019 readme.md


In [36]:
!unzip ../data/export_sql_1558435.zip

Archive:  ../data/export_sql_1558435.zip
  inflating: sqlResult_1558435.csv   


In [34]:
import pandas as pd

In [46]:
data_origin = pd.read_csv('sqlResult_1558435.csv', delimiter=',', encoding='gb18030')
data_origin.head(2)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm


In [48]:
data_origin = data_origin.loc[:, ['source', 'content']]
data_origin.head(2)

Unnamed: 0,source,content
0,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...


### 快速数据分析

In [49]:
import pandas_profiling

pandas_profiling.ProfileReport(data_origin)



In [50]:
# 新华社基本比例
data_ = data_origin.dropna()

print(len(data_[data_['source'] == '新华社']) / len(data_))

0.903609336948031


In [54]:
from tqdm import tqdm_notebook
from IPython.display import clear_output

tqdm_notebook().pandas()
clear_output()

# 预测标签处理
def pre_process_label(data_):
    
    def label_y(source):
        return 1 if source == '新华社' else 0

    data_['source'] = data_.source.progress_apply(label_y)
    
    return data_


data_ = pre_process_label(data_)
data_.sample(3)

HBox(children=(IntProgress(value=0, max=87052), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,source,content
72429,1,新华社北京5月23日电（记者黄小希）2016年度全国新闻发布工作评估考核结果日前揭晓，10个...
77420,1,新华社照片，北京，2017年5月27日\n施一——寻找病毒“软肋”的阳光男孩\n施一（左一）...
87583,1,新华社北京6月5日电十二届全国人大第十七期暨2017年第一期全国人大代表学习班5日在北京...


### 文本预处理

In [101]:
import jieba
import re


def process_stop_word(path):
    
    stop_words = []
    with open(path, encoding='utf-8') as f:
        stop_words = f.read().split('\n')
    
    return stop_words
    

def clean_not_chinese(data):
    """
    data--data_Series
    """
    re_data = []
    for i in data:
        i_re = ''.join(re.findall(r'[\u4e00-\u9fa5]', i))
        re_data.append(i_re.strip())
    return re_data
    
    

def tokenize(content, stop_words):
    jieba_cut = jieba.cut(content)
    
    tokens = ''
    for i in jieba_cut:
        if i not in stop_words:
            tokens += ' ' + i
            
    tokens = tokens.strip()
    
    return tokens


def pre_process(data_, stop_words_path):
    """complete process"""
    stop_words = process_stop_word(stop_words_path)

    data_['content'] = clean_not_chinese(data_['content'])
    data_['tokens'] = data_['content'].progress_apply(
                                                       lambda content: tokenize(content, stop_words))
    
    return data_

In [102]:
data_ = pre_process(data_, '../data/chinese_stopwords.txt')
data_.sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


HBox(children=(IntProgress(value=0, max=87052), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,source,content,tokens
26242,1,新华社照片外代年月日外代二线纽约国际车展媒体开放日月日在美国纽约捷豹汽车在纽约国际车展上获得...,新华社 照片 外代 年月日 外代 二线 纽约 国际 车展 媒体 开放 日月 美国纽约 捷豹 ...
62845,1,新华社马德里月日电报道员谢宇智西甲争冠已进入白热化阶段日西甲第轮结束皇马依旧在积分榜上平分紧...,新华社 马德里 日电 报道 员 谢宇智 西甲 争冠 进入 白热化 阶段 西甲 第轮 结束 皇...
36073,1,新华社北京月日电记者安蓓国家发展改革委日称我国已确定辽宁中部河北唐山等个城市经济区建设首批产...,新华社 北京 日电 记者 安蓓 国家 发展 改革 委日称 我国 确定 辽宁 中部 河北 唐山...


In [249]:
data_.to_csv('pre_processed_data.csv')

In [128]:
from collections import Counter


def count_common(data_):
    # positive words
    data_pos = data_[data_['source'] == 1]
    pos_words = ' '.join(data_pos['tokens']).split(' ')
    count_pos = Counter(pos_words)

    # negetive words
    data_neg = data_[data_['source'] == 0]
    neg_words = ' '.join(data_neg['tokens']).split(' ')
    count_neg = Counter(neg_words)
    
    return count_pos, count_neg

In [125]:
count_pos, count_neg = count_common(data_)

In [126]:
count_pos.most_common(n=100)

[('新华社', 141924),
 ('外代', 82425),
 ('二线', 61690),
 ('年月日', 60271),
 ('中国', 59943),
 ('记者', 56240),
 ('照片', 51923),
 ('国际', 32728),
 ('比赛', 31231),
 ('发展', 28305),
 ('当日', 26888),
 ('体育', 24474),
 ('举行', 24240),
 ('摄', 23425),
 ('北京', 22315),
 ('足球', 22268),
 ('国家', 21703),
 ('美国', 20677),
 ('进行', 20030),
 ('选手', 19795),
 ('合作', 18486),
 ('一带', 18112),
 ('一路', 18042),
 ('日电', 17500),
 ('经济', 17142),
 ('完', 17043),
 ('企业', 17033),
 ('表示', 15712),
 ('联赛', 15338),
 ('晋级', 15003),
 ('一个', 14971),
 ('工作', 14801),
 ('世界', 14225),
 ('球员', 14019),
 ('建设', 13847),
 ('队', 13732),
 ('活动', 13372),
 ('问题', 12660),
 ('项目', 12521),
 ('战胜', 12190),
 ('目前', 12077),
 ('法国', 11945),
 ('文化', 11697),
 ('社会', 11513),
 ('决赛', 11487),
 ('重要', 11453),
 ('全国', 11352),
 ('成为', 11147),
 ('网球', 10891),
 ('市场', 10823),
 ('服务', 10522),
 ('总统', 10462),
 ('今年', 10184),
 ('公司', 10155),
 ('以比', 10137),
 ('胜', 9659),
 ('赛季', 9459),
 ('技术', 9345),
 ('提供', 9191),
 ('地区', 9145),
 ('俄罗斯', 9088),
 ('参加', 9001),
 ('没有', 8913),


In [127]:
count_neg.most_common(n=100)

[('中国', 11133),
 ('发展', 9325),
 ('企业', 8840),
 ('市场', 8157),
 ('公司', 6965),
 ('一个', 6934),
 ('表示', 6413),
 ('进行', 5954),
 ('记者', 5901),
 ('工作', 5876),
 ('没有', 5078),
 ('经济', 4840),
 ('美国', 4794),
 ('建设', 4782),
 ('目前', 4757),
 ('项目', 4722),
 ('国家', 4678),
 ('问题', 4675),
 ('创新', 4646),
 ('已经', 4472),
 ('投资', 4135),
 ('城市', 4095),
 ('服务', 4064),
 ('亿元', 3915),
 ('报道', 3858),
 ('成为', 3815),
 ('相关', 3753),
 ('可能', 3658),
 ('今年', 3505),
 ('认为', 3327),
 ('时间', 3313),
 ('情况', 3264),
 ('产业', 3242),
 ('政策', 3192),
 ('方面', 3180),
 ('技术', 3157),
 ('国际', 3095),
 ('合作', 3068),
 ('提供', 2985),
 ('实现', 2985),
 ('资金', 2964),
 ('出现', 2961),
 ('主要', 2957),
 ('需要', 2940),
 ('产品', 2934),
 ('北京', 2929),
 ('要求', 2884),
 ('进入', 2880),
 ('数据', 2875),
 ('平台', 2843),
 ('行业', 2790),
 ('重要', 2762),
 ('对于', 2745),
 ('全国', 2696),
 ('机构', 2660),
 ('管理', 2645),
 ('活动', 2628),
 ('一些', 2585),
 ('影响', 2584),
 ('中心', 2577),
 ('增长', 2537),
 ('包括', 2474),
 ('显示', 2463),
 ('发现', 2437),
 ('万元', 2428),
 ('政府', 2422),
 ('未来', 2

In [129]:
# 考虑删除正例和负例中都出现的且频率很高的词，提高分类效果

### 文本表示

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, auc, roc_curve, confusion_matrix
import pandas as pd

data_ = pd.read_csv('pre_processed_data.csv')
data_.drop(columns='Unnamed: 0', inplace=True)
data_.dropna(inplace=True)

# 考虑非均衡数据处理
X_train, X_test, y_train, y_test = train_test_split(
    data_["tokens"], data_["source"], test_size=0.2, random_state=1)

In [68]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86848 entries, 0 to 87051
Data columns (total 3 columns):
source     86848 non-null int64
content    86848 non-null object
tokens     86848 non-null object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [36]:
def get_metrics(y_ture, y_predicted):
    """
    y_ture:真实值
    y_predicted：预测值
    """
    #精确度=真阳性/（真阳性+假阳性）
    precision = precision_score(y_ture, y_predicted, pos_label=1,average='binary')             
    #召回率=真阳性/（真阳性+假阴性）
    recall = recall_score(y_ture, y_predicted, pos_label=1,average='binary')   
    # F1
    f1 = f1_score(y_ture, y_predicted, pos_label=1, average='binary')
    #准确率
    accuracy = accuracy_score(y_ture, y_predicted, )
    #AUC
    fpr, tpr, thresholds = roc_curve(y_ture, y_predicted, pos_label=1)
    auc_ = auc(fpr, tpr)
    
    return accuracy, precision, recall, f1, auc_

In [27]:
# Tf-idf
def tfidf(data, max_features, max_df=1.0):
    """
    max_features: 取tfidf值的前max_features个
    max_df: 限制最大的doc frequency
    """
    vectorizer = TfidfVectorizer(max_features=max_features, max_df=max_df)
    X = vectorizer.fit_transform(data)
    return X, vectorizer


def transform_data(X_train, X_test, max_features, max_df):
    X_train_tfidf, tfidf_vectorizer = tfidf(X_train, max_features, max_df)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)  # transform
    return X_train_tfidf, X_test_tfidf, tfidf_vectorizer

In [28]:
X_train_tfidf, X_test_tfidf, tfidf_vectorizer = transform_data(X_train, X_test, 3000, 0.9)

In [29]:
print(type(X_train_tfidf))

print(X_train_tfidf.shape)

print(X_test_tfidf.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(69478, 3000)
(17370, 3000)


### model

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from mlxtend.classifier import StackingClassifier
import pandas as pd
import pprint
import time

def train(X_train, y_train, X_test=None, y_test=None, model=None, 
              do_compare=False, best_params=None, params_range=None):
    """
    model: 分类器对象
         - sclf 为stack模型
         - kNN 简单，有效；计算时间复杂度较高
         - LR 简单，快速，可解释；但是低维数据效果差，要求线性可分
         - SVM 可解释，对样本集有较好鲁棒性，小样本集适用；大规模训练较困难
         - DT 简答，快速，可解释；容易过拟合
         - Stacking 不易过拟合，泛化能力好；慢
         - AdaBoost，GBDT，RF 准确率和泛化能力都较好；计算时间复杂度较高
    do_compare: 对比模型模式，选择最优模型后可设为false
    best_params: do_compare为True时，可选择输入最优参数；默认None，将再次使用
                          GridSearchCV搜索
    params_range: dict, 再次使用GridSearchCV搜索的参数字典
    X_test, y_test: 单个模型拟合或者调优时输入
    """
    sclf = StackingClassifier(classifiers=[KNeighborsClassifier(),
                                                              SVC(),
                                                              DecisionTreeClassifier()],
                                           meta_classifier=LogisticRegression())

    model_dict = {'kNN': (KNeighborsClassifier(),
                                     {'n_neighbors': [5, 25, 55]}),
                             'LR': (LogisticRegression(),
                                    {'C': [0.01, 1, 100]}),
#                              'SVM': (SVC(probability=True, class_weight='balanced'),
#                                      {'C': [0.01, 1, 100]}),
                             'DT': (DecisionTreeClassifier(),
                                    {'max_depth': [50, 100, 150]}),
#                              'Stacking': (sclf,
#                                           {'kneighborsclassifier__n_neighbors': [5, 25, 55],
#                                            'svc__C': [0.01, 1, 100],
#                                            'decisiontreeclassifier__max_depth': [50, 100, 150],
#                                            'meta_classifier__C': [0.01, 1, 100]}),
                             'AdaBoost': (AdaBoostClassifier(),
                                          {'n_estimators': [50, 100, 150, 200]}),
                             'GBDT': (GradientBoostingClassifier(),
                                      {'learning_rate': [0.01, 0.1, 1, 10, 100]}),
                             'RF': (RandomForestClassifier(),
                                    {'n_estimators': [100, 150, 200, 250]})}
    
    # 选择最优模型，使用小数据集
    if do_compare == True:
        results_data = {}
        for model_name, (model, params) in model_dict.items():
            print('训练{}...'.format(model_name))
            # print(model.get_params().keys())
            clf = GridSearchCV(estimator=model,
                                            param_grid=params,
                                            cv=5,
                                            scoring=['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                                            refit='roc_auc')
            
            start = time.time()
            clf.fit(X_train, y_train)
            end = time.time()
            
            duration = end - start
            print('耗时{:.4f}s'.format(duration))
            
            results = {key: np.max(value) for key, value in clf.cv_results_.items() if key.startswith('mean_test')}
            results[' best_param'] = clf.best_params_
            results[' time(s)'] = duration
            results_data[model_name] = results
            
        return results_data
    
    # 针对单个模型训练
    if do_compare == False and model is not None:
        results_data = {}
        if best_params is None:
            print('训练...')
            clf = GridSearchCV(estimator=model(),
                                            param_grid=params_range,
                                            cv=5,
                                            scoring=['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                                            refit='roc_auc')
            start = time.time()
            clf.fit(X_train, y_train)
            end = time.time()
            
            duration = end - start
            print('耗时{:.4f}s'.format(duration))
            
            results = {key: np.max(value) for key, value in clf.cv_results_.items() if key.startswith('mean_test')}
            results[' best_param'] = clf.best_params_
            results[' time(s)'] = duration
            results_data[model] = results
            
            # 测试集表现
            y_pre = clf.predict(X_test)
            accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pre)
            print("accuracy=%.6f, precision=%.6f, recall=%.6f, f1=%.6f, auc=%.6f" % (accuracy, precision, recall, f1, auc))
            
            diff = y_test - y_pre
            wrong_predict = np.argwhere(diff != 0)
            
            return results_data, clf, wrong_predict
        
        else:  # 已知最优参数
            print('训练...')
            clf = model(*best_params)

            start = time.time()
            clf.fit(X_train, y_train)
            end = time.time()
            
            duration = end - start
            print('耗时{:.4f}s'.format(duration))
            
            # 测试集表现
            y_pre = clf.predict(X_test)
            accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pre)
            print("accuracy=%.6f, precision=%.6f, recall=%.6f, f1=%.6f, auc=%.6f" % (accuracy, precision, recall, f1, auc))
            
            diff = y_test - y_pre
            wrong_predict = np.argwhere(diff != 0)
            
            return clf, wrong_predict
    
    print('参数有误，请重新输入')
    return None

In [229]:
results_data = train(X_train_tfidf[:1500, :500], y_train[: 1500], do_compare=True)

pd_frame = pd.DataFrame(results_data).transpose()
pd_frame

Unnamed: 0,best_param,time(s),mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_roc_auc
kNN,{'n_neighbors': 55},1.46013,0.915333,0.955611,0.916841,1.0,0.7532
LR,{'C': 1},0.187497,0.913333,0.954704,0.931795,1.0,0.869652
SVM,{'C': 0.01},3.89758,0.913333,0.954704,0.913333,1.0,0.835233
DT,{'max_depth': 150},0.652275,0.892667,0.941571,0.93637,0.947445,0.626839
Stacking,"{'decisiontreeclassifier__max_depth': 150, 'kn...",141.656,0.913333,0.954704,0.938981,1.0,0.645775
AdaBoost,{'n_estimators': 50},9.89952,0.914667,0.95414,0.942863,0.972263,0.832468
GBDT,{'learning_rate': 0.1},8.94806,0.920667,0.957824,0.935437,0.99927,0.841564
RF,{'n_estimators': 250},11.6817,0.913333,0.954575,0.917022,0.99708,0.870087


In [244]:
results_data = train(X_train_tfidf[:3000, :1000], y_train[: 3000], do_compare=True)

pd_frame = pd.DataFrame(results_data).transpose()
pd_frame

Unnamed: 0,best_param,time(s),mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_roc_auc
kNN,{'n_neighbors': 25},5.56712,0.907667,0.951599,0.913682,1.0,0.796649
LR,{'C': 1},0.335101,0.914333,0.953656,0.936973,1.0,0.894356
SVM,{'C': 100},82.9259,0.721667,0.820634,0.985838,0.703635,0.893798
DT,{'max_depth': 150},2.60003,0.889,0.93931,0.932483,0.946381,0.660017
AdaBoost,{'n_estimators': 50},19.0789,0.914667,0.953723,0.939361,0.968782,0.882959
GBDT,{'learning_rate': 0.1},28.1745,0.918,0.956383,0.931729,0.999265,0.872171
RF,{'n_estimators': 250},27.65,0.917667,0.956599,0.917318,0.999632,0.892417


In [46]:
results_data = train(X_train_tfidf[:6000, :2000], y_train[: 6000], do_compare=True)

pd_frame = pd.DataFrame(results_data).transpose()
pd_frame

Unnamed: 0,best_param,time(s),mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_roc_auc
kNN,{'n_neighbors': 55},25.768,0.904,0.949439,0.975401,1.0,0.885088
LR,{'C': 100},0.579477,0.948333,0.971704,0.960492,1.0,0.967456
DT,{'max_depth': 100},7.06328,0.9715,0.984178,0.98573,0.983189,0.926517
AdaBoost,{'n_estimators': 50},61.1641,0.9775,0.987525,0.987998,0.987252,0.985736
GBDT,{'learning_rate': 0.1},116.042,0.9795,0.988586,0.994395,0.99261,0.981456
RF,{'n_estimators': 250},53.8199,0.979,0.988399,0.985508,0.991318,0.989255


In [52]:
results_data, clf, wrong_predict = train(X_train_tfidf[:8000, :3000], y_train[: 8000], X_test_tfidf[:1500, :3000], y_test[:1500], 
                                                              model=RandomForestClassifier, params_range={'n_estimators': [250, 260, 270]})

训练...
耗时95.6500s
accuracy=0.986667, precision=0.991091, recall=0.994043, f1=0.992565, auc=0.958805


  return getattr(obj, method)(*args, **kwds)


In [74]:
# 错分
X_test.iloc[wrong_predict.ravel()]

4518     全国 启动 据悉 此次 长 相思 以爱要 纯净 主题 传递 出 新西兰 长 相思 葡萄酒 以...
82948    原 标题 蹲点 手记 丨 村民 喜当 包租婆 岭上 民宿 梦小高 初到 高岭 山上 棕 褐色...
1100     原 标题 普京 科米 喊话 若受 迫害 俄罗斯 避难 吧 俄罗斯 总统 普京 年度 普京 直...
6791     儿童 智能 手表 八成 产 深圳 广州日报 深圳 讯 全 媒体 记者 阮元 智能 穿戴 设备...
4692     张艺谋 成龙 新作 亮相 广州日报 讯 全 媒体 记者 黄岸 前日 完美 威秀 娱乐 集团 ...
3669     新华社 洛杉矶 日电 记者 高山 美国 皮克斯 动画片 赛车 总动员 日至 日开 画首 周末...
7543     人民网 北京 日电 记者 黄子娟 下午 日本航空自卫队 发布 消息 刚刚 过去 一周 日美 ...
3379     中新网 贵阳 日电 记者 张伟 记者 贵州省 民政 获悉 截至 日时 新一轮 强降雨 造成 ...
8390     本报讯 记者 郭海方 漯河市 第七届 人民代表大会 第一次 会议 选举 曹存 正为 漯河市 ...
8059     新华社 北京 日电 外交部 发言人 陆慷日 宣布 金砖 国家 外长 会晤 北京 举行 外交部...
5455     中新网 日电 俄罗斯 卫星 网 报道 俄 堪察加 舍维 留奇 火山爆发 喷发 出高达 海拔 ...
597      新华社 哈尔滨 日电 记者 闫睿 记者 黑龙江省政府 获悉 保障 困难 家庭 学生 因贫 失...
57441    总 策划 何平 总监制 刘思扬 总 指导 周宗敏 出品人 陈凯星 冯瑛冰 策划 袁建贺 大为...
5836     以来 伊斯兰 国 宣称 英国 实施 袭击 男子 驾车 伦敦 市中心 议会 大厦 附近 冲撞 ...
7367     图文 海归 博士 研发 污水 分离 产品 湖北日报 讯 应城市 湖北 康创 科技 有限公司 ...
34547    安徒生 孩子 小不点 记不起 最初 躺 朵花 花儿 发出 香气 但是 长大成人 以后 身上 ...
374      新华社 合肥 日电 题 安徽 不断 跨省 流域 补偿 试点 推向 深入 新华社 记者 杨丁淼.

In [75]:
y_test.iloc[wrong_predict.ravel()]

4518     0
82948    1
1100     0
6791     0
4692     0
3669     1
7543     0
3379     0
8390     0
8059     1
5455     0
597      0
57441    1
5836     0
7367     0
34547    1
374      0
83408    1
86221    1
13521    1
Name: source, dtype: int64

In [32]:
def gaussian_nb(X_train, y_train, X_test, y_test):
    """将tf-idf特征作为数值参数"""
    
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    
    y_pre = clf.predict(X_test)
    
    accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pre)
    print("accuracy=%.6f, precision=%.6f, recall=%.6f, f1=%.6f, auc=%.6f" % (accuracy, precision, recall, f1, auc))

    return accuracy, precision, recall, f1, auc


def bernoulli_nb(X_train, y_train, X_test, y_test):
    """将word在文档中出现或者不出现作为特征，然而 MultinomialNB 将考虑word出现的频次
    输入为bag of words，而不是tf-idf特征
    """
    clf = BernoulliNB()
    clf.fit(X_train, y_train)
    
    y_pre = clf.predict(X_test)
    
    accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pre)
    print("accuracy=%.6f, precision=%.6f, recall=%.6f, f1=%.6f, auc=%.6f" % (accuracy, precision, recall, f1, auc))
    
    return accuracy, precision, recall, f1, auc

In [43]:
# 提高特征维度，对结果所有提升
gaussian_nb(X_train_tfidf[:6000, :3000].toarray(), y_train[: 6000], X_test_tfidf[:1000, :3000].toarray(), y_test[:1000])

accuracy=0.821000, precision=0.967532, recall=0.828699, f1=0.892750, auc=0.790587


(0.821,
 0.9675324675324676,
 0.8286985539488321,
 0.8927501497902938,
 0.7905869007367923)

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

def count_vectorize(data):
    count_vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')  # 加上单字符
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

X_train_counts, count_vectorizer = count_vectorize(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [71]:
bernoulli_nb(X_train_counts[:6000, :3000], y_train[:6000], X_test_counts[:1000, :3000], y_test[:1000])

accuracy=0.886000, precision=0.918890, recall=0.957731, f1=0.937908, auc=0.602628


(0.886,
 0.9188900747065102,
 0.9577308120133482,
 0.9379084967320261,
 0.6026277822442979)