In [1]:
import jieba
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

%matplotlib notebook

### 数据预处理

In [2]:
news = pd.read_csv('./data/sqlResult_1558435.csv', encoding='gb18030')

In [3]:
news.head(2)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm


In [5]:
len(news)

89611

In [6]:
news_dropna = news.dropna(subset=['source','content'])

In [7]:
len(news_dropna)

87052

In [8]:
def transform(line):
    class_ = 1 if line['source'] == '新华社' else 0
    return pd.Series([class_, line['content']], index=['y','content'])

In [9]:
data = news_dropna.apply(transform, axis=1)

In [10]:
data.head()

Unnamed: 0,y,content
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,0,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...


In [11]:
corpus = data.content.to_list()

In [13]:
y = data.y.values.astype(np.int)

In [14]:
y.shape, len(corpus)

((87052,), 87052)

### 使用TF-IDF 进行文本向量化

In [17]:
corpus_cut =[]
mask = []
for sentence in tqdm(corpus):
    if not isinstance(sentence, str):
        mask.append(False)
        continue
    mask.append(True)
    sentence = ''.join(re.findall(r'\w+',sentence))
    corpus_cut.append(' '.join(jieba.cut(sentence)))

  0%|          | 0/87052 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\壹心理\AppData\Local\Temp\jieba.cache
Loading model cost 2.176 seconds.
Prefix dict has been built succesfully.
100%|██████████| 87052/87052 [05:36<00:00, 258.96it/s]


In [18]:
len(corpus_cut)

87052

In [19]:
y = y[mask]
len(y)

87052

In [20]:
corpus_cut[0]

'此外 自 本周 6 月 12 日起 除 小米 手机 6 等 15 款 机型 外 其余 机型 已 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 以 确保 工程师 可以 集中 全部 精力 进行 系统优化 工作 有人 猜测 这 也 是 将 精力 主要 用到 MIUI9 的 研发 之中 MIUI8 去年 5 月 发布 距今已有 一年 有余 也 是 时候 更新换代 了 当然 关于 MIUI9 的 确切 信息 我们 还是 等待 官方消息'

- 这里 max_features 并不清楚多大合适，尝试了 400 效果不如 300，暂时确定为 300 吧

In [22]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=300)

In [25]:
X = vectorizer.fit_transform(corpus_cut)

In [26]:
X = X.toarray()
X.shape

(87052, 300)

### 建模 KNN

In [28]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [29]:
random_state = 2019

In [30]:
X_train, x_test, Y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.15)

In [31]:
X_train.shape, x_test.shape

((73994, 300), (13058, 300))

### KNN KNeighborsClassifier, k=5

In [32]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=42, test_size=0.15)

In [33]:
x_train.shape, x_valid.shape

((62894, 300), (11100, 300))

In [34]:
knc = KNeighborsClassifier(n_jobs = -1)

In [35]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [37]:
knc.score(x_valid, y_valid)

0.9175675675675675

In [38]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [39]:
y_pred_prob, y_pred_prob.shape

(array([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]]), (11100, 2))

$$precision=\frac{TP}{TP+FP}$$

$$recall=\frac{TP}{TP+FN}$$

$$\frac{1}{F1}=\frac{1}{2}*(\frac{1}{P}+\frac{1}{R})$$

$$F1=\frac{2PR}{P+R}$$

In [42]:
precision_score(y_valid, y_pred)

0.9552164372631159

In [43]:
recall_score(y_valid, y_pred)

0.9535995220551629

In [44]:
f1_score(y_valid, y_pred)

0.9544072948328267

In [45]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9008588518754408

### 调整参数，K=3 

In [46]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

In [47]:
knc = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

In [48]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
                     weights='uniform')

In [49]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [50]:
knc.score(x_valid, y_valid)

0.9307207207207208

In [51]:
precision_score(y_valid, y_pred)

0.9542907696844589

In [52]:
recall_score(y_valid,y_pred)

0.9696364362764682

In [53]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.8839187552143312

### precision 和 recall 都提高了，roc却降低了

### 调整参数 K=7

In [54]:
x_trainain, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

In [55]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)

In [56]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='uniform')

In [57]:
y_pred = knc.predict(x_valid)

In [58]:
y_pred_prob = knc.predict_proba(x_valid)

In [59]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [60]:
knc.score(x_valid, y_valid)

0.9182882882882882

In [61]:
precision_score(y_valid, y_pred)

0.9539335925815136

In [62]:
recall_score(y_valid, y_pred)

0.9555533359968038

In [63]:
f1_score(y_valid, y_pred)

0.9547427773065216

In [64]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9183271434572631

### 使用距离作为权重

In [65]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=72, test_size=0.15)

In [66]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance')

In [67]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [68]:
y_pred = knc.predict(x_valid)

In [69]:
y_pred_prob = knc.predict_proba(x_valid)

In [70]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [71]:
knc.score(x_valid, y_valid)

0.9277477477477477

In [72]:
precision_score(y_valid, y_pred)

0.9582009531374106

In [73]:
recall_score(y_valid, y_pred)

0.9620215311004785

In [74]:
f1_score(y_valid, y_pred)

0.9601074413052129

In [75]:
roc_auc_score(y_valid, y_pred_prob[: ,1])

0.9307409641208059

### 使用距离作为近邻样本权重，效果有明显提升，尝试提高leaf-size

In [76]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [77]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=50)

In [78]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [79]:
y_pred = knc.predict(x_valid)

In [80]:
y_pred_prob = knc.predict_proba(x_valid)

In [81]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [82]:
precision_score(y_valid, y_pred)

0.960087370929309

In [83]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [84]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [85]:
roc_auc_score(y_valid, y_pred_prob[: ,1])

0.9373254189944135

### 效果不错，继续增加leaf-size看下 

In [86]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [87]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=70)

In [88]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [89]:
y_pred = knc.predict(x_valid)

In [90]:
y_pred_prob = knc.predict_proba(x_valid)

In [91]:
knc.score(x_valid, y_valid)

0.9313513513513514

In [92]:
precision_score(y_valid, y_pred)

0.9604295088486776

In [93]:
recall_score(y_valid, y_pred)

0.9636871508379888

In [94]:
f1_score(y_valid, y_pred)

0.9620555721541679

In [95]:
roc_auc_score(y_valid, y_pred_prob[: ,1])

0.9365046657538636

### 使用grid-search确定超参数 

In [96]:
from sklearn.model_selection import GridSearchCV

In [97]:
parameters = {'n_neighbors':[3,5,7], 'leaf_size':[30, 45, 60]}

In [98]:
knc = KNeighborsClassifier(n_jobs=5, weights='distance')

In [99]:
clf = GridSearchCV(knc, parameters, cv=5, scoring='roc_auc', verbose=5, n_jobs=5)

In [None]:
clf.fit(X_train, Y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed: 28.5min


In [None]:
跑的时间太长了，放弃

### 针对 KNN 最佳参数暂时确定为 n_neighbors=7, weights='distance',leaf_size=40

In [101]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [102]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=40)

In [103]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=40, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [104]:
y_pred = knc.predict(x_valid)

In [105]:
y_pred_prob = knc.predict_proba(x_valid)

In [106]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [107]:
precision_score(y_valid, y_pred)

0.960087370929309

In [108]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [109]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [110]:
roc_auc_score(y_valid, y_pred_prob[: ,1])

0.9373254189944135

## Naive Bayes 

In [111]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [113]:
gnb = GaussianNB()

In [115]:
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [116]:
y_pred = gnb.predict(x_valid)

In [117]:
y_pred_prob = gnb.predict_proba(x_valid)

In [118]:
gnb.score(x_valid,y_valid)

0.807027027027027

In [119]:
precision_score(y_valid, y_pred)

0.9981153411232567

In [120]:
recall_score(y_valid, y_pred)

0.7887995233839737

In [121]:
f1_score(y_valid, y_pred)

0.8811980033277869

In [122]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9410166438307452

贝叶斯好像没啥好调的……训练速度很快比 KNN 快多了，而且 roc 也更高

### Logistic Regression

In [123]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [124]:
lr = LogisticRegression(n_jobs=-1) # set baseline

In [125]:
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
lr.score(x_valid, y_valid)

0.9807207207207207

In [127]:
y_pred = lr.predict(x_valid)

In [128]:
y_pred_prob = lr.predict_proba(x_valid)

In [129]:
precision_score(y_valid, y_pred)

0.9876323340259227

In [130]:
recall_score(y_valid, y_pred)

0.9911627445139509

In [131]:
f1_score(y_valid, y_pred)

0.9893943899296264

In [132]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9937303744000687

 LR的效果很好, good

### 利用test set选定模型 ·KNN

In [133]:
knc.score(x_test, y_test)

0.9261755245826313

In [134]:
y_pred = knc.predict(x_test)

In [135]:
y_pred_prob = knc.predict_proba(x_test)

In [136]:
precision_score(y_test, y_pred)

0.9586574230639161

In [137]:
recall_score(y_test, y_pred)

0.9597934653800576

In [138]:
f1_score(y_test, y_pred)

0.9592251078588951

In [139]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9230672217332208

#### Gaussian Naive Bayes

In [140]:
gnb.score(x_test, y_test)

0.79621687854189

In [141]:
y_preb = gnb.predict(x_test)

In [142]:
y_pred_prob = gnb.predict_proba(x_test)

In [143]:
precision_score(y_test, y_pred)

0.9586574230639161

In [144]:
recall_score(y_test, y_pred)

0.9597934653800576

In [145]:
f1_score(y_test, y_pred)

0.9592251078588951

In [146]:
roc_auc_score(y_test,y_pred_prob[:, 1])

0.9375509641130992

### Logistic Regression

In [148]:
lr.score(x_test, y_test)

0.9790932761525502

In [149]:
y_pred = lr.predict(x_test)

In [150]:
y_pred_prob = lr.predict_proba(x_test)

In [151]:
precision_score(y_test, y_preb)

0.9984751116436118

In [152]:
recall_score(y_test, y_preb)

0.7759437954968681

In [153]:
f1_score(y_test, y_preb)

0.8732555370326268

In [154]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9906651640078233

逻辑回归即使在未见过的测试集上表现依然很好，最后选定逻辑回归模型
性能达到某个点可以定义为：在这个点训练集误差依然减小而验证集的误差不再下降反而开始上升也就是模型开始过拟合了

### 找出所以预测为 1， 但是实际为 0 的文章。 作为抄袭的候选者

In [155]:
# d对所有的 x 进行预测
y_pred = lr.predict(X)
y_preb.shape, y.shape

((13058,), (87052,))

In [156]:
len(news_dropna)

87052

In [157]:
news_dropna.head(2)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm


In [158]:
news_dropna['y'] = y
news_dropna['y_pred'] = y_pred
news_dropna.head(2)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm,0,0
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm,0,0


In [159]:
# 实际为0， 预测为1
copy_news = news_dropna[(news_dropna.y == 0)&(news_dropna.y_pred == 1)]
copy_news.head(2)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
51,89566,,新华网,戈壁的大漠黄沙曾掩埋了无数西域古道，而如今一条大漠天路正顽强地与黄沙“搏斗”，在乌兰布和、腾...,"{""type"":""国内新闻"",""site"":""环球"",""commentNum"":""0"",""j...",大漠变通途——世界上最长的穿越沙漠高速公路建设纪实,http://china.huanqiu.com/hot/2017-06/10866392....,0,1
56,89561,,央视新闻,很快，不少人主动添加记者为好友，询问是否需要扫描软件，并声称这些扫描软件能够攻破摄像头的IP...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",大量家庭摄像头遭入侵 有人兜售IP地址给偷窥者,http://www.cnbeta.com/articles/tech/623631.htm,0,1


In [160]:
len(copy_news.source)

1106

In [161]:
copy_sources = set(copy_news.source.to_list())

In [162]:
len(copy_sources)

276

## 什么是数据思维？什么是机器学习思维？

- 数据思维的最核心是利用数据解决问题，利用数据解决问题的最核心是要深度了解需求，了解真正要解决什么样的问题，解决问题背后的真实目的是什么。在解决问题的过程中我们使用数据的方法，通常可以叫做量化的方法。
- 机器学习思维就是根据大数据学习出一种规则，这个规则可以将输入的X映射到Y而不像传统的方法由人工写各种繁琐的规则，机器学习模型可以利用继续增加的数据不断迭代优化模型的表现

### 使用第4课讲解的 edit distance，在涉嫌抄袭的文章中，找到其重复的文字与被修改过的文字。

In [163]:
def edit_distance(string1, string2):
    """string1 => string2"""
    len1, len2 = len(string1), len(string2)
    if len1 == 0 and len2 == 0: return 0
    if len2 == 0: return len1, []
    if len1 == 0: return len2, []
    dp = [[0] * (len2+1) for _ in range(len1+1)]
    duplication = []
    for i in range(1, len2+1):
        dp[0][i] = i
    for i in range(1, len1+1):
        dp[i][0] = i
    for i in range(1, len1+1):
        for j in range(1, len2+1):
            if string1[i-1] == string2[j-1]:
                dp[i][j] = dp[i-1][j-1]
                duplication.append(string1[i-1])
            else:
                dp[i][j] = min(
                    dp[i-1][j], dp[i][j-1], dp[i-1][j-1]
                ) + 1
    return dp[-1][-1], duplication

In [164]:
edit_distance('abcd', 'acdef')

(3, ['a', 'c', 'd'])

拿第一条候选抄袭新闻作为示例

In [165]:
source_copy_news = copy_news.iloc[0 ,3]

In [166]:
source_copy_news = ''.join(re.findall('\w+', source_copy_news))

In [167]:
source_copy_news

'戈壁的大漠黄沙曾掩埋了无数西域古道而如今一条大漠天路正顽强地与黄沙搏斗在乌兰布和腾格里巴丹吉林三大沙漠中穿行成为世界上最长的穿越沙漠高速公路这就是北京至乌鲁木齐的京新高速公路京新高速全长2540公里建成后北京到新疆的行车里程将缩短1300公里大漠变通途通疆达海的梦想即将实现新华社记者邓华摄'

In [168]:
def get_target_news(source_copy_news, idx, news_df):
    min_distance = float('inf')
    duplication = target_news = None
    for i in range(len(news_df)):
        if i == idx: continue # 跳过候选抄袭新闻本身
        candidate_news = news_df.iloc[i, 1]
        candidate_news = ''.join(re.findall('\w+', candidate_news))
        distance, tmp_duplication = edit_distance(source_copy_news, candidate_news)
        if distance < min_distance:
            min_distance = distance
            target_news = candidate_news
            duplication = tmp_duplication
    return target_news, duplication

In [169]:
target_news, duplication = get_target_news(source_copy_news, 51, data)

In [4]:
target_news

NameError: name 'target_news' is not defined

In [5]:
set(duplication)

NameError: name 'duplication' is not defined