In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,cross_val_score
from matplotlib.colors import ListedColormap
sns.set()
colors = sns.color_palette("husl")

In [5]:
# 核心在于拆词
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# 文本集是一个一维列表
# 文本集的词汇应该用空格作为间隔
text = ["aaa bbb aaa sss",
 "bbb aaa bbb ccc",
 "aaa ccc bbb aaa",
 "ccc bbb bbb aaa"]
text

['aaa bbb aaa sss', 'bbb aaa bbb ccc', 'aaa ccc bbb aaa', 'ccc bbb bbb aaa']

In [6]:
cv = CountVectorizer()
cv.fit(text)
# cv转换后的对象是一个稀疏矩阵，需要使用toarray()做一次转化
data = cv.transform(text).toarray()
data

array([[2, 1, 0, 1],
       [1, 2, 1, 0],
       [2, 1, 1, 0],
       [1, 2, 1, 0]])

In [7]:
# cv对象对文本集训练完成之后，会得到这个文本集内的所有单词
# 这些单词会被处理成词频向量的字段
feature_names = cv.get_feature_names()
feature_names



['aaa', 'bbb', 'ccc', 'sss']

In [9]:
# ci
pd.DataFrame(data=cv.transform(text).toarray(), columns=cv.get_feature_names())

Unnamed: 0,aaa,bbb,ccc,sss
0,2,1,0,1
1,1,2,1,0
2,2,1,1,0
3,1,2,1,0


IDF 逆文档频率  
权值，正直受到这个词在整个文本集中出现的次数所影响，出现次数越少，权值越大

TF-IDF 评估一个词重要性的指标
公式： 词频*逆文档频率

In [10]:
# 计算TF-IDF指标的
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# 可以直接对词频集处理
tfidf = TfidfTransformer()

In [12]:
tfidf.fit(data)

TfidfTransformer()

In [13]:
tfidf.transform(data).toarray()

array([[0.67915062, 0.33957531, 0.        , 0.65072502],
       [0.39235059, 0.78470118, 0.4799011 , 0.        ],
       [0.78470118, 0.39235059, 0.4799011 , 0.        ],
       [0.39235059, 0.78470118, 0.4799011 , 0.        ]])

In [14]:
pd.DataFrame(data=tfidf.transform(data).toarray(), columns=cv.get_feature_names())



Unnamed: 0,aaa,bbb,ccc,sss
0,0.679151,0.339575,0.0,0.650725
1,0.392351,0.784701,0.479901,0.0
2,0.784701,0.392351,0.479901,0.0
3,0.392351,0.784701,0.479901,0.0


In [16]:
转换流程：
1. 完成词频集转换
cv = CountVectorizer()
cv.fit(text)
data = cv.transform(text)
# data = cv.fit_transform(text) 简化版

2. 完成逆文档频率转换
tfidf = TfidfTransformer()
tfidf.fit(data)
result = tfidf.transform(data)
# result = tfidf.fit_transform(data) 简化版

SyntaxError: invalid character '：' (U+FF1A) (1822647710.py, line 1)

In [17]:
cv = CountVectorizer()
tf = TfidfTransformer()
tf.fit_transform(cv.fit_transform(text)).toarray()

array([[0.67915062, 0.33957531, 0.        , 0.65072502],
       [0.39235059, 0.78470118, 0.4799011 , 0.        ],
       [0.78470118, 0.39235059, 0.4799011 , 0.        ],
       [0.39235059, 0.78470118, 0.4799011 , 0.        ]])

In [18]:
cv.get_feature_names()

['aaa', 'bbb', 'ccc', 'sss']

In [19]:
# 超进化版本
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tv = TfidfVectorizer()
data = tv.fit_transform(text)

In [21]:
pd.DataFrame(data=data.toarray(), columns=tv.get_feature_names())

Unnamed: 0,aaa,bbb,ccc,sss
0,0.679151,0.339575,0.0,0.650725
1,0.392351,0.784701,0.479901,0.0
2,0.784701,0.392351,0.479901,0.0
3,0.392351,0.784701,0.479901,0.0


In [22]:
text = ["aaa bbb sss aaa ? hehe",
 "bbb aaa bbb ccc 100",
 "aaa ccc bbb tel aaa",
 "ccc bbb bbb ok aaa"]

In [23]:
# 停用词处理 目的就是为了把一些无意义的词删除

# 维护一个停用词组
stop_words = ["?", "hehe", "100", "ok"]

# 停用词组在构造转换对象时直接配置
tv = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
data = tv.fit_transform(text)

In [24]:
tv.get_feature_names()



['aaa',
 'aaa bbb',
 'aaa bbb ccc',
 'aaa bbb sss',
 'aaa ccc',
 'aaa ccc bbb',
 'bbb',
 'bbb aaa',
 'bbb aaa bbb',
 'bbb bbb',
 'bbb bbb aaa',
 'bbb ccc',
 'bbb sss',
 'bbb sss aaa',
 'bbb tel',
 'bbb tel aaa',
 'ccc',
 'ccc bbb',
 'ccc bbb bbb',
 'ccc bbb tel',
 'sss',
 'sss aaa',
 'tel',
 'tel aaa']

## 垃圾短信识别

In [29]:
SMS = pd.read_table("./SMSSpamCollection",header=None)
text = SMS[1].copy()
y = SMS[0].copy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# 分词
tv = TfidfVectorizer()
train = tv.fit_transform(text).toarray()

In [32]:
train.shape

(5572, 8713)

In [33]:
# 查看分出哪些词
tv.get_feature_names()



['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [42]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [43]:
mnb = MultinomialNB()   # 对于特征，对出现次数计数-----适合大文本集
bnb = BernoulliNB()     # 对于特征，有就置为True，没有就置为False---适合小文本集
lr = LogisticRegression()
knn = KNeighborsClassifier()

In [44]:
mnb_score = cross_val_score(mnb,train,y,cv=3).mean()
bnb_score = cross_val_score(bnb,train,y,cv=3).mean()
lr_score = cross_val_score(lr,train,y,cv=3).mean()
knn_score = cross_val_score(knn,train,y,cv=3).mean()
print(f"mnb_score:{mnb_score}\nbnb_score:{bnb_score}\nlr_score:{lr_score}\nknn_score:{knn_score}")

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


mnb_score:0.9547739630436644
bnb_score:0.9784637459208932
lr_score:0.9580044011555303
knn_score:0.924626492065728
