In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,cross_val_score
from matplotlib.colors import ListedColormap
sns.set()
colors = sns.color_palette("husl")

## 读取数据

In [4]:
pd.read_csv("./email/ham/1.txt",header=None)

Unnamed: 0,0,1
0,Hi Peter,
1,With Jose out of town,do you want to
2,meet once in a while to keep things,
3,going and do some interesting stuff?,
4,Let me know,
5,Eugene,


In [5]:
with open("./email/ham/2.txt","r",encoding="utf-8") as f:
    content = f.read()

In [6]:
content

"Yay to you both doing fine!\n\nI'm working on an MBA in Design Strategy at CCA (top art school.)  It's a new program focusing on more of a right-brained creative and strategic approach to management.  I'm an 1/8 of the way done today!"

In [7]:
import os

In [8]:
os.listdir("./email")

['spam', 'ham']

In [9]:
def read_content(path,dir_name):
    source_path = path+"/"+dir_name
    content_list = []
    label_list = []
    for file_name in os.listdir(source_path):
        if file_name.endswith("txt"):
            file_path = source_path+"/"+file_name
            with open(file_path,"r",encoding="utf-8",errors="ignore") as f:
                content = f.read()
                content_list.append(content)
                label_list.append(dir_name)
    return pd.DataFrame(data={
        "content":content_list,
        "label":label_list
    })
                
                

In [10]:
ham_df = read_content("./email","ham")
spam_df = read_content("./email","spam")

In [17]:
samples = pd.concat([ham_df,spam_df],ignore_index=True)
samples.head()

Unnamed: 0,content,label
0,"Hi Peter,\n \nThe hotels are the ones that ren...",ham
1,LinkedIn\n\nKerry Haloney requested to add you...,ham
2,yeah I am ready. I may not be here because Ja...,ham
3,Benoit Mandelbrot 1924-2010\n\nBenoit Mandelbr...,ham
4,Jay Stepp commented on your status.\n\nJay wro...,ham


In [18]:
text = samples["content"].copy()
target = samples["label"].copy()

## 文本集转换处理

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tv = TfidfVectorizer()
data = tv.fit_transform(text)

In [21]:
tv.get_feature_names()



['00',
 '0nline',
 '10',
 '100',
 '100m',
 '100mg',
 '10mg',
 '11',
 '119',
 '120',
 '129',
 '130',
 '138',
 '14th',
 '15',
 '156',
 '15mg',
 '174623',
 '180',
 '1924',
 '195',
 '199',
 '20',
 '200',
 '2007',
 '2010',
 '2011',
 '203',
 '219',
 '225',
 '25',
 '25mg',
 '291',
 '292',
 '30',
 '300x',
 '30mg',
 '322',
 '325',
 '366',
 '37',
 '38',
 '385',
 '39',
 '396',
 '430',
 '492',
 '50',
 '50092',
 '50mg',
 '513',
 '562',
 '570',
 '588',
 '5mg',
 '60',
 '625',
 '66343',
 '70',
 '75',
 '750',
 '80',
 '85',
 '86152',
 '90',
 '90563',
 'about',
 'accept',
 'accepted',
 'access',
 'acrobat',
 'add',
 'address',
 'adobe',
 'advocate',
 'aged',
 'all',
 'also',
 'am',
 'amazing',
 'ambiem',
 'amex',
 'an',
 'analgesic',
 'and',
 'announcement',
 'another',
 'answer',
 'any',
 'ap',
 'approach',
 'approved',
 'are',
 'arolexbvlgari',
 'art',
 'articles',
 'arvind',
 'as',
 'assigning',
 'assistance',
 'at',
 'attaching',
 'automatic',
 'automatically',
 'away',
 'back',
 'bad',
 'bags',
 'ba

### 停用词

In [22]:
# 将数字选出，作为停用词
stop_words = ['00',
 '0nline',
 '10',
 '100',
 '100m',
 '100mg',
 '10mg',
 '11',
 '119',
 '120',
 '129',
 '130',
 '138',
 '14th',
 '15',
 '156',
 '15mg',
 '174623',
 '180',
 '1924',
 '195',
 '199',
 '20',
 '200',
 '2007',
 '2010',
 '2011',
 '203',
 '219',
 '225',
 '25',
 '25mg',
 '291',
 '292',
 '30',
 '300x',
 '30mg',
 '322',
 '325',
 '366',
 '37',
 '38',
 '385',
 '39',
 '396',
 '430',
 '492',
 '50',
 '50092',
 '50mg',
 '513',
 '562',
 '570',
 '588',
 '5mg',
 '60',
 '625',
 '66343',
 '70',
 '75',
 '750',
 '80',
 '85',
 '86152',
 '90',
 '90563']

In [23]:
tv = TfidfVectorizer(stop_words=stop_words)
data = tv.fit_transform(text)

In [24]:
train = data.toarray().copy()
train

array([[0.08233763, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.3140113 , 0.        ,
        0.        ]])

## 建模

In [41]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [42]:
bnb = BernoulliNB()
mnb = MultinomialNB()

In [43]:
from sklearn.model_selection import cross_val_score

In [44]:
bnb_score = cross_val_score(bnb,train,target,cv=5).mean()
mnb_score = cross_val_score(mnb,train,target,cv=5).mean()
print(f"bnb_score:{bnb_score}\nmnb_score:{mnb_score}")

bnb_score:0.9
mnb_score:0.96


## 预测

In [50]:
bnb = BernoulliNB()
bnb.fit(train,target)

BernoulliNB()

In [51]:
samples.content[0]

'Hi Peter,\n \nThe hotels are the ones that rent out the tent. They are all lined up on the hotel grounds : )) So much for being one with nature, more like being one with a couple dozen tour groups and nature.\nI have about 100M of pictures from that trip. I can go through them and get you jpgs of my favorite scenic pictures.\n \nWhere are you and Jocelyn now? New York? Will you come to Tokyo for Chinese New Year? Perhaps to see the two of you then. I will go to Thailand for winter holiday to see my mom : )\n \nTake care,\nD\n'

In [52]:
my_words = "'Hi Peter,\n \nThe hotels are the ones that rent out the tent. They are all lined up on the hotel grounds : )) So much for being one with nature, more like being one with a couple dozen tour groups and nature.\nI have about 100M of pictures from that trip. I can go through them and get you jpgs of my favorite scenic pictures.\n \nWhere are you and Jocelyn now? New York? Will you come to Tokyo for Chinese New Year? Perhaps one head mean qq  to see the two of you then. I will go to Thailand for winter holiday to see my mom : )\n \nTake care,\nD\n'"

In [53]:
tv.get_feature_names()



['about',
 'accept',
 'accepted',
 'access',
 'acrobat',
 'add',
 'address',
 'adobe',
 'advocate',
 'aged',
 'all',
 'also',
 'am',
 'amazing',
 'ambiem',
 'amex',
 'an',
 'analgesic',
 'and',
 'announcement',
 'another',
 'answer',
 'any',
 'ap',
 'approach',
 'approved',
 'are',
 'arolexbvlgari',
 'art',
 'articles',
 'arvind',
 'as',
 'assigning',
 'assistance',
 'at',
 'attaching',
 'automatic',
 'automatically',
 'away',
 'back',
 'bad',
 'bags',
 'bargains',
 'based',
 'bathroom',
 'be',
 'because',
 'been',
 'behind',
 'being',
 'below',
 'benoit',
 'betterejacu1ation',
 'bettererections',
 'biggerpenis',
 'bike',
 'bin',
 'blue',
 'book',
 'borders',
 'both',
 'brained',
 'brand',
 'brands',
 'brandviagra',
 'business',
 'but',
 'butt',
 'buy',
 'buyviagra',
 'by',
 'call',
 'came',
 'can',
 'canadian',
 'cannot',
 'capabilities',
 'car',
 'cards',
 'care',
 'carlo',
 'cartier',
 'cat',
 'cats',
 'cca',
 'certified',
 'chance',
 'changes',
 'changing',
 'chapter',
 'cheap',
 '

In [54]:
# 新数据的预测，必须要用训练数据的词频集来转换
test = tv.transform([my_words]).toarray()
bnb.predict(test)

array(['ham'], dtype='<U4')