## 垃圾邮件分类


In [1]:
with open('data/emailSample1.txt', 'r') as f:
    sampe_email = f.read()
    print(sampe_email)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




## 1. 对邮件进行预处理

In [5]:
'''
预处理主要包括以下8个部分：
  1. 将大小写统一成小写字母；
  2. 移除所有HTML标签，只保留内容。
  3. 将所有的网址替换为字符串 “httpaddr”.
  4. 将所有的邮箱地址替换为 “emailaddr”
  5. 将所有dollar符号($)替换为“dollar”.
  6. 将所有数字替换为“number”
  7. 将所有单词还原为词源，词干提取
  8. 移除所有非文字类型
  9.去除空字符串‘’
'''

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm
import nltk.stem as ns
import re

def preprocessing(email):
    
    # 1. 统一成小写
    email = email.lower()
    
    #2. 去除html标签
    email = re.sub('<[^<>]>', ' ', email)
    
    #3. 将网址替换为字符串 “httpaddr”.
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email ) 
    
    #4. 将邮箱地址替换为 “emailaddr”
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    
     # 5.所有dollar符号($)替换为“dollar”.
    email = re.sub('[\$]+', 'dollar', email) 
    
    # 6.匹配数字，将数字替换为“number”
    email = re.sub('[0-9]+', 'number', email) # 匹配一个数字， 相当于 [0-9]，+ 匹配1到多次
    
    # 7. 词干提取
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist=[]

    s = ns.SnowballStemmer('english')
        
    for token in tokens:
        
        # 8. 移除非文字类型
        email  = re.sub('[^a-zA-Z0-9]', '', email)
        stemmed = s.stem(token)
    
        # 9.去除空字符串‘’
        if not len(token): continue
        tokenlist.append(stemmed)  
        
    return tokenlist
    

In [6]:
email = preprocessing(sampe_email)

In [8]:
def email2VocabIndices(email, vocab):
    """提取存在单词的索引"""
    token = preprocessing(email)
    print(token)
    index = [i for i in range(len(token)) if token[i] in vocab]
    return index

In [9]:
def email2FeatureVector(email):
    """
    将email转化为词向量，n是vocab的长度。存在单词的相应位置的值置为1，其余为0
    """
    df = pd.read_table('data/vocab.txt',names=['words'])
    vocab = df.values  # return array
    vector = np.zeros(len(vocab))  # init vector
    vocab_indices = email2VocabIndices(email, vocab) 
    print(vocab_indices)# 返回含有单词的索引
    # 将有单词的索引置为1
    for i in vocab_indices:
        vector[i] = 1
    return vector

In [10]:
import pandas as pd
vector = email2FeatureVector(sampe_email)
print('length of vector = {}\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))

['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a', 'web', 'portal', '\n', '\nwell', 'it', 'depend', 'on', 'how', 'mani', 'visitor', 'you', 're', 'expect', '\nthis', 'can', 'be', 'anywher', 'from', 'less', 'than', 'number', 'buck', 'a', 'month', 'to', 'a', 'coupl', 'of', 'dollarnumb', '\nyou', 'should', 'checkout', 'httpaddr', 'or', 'perhap', 'amazon', 'ecnumb', '\nif', 'your', 'run', 'someth', 'big', '\n\nto', 'unsubscrib', 'yourself', 'from', 'this', 'mail', 'list', 'send', 'an', 'email', 'to', '\nemailaddr\n\n']
[0, 1, 2, 3, 4, 5, 6, 7, 9, 13, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 32, 33, 35, 36, 37, 39, 41, 42, 43, 47, 48, 49, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61]
length of vector = 1899
num of non-zero = 46


In [12]:
vector.shape

(1899,)