In [1]:
# coding=utf-8 
from numpy import *
import json
import csv

# 加载原始数据，进行分割
def load_message():
    content = []
    lines = []
    label = []

    with open('D:\Document\CS\Program\py program\spam messages\spam_train.txt', encoding = 'utf-8') as fr:
        for i in range(10000):
            line = fr.readline()
            lines.append(line)
        num = len(lines)
        for i in range(num):
            message = lines[i].split('\t')
            label.append(message[0])
            content.append(message[1])

    return num, content, label


# 将分割后的原始数据存到json
def data_storage(content, label):
    with open('D:\Document\CS\Program\py program\spam messages\spam_content.json', 'w') as f:
        json.dump(content, f)
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'w') as f:
        json.dump(label, f)

if '__main__' == __name__:
   num, content, label = load_message()
   data_storage(content, label)


In [2]:
# -*- coding: utf-8 -*-
import numpy as np
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text
import json
import re
from scipy import sparse, io


# 将连续的数字转变为长度的维度
def process_cont_numbers(content):
    digits_features = np.zeros((len(content), 16))
    for i, line in enumerate(content):
        for digits in re.findall(r'\d+', line):
            length = len(digits)
            if 0 < length <= 15:
                digits_features[i, length-1] += 1
            elif length > 15:
                digits_features[i, 15] += 1
    return process_cont_numbers


# 正常分词，非TFID
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer


# 用TFID生成对应词向量
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer


# 生成词向量并进行存储
def vector_word():
    with open('D:\Document\CS\Program\py program\spam messages\spam_content.json', 'r') as f:
        content = json.load(f)
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'r') as f:
        label = json.load(f)
    '''
        vec_count = MessageCountVectorizer(min_df=2, max_df=0.8)
        data_count = vec_count.fit_transform(content)
        name_count_feature = vec_count.get_feature_names()
    '''

    vec_tfidf = TfidfVectorizer(min_df = 2, max_df = 0.8)
    data_tfidf = vec_tfidf.fit_transform(content)
    name_tfidf_feature = vec_tfidf.get_feature_names()

    io.mmwrite('D:\Document\CS\Program\py program\spam messages\word_vector.mtx', data_tfidf)

    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'w') as f:
        json.dump(label, f)
    with open('D:\Document\CS\Program\py program\spam messages\spam_vector_type.json', 'w') as f:
        json.dump(name_tfidf_feature, f)

if '__main__' == __name__:
    vector_word()


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\surui\AppData\Local\Temp\jieba.cache
Loading model cost 0.801 seconds.
Prefix dict has been built succesfully.


In [3]:
from time import time
from sklearn import preprocessing
import numpy as np
from sklearn import svm
from sklearn import metrics
import json
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from scipy import sparse, io
from sklearn.decomposition import PCA

def dimensionality_reduction(training_data, test_data, type='pca'):
    if type == 'pca':
        n_components = 1000
        t0 = time()
        pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True)
        pca.fit(training_data)
        print("done in %0.3fs" % (time() - t0))
        t0 = time()
        training_data_transform = sparse.csr_matrix(pca.transform(training_data))
        test_data_transform = sparse.csr_matrix(pca.transform(test_data))
        print("done in %0.3fs" % (time() - t0))
        #random_projections
        #feature_agglomeration
        return training_data_transform, test_data_transform



def split_data(content, label):
    training_data, test_data, training_target, test_target = train_test_split(
        content, label, test_size=0.1, random_state=20)
    return training_data, test_data, training_target, test_target

def standardized_data(content, label):
    training_data, test_data, training_target, test_target = split_data(content, label)
    scalar = preprocessing.StandardScaler().fit(training_data)
    training_data_transformed = scalar.transform(training_data)
    test_data_transformed = scalar.transform(test_data)
    return training_data_transformed, test_data_transformed, training_target, test_target

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))


class TrainerLinear:
    def __init__(self, training_data, training_target):
        self.training_data = training_data
        self.training_target = training_target
        self.clf = svm.SVC(C=1, class_weight=None, coef0=0.0,
                           decision_function_shape=None, degree=3, gamma='auto',
                           kernel='linear', max_iter=-1, probability=False,
                           random_state=None, shrinking=True, tol=0.001, verbose=False)

    def learn_best_param(self):
        C_range = np.logspace(-2, 10, 13)
        param_grid = dict(C=C_range)
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
        grid.fit(self.training_data, self.training_target)
        self.clf.set_params(C=grid.best_params_['C'])
        print("The best parameters are %s with a score of %0.5f"
              % (grid.best_params_, grid.best_score_))

    def train_classifier(self):
        self.clf.fit(self.training_data, self.training_target)
        joblib.dump(self.clf, 'D:\Document\CS\Program\py program\spam messages\SVM_linear_estimator.pkl')
        training_result = self.clf.predict(self.training_data)
        print (metrics.classification_report(self.training_target, training_result))

    def cross_validation(self):
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
        scores = cross_val_score(self.clf, self.training_data, self.training_target, cv=cv, scoring='f1_macro')
        print (scores)
        print("f1 score: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))


class TrainerRbf:
    def __init__(self, training_data, training_target):
        self.training_data = training_data
        self.training_target = training_target
        self.clf = svm.SVC(C=100, class_weight=None, coef0=0.0,
                           decision_function_shape=None, degree=3, gamma=0.01,
                           kernel='rbf', max_iter=-1, probability=False,
                           random_state=None, shrinking=True, tol=0.001, verbose=False)

    def learn_best_param(self):
        C_range = np.logspace(-2, 10, 13)
        gamma_range = np.logspace(-9, 3, 13)
        param_grid = dict(gamma=gamma_range, C=C_range)
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
        grid.fit(self.training_data, self.training_target)
        self.clf.set_params(C=grid.best_params_['C'], gamma=grid.best_params_['gamma'])
        print("The best parameters are %s with a score of %0.5f"
              % (grid.best_params_, grid.best_score_))
        self.draw_visualization_param_effect(grid, C_range, gamma_range)

    def draw_visualization_param_effect(self, grid, C_range, gamma_range):
        scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
                                                             len(gamma_range))
        plt.figure(figsize=(8, 6))
        plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
        plt.imshow(scores, interpolation='nearest',
                   norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
        plt.xlabel('gamma')
        plt.ylabel('C')
        plt.colorbar()
        plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
        plt.yticks(np.arange(len(C_range)), C_range)
        plt.title('Validation accuracy')
        plt.savefig('D:\Document\CS\Program\py program\spam messages\param_effect.png')
        plt.show()

    def train_classifier(self):
        self.clf.fit(self.training_data, self.training_target)
        joblib.dump(self.clf, 'D:\Document\CS\Program\py program\spam messages\SVM_rbf_estimator.pkl')
        training_result = self.clf.predict(self.training_data)
        print (metrics.classification_report(self.training_target, training_result))


    def cross_validation(self):
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
        scores = cross_val_score(self.clf, self.training_data, self.training_target, cv=cv, scoring='f1_macro')
        print (scores)
        print("f1 score: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))





def SVM_train(train_data, train_target):
    clf = svm.SVC(kernel='linear', class_weight='balanced', C =100, gamma = 0.01)
    clf.fit(train_data, train_target)
    expected = train_target
    predicted = clf.predict(train_data)
    # summarize the fit of the model
    print (metrics.classification_report(expected, predicted))
    print (metrics.confusion_matrix(expected, predicted))


def feature_selection(data, data_target, feature_names):
    clf = svm.SVC(class_weight='balanced', C=2)
    clf.fit(data, data_target)

In [4]:
if '__main__' == __name__:
    content = io.mmread('D:\Document\CS\Program\py program\spam messages\word_vector.mtx')
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'r') as f:
        label = json.load(f)
    training_data, test_data, training_target, test_target = split_data(content, label)
    training_data, test_data = dimensionality_reduction(training_data.todense(), test_data.todense(), type='pca')

    Trainer = TrainerLinear(training_data, training_target)
    #Trainer.learn_best_param()
    Trainer.train_classifier()
    #Trainer.cross_validation()

    #Trainer2 = TrainerRbf(training_data, training_target)
    #Trainer2.learn_best_param()
    #Trainer2.train_classifier()
    #Trainer2.cross_validation()

done in 39.550s
done in 4.536s
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      8128
          1       1.00      1.00      1.00       872

avg / total       1.00      1.00      1.00      9000



In [5]:
class Evaluator:
    clf = joblib.load('D:\Document\CS\Program\py program\spam messages\SVM_linear_estimator.pkl')

    def __init__(self, training_data, training_target, test_data, test_target):
        self.trainer = TrainerLinear(training_data, training_target)
        self.predictor = Predictor(test_data, test_target)

    def train(self):
        #self.trainer.learn_best_param()
        self.trainer.train_classifier()
        joblib.dump(self.clf, 'D:\Document\CS\Program\py program\spam messages\Terminal_estimator.pkl')
        Evaluator.clf = joblib.load('D:\Document\CS\Program\py program\spam messages\Terminal_estimator.pkl')

    def cross_validation(self):
        self.trainer.cross_validation()

    def predict(self, type):
        if (type == 'sample_data'):
            self.predictor.sample_predict(Evaluator.clf)
        elif (type == 'new_data'):
            self.predictor.new_predict(Evaluator.clf)



In [8]:
class Predictor:
    def __init__(self, test_data, test_target):
        self.test_data = test_data
        self.test_target = test_target

    def sample_predict(self, clf):
        test_result = clf.predict(self.test_data)
        print (metrics.classification_report(self.test_target, test_result))
        print (metrics.confusion_matrix(self.test_target, test_result))

    def new_predict(self, clf):
        test_result = clf.predict(self.test_data)
        with open('D:\Document\CS\Program\py program\spam messages\predict_label.txt', 'wt') as f:
            for i in range(len(test_result)):
                f.writelines(test_result[i])
        self.test_target = test_result


In [9]:
if '__main__' == __name__:
    content = io.mmread('D:\Document\CS\Program\py program\spam messages\word_vector.mtx')
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'r') as f:
        label = json.load(f)
    training_data, test_data, training_target, test_target = split_data(content, label)
    training_data, test_data = dimensionality_reduction(training_data.todense(), test_data.todense(), type='pca')
    evaluator = Evaluator(training_data, training_target, test_data, test_target)
    evaluator.train()
    #evaluator.cross_validation()
    evaluator.predict(type='sample_data')

done in 40.169s
done in 5.468s
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      8128
          1       1.00      1.00      1.00       872

avg / total       1.00      1.00      1.00      9000

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       906
          1       0.87      0.83      0.85        94

avg / total       0.97      0.97      0.97      1000

[[894  12]
 [ 16  78]]


In [13]:
from sklearn.naive_bayes import GaussianNB

class Trainer_bayes:
    def __init__(self, training_data, training_target):
        self.training_data = training_data
        self.training_target = training_target
        self.clf = GaussianNB()


    def train_classifier(self):
        self.clf.fit(self.training_data, self.training_target)
        joblib.dump(self.clf, 'D:\Document\CS\Program\py program\spam messages\spam_bayes_estimator.pkl')
        training_result = self.clf.predict(self.training_data)
        print (metrics.classification_report(self.training_target, training_result))

    def cross_validation(self):
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
        scores = cross_val_score(self.clf, self.training_data, self.training_target, cv=cv, scoring='f1_macro')
        print (scores)
        print("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))




def bayes_train(train_data, train_target):

    model = GaussianNB()
    model.fit(train_data, train_target)
    expected = train_target
    predicted = model.predict(train_data)
    # summarize the fit of the model
    print (metrics.classification_report(expected, predicted))
    print (metrics.confusion_matrix(expected, predicted))

In [14]:
if '__main__' == __name__:
    content = io.mmread('D:\Document\CS\Program\py program\spam messages\word_vector.mtx')
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'r') as f:
        label = json.load(f)
    content = content
    training_data, test_data, training_target, test_target = split_data(content, label)
    print (np.shape(training_data))
    training_data, test_data = dimensionality_reduction(training_data.todense(), test_data.todense(), type='pca')
    print (np.shape(training_data))

    Trainer = Trainer_bayes(training_data.todense(), training_target)
    Trainer.train_classifier()
    #Trainer.cross_validation()

(9000, 9957)
done in 38.522s
done in 4.727s
(9000, 1000)
             precision    recall  f1-score   support

          0       0.98      0.77      0.86      8128
          1       0.28      0.83      0.42       872

avg / total       0.91      0.78      0.82      9000



In [15]:
class Predictor:
    def __init__(self, test_data, test_target):
        self.test_data = test_data
        self.test_target = test_target

    def sample_predict(self, clf):
        test_result = clf.predict(self.test_data)
        print (metrics.classification_report(self.test_target, test_result))
        print (metrics.confusion_matrix(self.test_target, test_result))

    def new_predict(self, clf):
        test_result = clf.predict(self.test_data)
        with open('D:\Document\CS\Program\py program\spam messages\predict_label.txt', 'wt') as f:
            for i in range(len(test_result)):
                f.writelines(test_result[i])
        self.test_target = test_result

In [16]:
class Evaluator:
    clf = joblib.load('D:\Document\CS\Program\py program\spam messages\spam_bayes_estimator.pkl')

    def __init__(self, training_data, training_target, test_data, test_target):
        self.trainer = Trainer_bayes(training_data, training_target)
        self.predictor = Predictor(test_data, test_target)

    def train(self):
        #self.trainer.learn_best_param()
        self.trainer.train_classifier()
        joblib.dump(self.clf, 'D:\Document\CS\Program\py program\spam messages\Terminal_estimator.pkl')
        Evaluator.clf = joblib.load('D:\Document\CS\Program\py program\spam messages\Terminal_estimator.pkl')

    def cross_validation(self):
        self.trainer.cross_validation()

    def predict(self, type):
        if (type == 'sample_data'):
            self.predictor.sample_predict(Evaluator.clf)
        elif (type == 'new_data'):
            self.predictor.new_predict(Evaluator.clf)

In [17]:
if '__main__' == __name__:
    content = io.mmread('D:\Document\CS\Program\py program\spam messages\word_vector.mtx')
    with open('D:\Document\CS\Program\py program\spam messages\spam_label.json', 'r') as f:
        label = json.load(f)
    training_data, test_data, training_target, test_target = split_data(content, label)
    training_data, test_data = dimensionality_reduction(training_data.todense(), test_data.todense(), type='pca')
    evaluator = Evaluator(training_data.todense(), training_target, test_data.todense(), test_target)
    evaluator.train()
    #evaluator.cross_validation()
    evaluator.predict(type='sample_data')

done in 38.650s
done in 4.427s
             precision    recall  f1-score   support

          0       0.98      0.78      0.87      8128
          1       0.29      0.83      0.43       872

avg / total       0.91      0.78      0.82      9000

             precision    recall  f1-score   support

          0       0.99      0.53      0.69       906
          1       0.17      0.97      0.30        94

avg / total       0.92      0.57      0.65      1000

[[477 429]
 [  3  91]]
