In [36]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import copy
import random

In [37]:
def loadDataset(filename):
    """Load a passenger CSV and turn every row into a small integer feature list.

    The file must start with a header row containing the columns ``Pclass``,
    ``Sex``, ``Age``, ``SibSp``, ``Parch`` and ``Fare``.  When a ``Survived``
    column is present as well (the training file), its value is appended to
    each row as the label.  Columns are located by header name, so neither the
    file name nor the column order matters.

    Encoded features, in output order:
        Pclass      -- the class as an int
        Sex         -- 1 for 'male', 0 otherwise
        Age         -- 0 if age < 25, 1 if 25 <= age < 60, 2 otherwise
                       (missing ages are filled with 28 before binning)
        SibSpParch  -- 0 if SibSp + Parch < 2, else 1
        Fare        -- 0 if fare < 64, else 1 (missing fares count as 0)
        Survived    -- int label, only when the column exists

    Args:
        filename: path of the CSV file to read.

    Returns:
        ``(data_set, category)`` where ``data_set`` is a list of integer lists
        and ``category`` is the list of column names describing each position.

    Raises:
        ValueError: if the file is empty or a required column is missing.
    """
    default_age = 28      # fill value for a missing age
    default_fare = 0      # fill value for a missing fare
    age_young_limit = 25  # ages below this fall in bin 0
    age_old_limit = 60    # ages from this value up fall in bin 2
    family_limit = 2      # SibSp + Parch below this -> 0
    fare_limit = 64       # fares below this -> 0

    required = ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare')

    def to_number(text, default=0.0):
        # Empty cells mean "missing"; everything else must parse as a number.
        return float(text) if text != '' else default

    data_set = []
    # newline='' is what the csv module asks for so quoted fields that contain
    # line breaks are parsed correctly.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            raise ValueError('%s is empty: expected a header row' % filename)
        header = [name.strip() for name in header]

        missing = [name for name in required if name not in header]
        if missing:
            raise ValueError('%s lacks required column(s): %s'
                             % (filename, ', '.join(missing)))
        col = {name: header.index(name) for name in required}
        has_label = 'Survived' in header
        label_col = header.index('Survived') if has_label else None

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to convert.
                continue

            pclass = int(row[col['Pclass']])
            # male : 1, female : 0
            sex = 1 if row[col['Sex']] == 'male' else 0
            age = int(to_number(row[col['Age']], default_age))
            # Add the two counts numerically (not as concatenated strings).
            relatives = to_number(row[col['SibSp']]) + to_number(row[col['Parch']])
            fare = to_number(row[col['Fare']], default_fare)

            # Coarse bins keep the feature space small and sparse.
            if age < age_young_limit:
                age_bin = 0
            elif age < age_old_limit:
                age_bin = 1
            else:
                age_bin = 2
            family_bin = 0 if relatives < family_limit else 1
            fare_bin = 0 if fare < fare_limit else 1

            record = [pclass, sex, age_bin, family_bin, fare_bin]
            if has_label:
                record.append(int(row[label_col]))
            data_set.append(record)

    category = ['Pclass', 'Sex', 'Age', 'SibSpParch', 'Fare']
    if has_label:
        category.append('Survived')

    return data_set, category

def split_data(data):
    """Separate feature rows from their labels without touching ``data``.

    The last element of every row is treated as the label; a label equal to 0
    is re-coded as -1 so the labels suit a +1/-1 classifier.

    Args:
        data: sequence of rows, each a list whose last item is the label.

    Returns:
        ``(data_mat, label_mat)``: the rows with the label removed, and the
        labels in the same order.
    """
    data_mat, label_mat = [], []

    # Work on a deep copy so the caller's rows keep their label column.
    for row in copy.deepcopy(data):
        label = -1 if row[-1] == 0 else row[-1]
        del row[-1]
        label_mat.append(label)
        data_mat.append(row)

    return data_mat, label_mat

In [38]:
def select_j_rand(i ,m):
    """Pick a random index ``j`` in ``range(m)`` that differs from ``i``.

    Args:
        i: index that must not be returned.
        m: number of available indices.

    Returns:
        An int ``j`` with ``j != i``.

    Raises:
        ValueError: if ``m < 2``; with fewer than two indices the rejection
            loop below could never find a partner and would spin forever.
    """
    if m < 2:
        raise ValueError("select_j_rand needs m >= 2 to find an index other than i, got m=%r" % (m,))

    j = i
    # Rejection sampling: redraw until the candidate differs from i.
    while j == i:
        j = int(random.uniform(0, m))
    return j

def clip_alpha(aj, H, L):
    """Clamp ``aj`` into the interval bounded by ``L`` (lower) and ``H`` (upper).

    The upper bound is applied first and the lower bound second, so when the
    bounds are inconsistent (``L > H``) the lower bound wins.

    Args:
        aj: value to clamp.
        H: upper bound.
        L: lower bound.

    Returns:
        The clamped value.
    """
    capped = H if aj > H else aj
    return L if L > capped else capped

In [39]:
class SVMclassifier(object):
    """Binary soft-margin SVM trained with a simplified SMO loop.

    ``kTup`` selects the kernel used while training:
    ``['linear']``, ``['rbf', sigma]``, ``['laplace', sigma]``,
    ``['poly', degree]`` or ``['sigmoid', scale, offset]``.

    ``caluelate_w`` and ``prediction`` work with an explicit weight vector, so
    they are only meaningful for a model trained with the linear kernel.
    """

    def __init__(self, kTup):
        self.kTup = kTup

    def kernelTrans(self, X, A, kTup):
        """Evaluate the kernel between every row of ``X`` and the single row ``A``.

        Args:
            X: (m, n) samples (anything ``np.asmatrix`` accepts).
            A: one sample with n features.
            kTup: kernel description, see the class docstring.

        Returns:
            An (m, 1) ``np.matrix`` whose k-th entry is ``K(X[k], A)``.

        Raises:
            ValueError: if ``kTup[0]`` is not a supported kernel name.
        """
        X = np.asmatrix(X)
        A = np.asmatrix(A)
        kind = kTup[0]

        if kind == 'linear':
            return X * A.T

        if kind in ('rbf', 'laplace'):
            delta = X - A  # A is broadcast over the rows of X
            sq_dist = np.sum(np.multiply(delta, delta), axis=1)
            if kind == 'rbf':
                # Gaussian kernel: exp(-||x - a||^2 / sigma^2)
                return np.exp(sq_dist / (-1 * kTup[1] ** 2))
            # Laplace kernel: exp(-||x - a|| / sigma)
            return np.exp(-np.sqrt(sq_dist) / kTup[1])

        if kind == 'poly':
            # Element-wise power; `**` on a matrix would be a matrix power.
            return np.power(X * A.T, kTup[1])

        if kind == 'sigmoid':
            return np.tanh(kTup[1] * (X * A.T) + kTup[2])

        raise ValueError("unsupported kernel %r; expected 'linear', 'rbf', "
                         "'laplace', 'poly' or 'sigmoid'" % (kind,))

    def _error(self, data_matrix, label_mat, alphas, b, k):
        """Return ``(E_k, K_k)``: the prediction error of sample ``k`` and the
        kernel column ``K(X, x_k)`` that was needed to compute it."""
        kernel_col = self.kernelTrans(data_matrix, data_matrix[k, :], self.kTup)
        fx = (np.multiply(alphas, label_mat).T * kernel_col)[0, 0] + b
        return fx - label_mat[k, 0], kernel_col

    def _try_update(self, i, data_matrix, label_mat, alphas, b, C, toler):
        """Try one SMO step with sample ``i`` as the first multiplier.

        ``alphas`` is updated in place when the step succeeds.

        Returns:
            ``(changed, b)``: whether a pair of multipliers was updated, and
            the (possibly new) bias.
        """
        m = data_matrix.shape[0]
        y_i = label_mat[i, 0]
        alpha_i_old = alphas[i, 0]
        Ei, K_i = self._error(data_matrix, label_mat, alphas, b, i)

        # Only samples that violate the KKT conditions (beyond `toler`) and
        # whose alpha can still move are worth optimising.
        violates_kkt = ((y_i * Ei < -toler and alpha_i_old < C) or
                        (y_i * Ei > toler and alpha_i_old > 0))
        if not violates_kkt:
            return False, b

        # Pick the partner multiplier at random.
        j = select_j_rand(i, m)
        y_j = label_mat[j, 0]
        alpha_j_old = alphas[j, 0]
        Ej, K_j = self._error(data_matrix, label_mat, alphas, b, j)

        # Box constraints for alpha_j along the line sum(alpha * y) = const.
        if y_i != y_j:
            L = max(0, alpha_j_old - alpha_i_old)
            H = min(C, C + alpha_j_old - alpha_i_old)
        else:
            L = max(0, alpha_j_old + alpha_i_old - C)
            H = min(C, alpha_j_old + alpha_i_old)
        if L == H:
            print("L == H")
            return False, b

        # Kernel values come from the configured kernel so that eta and b
        # agree with the decision function used for the errors above.
        K_ii = K_i[i, 0]
        K_ij = K_i[j, 0]
        K_jj = K_j[j, 0]
        # Second derivative of the objective along the constraint line.
        eta = 2.0 * K_ij - K_ii - K_jj
        if eta >= 0:
            print("eta >= 0")
            return False, b

        alpha_j_new = clip_alpha(alpha_j_old - y_j * (Ei - Ej) / eta, H, L)
        # Compare against the *old* alpha_j; nothing has been written to
        # `alphas` yet, so skipping here leaves the constraint intact.
        if abs(alpha_j_new - alpha_j_old) < 1e-5:
            print("alpha_j变化太小")
            return False, b

        # Move alpha_i by the same amount in the opposite direction.
        alpha_i_new = alpha_i_old + y_j * y_i * (alpha_j_old - alpha_j_new)
        alphas[j, 0] = alpha_j_new
        alphas[i, 0] = alpha_i_new

        delta_i = y_i * (alpha_i_new - alpha_i_old)
        delta_j = y_j * (alpha_j_new - alpha_j_old)
        b_1 = b - Ei - delta_i * K_ii - delta_j * K_ij
        b_2 = b - Ej - delta_i * K_ij - delta_j * K_jj
        # Prefer the bias implied by a multiplier strictly inside (0, C).
        if 0 < alpha_i_new < C:
            b = b_1
        elif 0 < alpha_j_new < C:
            b = b_2
        else:
            b = (b_1 + b_2) / 2

        return True, float(b)

    def smo(self, data_mat_In, class_label, C, toler, max_iter):
        """Train the SVM with SMO, alternating full and non-bound passes.

        A pass over the whole training set is followed by passes over the
        non-bound multipliers (0 < alpha < C) until one of those changes
        nothing, after which the whole set is scanned again.  Training stops
        when a full pass changes nothing or after ``max_iter`` passes.

        Args:
            data_mat_In: (m, n) training samples.
            class_label: sequence of m labels, each +1 or -1.
            C: upper bound for every multiplier.
            toler: tolerance used in the KKT violation check.
            max_iter: maximum number of passes.

        Returns:
            ``(b, alphas)``: the bias as a float and the multipliers as an
            (m, 1) ``np.matrix``.

        Raises:
            ValueError: if fewer than two samples are given (no pair of
                multipliers exists) or the number of labels differs from the
                number of samples.
        """
        data_matrix = np.asmatrix(data_mat_In, dtype=float)
        label_mat = np.asmatrix(class_label, dtype=float).transpose()

        m = np.shape(data_matrix)[0]
        if m < 2:
            raise ValueError("SMO optimises pairs of multipliers and needs at "
                             "least two samples, got %d" % m)
        if label_mat.shape[0] != m:
            raise ValueError("got %d labels for %d samples"
                             % (label_mat.shape[0], m))

        b = 0.0
        alphas = np.asmatrix(np.zeros((m, 1)))

        iter_num = 0
        alpha_pairs_changed = 0
        entire_set = True
        while iter_num < max_iter and (alpha_pairs_changed > 0 or entire_set):
            alpha_pairs_changed = 0
            if entire_set:
                candidates = range(m)
                report = "full-set 样本：%d , alpha优化次数：%d"
            else:
                # Indices of the multipliers strictly between the bounds.
                candidates = np.nonzero((alphas.A > 0) * (alphas.A < C))[0]
                report = "non-bound 样本：%d , alpha优化次数：%d"

            for i in candidates:
                changed, b = self._try_update(i, data_matrix, label_mat,
                                              alphas, b, C, toler)
                if changed:
                    alpha_pairs_changed += 1
                    print(report % (i, alpha_pairs_changed))
            iter_num += 1

            if entire_set:
                entire_set = False
            elif alpha_pairs_changed == 0:
                entire_set = True
            print("迭代次数: %d" % iter_num)

        return b, alphas

    def prediction(self, test, w, b):
        """Classify samples with the linear decision function ``x . w + b``.

        Args:
            test: (k, n) samples, or a single sample with n features.
            w: weight vector with n entries (flat or as an (n, 1) column).
            b: bias, a scalar or a single-element array/matrix.

        Returns:
            A list with 1 where the score is positive and -1 otherwise; an
            empty list when ``test`` is empty.
        """
        samples = np.asarray(test, dtype=float)
        if samples.size == 0:
            return []
        samples = np.atleast_2d(samples)
        weights = np.asarray(w, dtype=float).reshape(-1)
        bias = np.asarray(b, dtype=float).item()

        scores = samples.dot(weights) + bias
        return [1 if score > 0 else -1 for score in scores]

    def caluelate_w(self, data_mat, label_mat, alphas):
        """Compute the linear weight vector ``w = sum_k alpha_k * y_k * x_k``.

        Works for any number of features.

        Args:
            data_mat: (m, n) training samples.
            label_mat: m labels.
            alphas: m multipliers (flat or as an (m, 1) column).

        Returns:
            ``w`` as a nested list of shape (n, 1).
        """
        features = np.asarray(data_mat, dtype=float)
        labels = np.asarray(label_mat, dtype=float).reshape(-1, 1)
        multipliers = np.asarray(alphas, dtype=float).reshape(-1, 1)

        w = np.dot(features.T, multipliers * labels)
        return w.tolist()

In [40]:
if __name__ == "__main__":
    # Experiment settings, gathered in one place so they are easy to tune.
    SEED = 0            # seeds `random`, which picks the second alpha in SMO
    HOLDOUT_SIZE = 800  # the first HOLDOUT_SIZE rows are used for evaluation only
    PENALTY_C = 0.5     # upper bound C for every multiplier
    TOLERANCE = 1e-3    # KKT tolerance
    MAX_ITER = 200      # maximum number of SMO passes

    # Training draws random index pairs; fix the seed so a re-run of this
    # cell reproduces the same model and the same numbers.
    random.seed(SEED)

    data_set, category_train = loadDataset('train.csv')

    data_mat, label_mat = split_data(data_set)

    # Evaluate on the first HOLDOUT_SIZE rows and train on the remainder.
    test_mat = data_mat[:HOLDOUT_SIZE]
    test_label = label_mat[:HOLDOUT_SIZE]
    data_mat = data_mat[HOLDOUT_SIZE:]
    label_mat = label_mat[HOLDOUT_SIZE:]
    if not data_mat or not test_mat:
        raise ValueError("train.csv needs more than %d rows to leave data for "
                         "both evaluation and training" % HOLDOUT_SIZE)

    classifier = SVMclassifier(kTup=['linear'])

    # Train
    b, alphas = classifier.smo(data_mat, label_mat, PENALTY_C, TOLERANCE, MAX_ITER)

    w = classifier.caluelate_w(data_mat, label_mat, alphas)

    result = classifier.prediction(test_mat, w, b)

    # Number of held-out samples that were classified correctly
    count = sum(1 for predicted, actual in zip(result, test_label) if predicted == actual)
    # Number of survivors (label 1) among the training samples
    survived = sum(1 for label in label_mat if label == 1)

    print('survive_rate_in_training_set:'+str(survived/len(data_mat)*100)+'%')
    print('accuracy:'+str(count/len(result)*100)+'%')

L == H
L == H
L == H
L == H
full-set 样本：4 , alpha优化次数：1
full-set 样本：5 , alpha优化次数：2
L == H
full-set 样本：7 , alpha优化次数：3
full-set 样本：9 , alpha优化次数：4
L == H
L == H
L == H
full-set 样本：13 , alpha优化次数：5
full-set 样本：14 , alpha优化次数：6
full-set 样本：15 , alpha优化次数：7
full-set 样本：20 , alpha优化次数：8
full-set 样本：21 , alpha优化次数：9
full-set 样本：22 , alpha优化次数：10
L == H
L == H
full-set 样本：28 , alpha优化次数：11
full-set 样本：29 , alpha优化次数：12
full-set 样本：30 , alpha优化次数：13
L == H
L == H
L == H
full-set 样本：34 , alpha优化次数：14
L == H
full-set 样本：38 , alpha优化次数：15
full-set 样本：40 , alpha优化次数：16
full-set 样本：41 , alpha优化次数：17
full-set 样本：42 , alpha优化次数：18
full-set 样本：43 , alpha优化次数：19
full-set 样本：48 , alpha优化次数：20
full-set 样本：49 , alpha优化次数：21
full-set 样本：50 , alpha优化次数：22
full-set 样本：53 , alpha优化次数：23
full-set 样本：54 , alpha优化次数：24
full-set 样本：55 , alpha优化次数：25
full-set 样本：56 , alpha优化次数：26
full-set 样本：57 , alpha优化次数：27
full-set 样本：58 , alpha优化次数：28
full-set 样本：59 , alpha优化次数：29
L == H
full-set 样本：62 , alpha优化次数：30
full-set