In [2]:
import numpy as np
from random import randint

def loadData(file):
    content = np.loadtxt(file, dtype=np.str, delimiter=',')[1:]
    types = [line[2] for line in content]
    attrs = [[float(num) for num in line[6:11]] for line in content]
    return types, attrs

def splitCases(x, y):
    if len(x) != len(y):
        return [], [], [], []
    trainX, trainY, testX, testY = [], [], [], []
    for i in range(len(x)):
        if randint(1, 4) > 3:
            testX.append(x[i])
            testY.append(y[i])
        else:
            trainX.append(x[i])
            trainY.append(y[i])
    return trainX, trainY, testX, testY

types, attrs = loadData('./data/gf.csv')
TrainX, TrainY, TestX, TestY = splitCases(attrs, types)

- 将数据按照枪种分类，并且计算每一种枪的高斯分布模型以及每一种枪的个数占比

In [3]:
def splitByType(x, y):
    _groups = {}
    for _i in range(len(x)):
        if y[_i] not in _groups:
            _groups[y[_i]] = []
        _groups[y[_i]].append(x[_i])
    return _groups

def gaussianFit(_attr):
    _n, _m = len(_attr), len(_attr[0])
    _mean = np.zeros(_m)
    _sigma = np.mat(np.zeros((_m, _m)))
    for _j in range(_m):
        for _i in range(_n):
            _mean[_j] += _attr[_i][_j]
        _mean[_j] /= _n
    for _i in range(_n):
        _tmp = np.mat([_attr[_i][_j] - _mean[_j] for _j in range(_m)])
        _sigma += np.transpose(_tmp) * _tmp
    return _mean, _sigma

Group = splitByType(TrainX, TrainY)
names, pClass, means, sigmas = [], [], [], []
count = 0
for name, attr in Group.items():
    names.append(name)
    pClass.append(len(attr))
    count += len(attr)
    mean, sigma = gaussianFit(attr)
    means.append(mean)
    sigmas.append(sigma)
pClass = [num / count for num in pClass]

- 计算概率并最终得出结论的函数
- 这里只要比较分子就可以了，因为分母是相同的；高斯的概率密度函数也可以化简

In [4]:
def calcPossibility(x, i: int):
    _tmp = np.mat(x - means[i])
    _inv = np.linalg.inv(sigmas[i])
    _exp = np.exp(-0.5 * _tmp * _inv * np.transpose(_tmp))
    return pClass[i] * _exp / np.sqrt(np.linalg.det(sigmas[i]))

def classifier(x):
    _maxId, _maxP = 0, -1
    for _i in range(len(names)):
        _currP = calcPossibility(x, _i)
        if _maxP < _currP :
            _maxId, _maxP = _i, _currP
    return names[_maxId]

- 测试数据。

In [5]:
correct = 0
for i in range(len(TestX)):
    guess = classifier(TestX[i])
    print('predict: %s, answer: %s' % (guess, TestY[i]))
    if guess == TestY[i]:
        correct += 1
print('Rate of correct predictions: %f' % (correct / len(TestX)))

predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: MG, answer: MG
predict: MG, answer: MG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: RF, answer: RF
predict: HG, answer: HG
predict: SMG, answer: SMG
predict: AR, answer: AR
predict: AR, answer: AR
predict: SMG, answer: SMG
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: RF, answer: RF
predict: RF, answer: R

- 我佛了，怎么这么高的……我可没有犯把train和test当作同一个的错误啊。。。
- 用sklearn试试？

In [6]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(TrainX, TrainY)
res = model.predict(TestX)
count = 0
for i in range(len(res)):
    print('predict: %s, answer: %s' % (res[i], TestY[i]))
    if res[i] == TestY[i]:
        count += 1
print('Rate of correct predictions: %f' % (count / len(TestY)))

predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: HG, answer: HG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: RF, answer: RF
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: MG, answer: MG
predict: MG, answer: MG
predict: SMG, answer: SMG
predict: SMG, answer: SMG
predict: RF, answer: RF
predict: HG, answer: HG
predict: SMG, answer: SMG
predict: AR, answer: AR
predict: AR, answer: AR
predict: SMG, answer: SMG
predict: AR, answer: AR
predict: AR, answer: AR
predict: AR, answer: AR
predict: RF, answer: RF
predict: RF, answer: R

- 好吧，确实挺好用的。。两三行搞定我半天写的东西。。

In [18]:
for i in range(len(sigmas)):
    print('Covariance matrix for %s' % names[i])
    print(sigmas[i])

Covariance matrix for HG
[[ 53742.96875   1333.28125  -4061.875   -11030.78125  -2615.46875]
 [  1333.28125    556.46875    327.875     -609.96875   -432.78125]
 [ -4061.875      327.875     2283.5       1339.125     -111.125  ]
 [-11030.78125   -609.96875   1339.125     4815.46875    176.28125]
 [ -2615.46875   -432.78125   -111.125      176.28125   1193.96875]]
Covariance matrix for SMG
[[ 2.20652679e+05  7.09821429e+02 -1.90000000e+02 -1.86701786e+04
  -1.24607143e+03]
 [ 7.09821429e+02  1.52678571e+02 -2.90000000e+01 -1.96321429e+02
  -1.50928571e+02]
 [-1.90000000e+02 -2.90000000e+01  5.20000000e+01  1.15000000e+02
  -6.80000000e+01]
 [-1.86701786e+04 -1.96321429e+02  1.15000000e+02  2.51867857e+03
   3.66071429e+02]
 [-1.24607143e+03 -1.50928571e+02 -6.80000000e+01  3.66071429e+02
   1.58042857e+03]]
Covariance matrix for RF
[[15886.95652174  -578.26086957  -984.13043478 -1459.13043478
   -256.73913043]
 [ -578.26086957  7262.43478261   811.2173913   -451.7826087
   -885.43478261