In [1]:
import numpy as np
import pandas as pd
from numpy import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
def autoNorm(dataSet):
    #获得数据的最小值
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    #最大值和最小值的范围
    ranges = maxVals - minVals
    #shape(dataSet)返回dataSet的矩阵行列数
    normDataSet = np.zeros(np.shape(dataSet))
    #返回dataSet的行数
    m = dataSet.shape[0]
    #原始值减去最小值
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    #除以最大和最小值的差,得到归一化数据
    normDataSet = normDataSet / np.tile(ranges, (m, 1))
    #返回归一化数据结果,数据范围,最小值
    return normDataSet

In [22]:
class Probability_NN:
    def __init__(self, feature_num, sigma = 1):
        self.feature_num = feature_num
        self.class_num = 2
        self.sigma = sigma
        self.choice = 1 #正太窗口
        self.prior_0 = 0           #每个类的先验概率
        self.prior_1 = 0
        
    def learn(self, x_data, y_data):  #只读入train_set
        self.class0, self.class1 = self.split_the_data(x_data, y_data)  #without label
        self.prior_0 = self.class0.shape[0]/(self.class0.shape[0]+self.class1.shape[0])    #p(w1) = N1/(N1+N2)
        self.prior_1 = 1-self.prior_0
        
    def predict(self,x_test):     #读入test_set中的某个样本
        prediction = self.feedforward(x_test)

        return prediction
              
    def feedforward(self,x_sample):     #x_sample即x_test
        likehood_0 = self.Parzen_window(self.choice,x_sample,self.class0)   #计算似然
        likehood_1 = self.Parzen_window(self.choice,x_sample,self.class1)
        
        posterior_0 = likehood_0 * self.prior_0                  #计算后验概率
        posterior_1 = likehood_1 * self.prior_1
        
        if posterior_0 > posterior_1:
            return 0
        
        return 1
                     
    def Parzen_window(self,choice,x_sample,group):
        k_N = 0
        N = group.shape[0]
        d = self.feature_num
        h_N = self.sigma/np.sqrt(N)     #the width of the window
        if choice == 1:     #正太窗
            for i in range(N):
                dist = np.linalg.norm(group[i] - x_sample)
                k_N += 1/(2*pi)**(d/2)*exp(-1/2*(dist/h_N)**2)
                
            likehood = k_N/(N*h_N**d)
            
        return likehood
            

    def split_the_data(self, Data,label):           # Data没有label
        #将数据划分为2类       
        #0代表负类（多数类），1代表正类（少数类）
        Mclass_0 = Data[np.where(label == 0)]
        Mclass_1 = Data[np.where(label == 1)]

        return Mclass_0, Mclass_1

In [19]:
#数据读取(pima)
#skiprows[a,b,c,d] abcd行不读取//// sep=‘\s+’识别切割的字符（空格，或多个空格），默认为 “，”。
name = '01.87_pima.dat'
skip = []

#需要忽略的行数
n = -1   

for line in open(name):
    n += 1
    if line[0] != '@':
        break

for i in range(n):
    skip.append(i)

data = pd.read_table(name,header=None,skiprows = skip,sep='\s+')

sample_num = data.shape[0]
feature_num = data.shape[1]-1
fea = feature_num
if feature_num == 0:
    feature_num = len(data.values[0][0].split(','))-1
#print("样本数量:",sample_num)
#print("特征数量:",feature_num)
#print(dataname)
#print(type(data))

#将数据存入一个矩阵
matrix = np.zeros((sample_num,feature_num+1))
if fea != 0:
    for i in range(sample_num):
        for j in range(feature_num+1):
            if j<feature_num:
                d_value = float(data[j][i].rstrip(','))
                matrix[i][j] = d_value
            else:
                if data[j][i]=='positive':
                    matrix[i][j] = 1
                else:
                    matrix[i][j] = 0
else:
    for i in range(sample_num):
        str_data = data.values[i][0].split(',')
        for j in range(feature_num+1):
            if j<feature_num:
                d_value = float(str_data[j])
                matrix[i][j] = d_value
            else:
                if str_data[j]=='positive':
                    matrix[i][j] = 1
                else:
                    matrix[i][j] = 0

In [7]:
data = matrix

In [40]:
if __name__ == "__main__":
    
#     data_frame =  pd.read_csv("2019_LOTAAS_1_Lyon_Features (8).csv", header=None)
    data_frame =  pd.read_csv("data4.csv", header=None)
    data = np.array(data_frame)
    feature_num = data.shape[1]-1
    x_data = data[:,0:feature_num]    #without label
    y_data = mat(data[:,feature_num]).T
    x_data_norm = autoNorm(x_data)

    x_train, x_test, y_train, y_test = train_test_split(x_data_norm, y_data,test_size=0.2,stratify=y_data)
    
    prediction = np.zeros([1,y_test.shape[0]])

    PNN = Probability_NN(feature_num) 

    PNN.learn(x_train,y_train)         #建立实例

    #开始测试
    for i in range(x_test.shape[0]):
        prediction[0,i] = PNN.predict(x_test[i])
        
    acc = accuracy_score(y_test,prediction.T)
    auc = roc_auc_score(array(y_test),prediction.T) #按label
    f1 = f1_score(array(y_test),prediction.T)
    recall = recall_score(array(y_test),prediction.T)
    
    print('acc:',acc)
    print('auc:',auc)
    print('recall:',recall)
    print('f1_score:',f1)

acc: 0.835
auc: 0.835
recall: 0.81
f1_score: 0.8307692307692308


In [81]:
feature_num = data.shape[1]-1
x_data = data[:,0:feature_num]    #without label
y_data = mat(data[:,feature_num]).T
x_data_norm = autoNorm(x_data)
x_train, x_test, y_train, y_test = train_test_split(x_data_norm, y_data,test_size=0.2)
feature_num

2

In [41]:
prediction

array([[0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
        0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0.,
        1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1.,
        1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
        1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
        0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
        0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
        1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
        0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
        0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
        0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
        0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 

In [43]:
y_data

matrix([[1.],
        [0.],
        [1.],
        ...,
        [1.],
        [0.],
        [1.]])