In [1]:

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import multivariate_normal
# Soil-type codes recognised by convert2fea; a code's position in this list is
# its categorical index. The final entry is the empty string, so a record with
# no soil code maps to the last index.
soiltypeindex = [
    'DL', 'DSG', 'ED', 'EQ', 'ES', 'F', 'FYLD', 'GC',
    'GL', 'GNG', 'GS', 'HAG', 'HG', 'HSL', 'HV', 'JV',
    'KQ', 'KS', 'LL', 'ML', 'MSG', 'PAM', 'PL', 'ROG',
    'SK', 'SO', 'SVG', 'T', 'VAG', 'Y', 'ZK', '',
]
# Largest valid soil-type index as a float (31.0 for the list above).
soiltype_norm = float(len(soiltypeindex) - 1)

# Input files: positive observations and negative observations.
observationFile = 'Limonium_vulgare.txt'
observationFileNegative = 'Limonium_vulgare_neg.txt'

def convert2fea(line):
    """Convert one ';'-separated text record into a list of seven float features.

    Fields are addressed by position after splitting ``line`` on ';'. The
    meaning of each column is not stated in this file, so the description
    below is limited to the arithmetic applied:

    0. field 2 divided by 10, or 1.0 when field 2 is empty
    1. (field 4 - 1) / 4
    2. 0.0 when field 5 equals 'CON', otherwise 2.0
    3. index of field 6 in ``soiltypeindex`` divided by the largest index
    4. field 7 / 48991
    5. (field 8 - 441994) / (892641 - 441994)
    6. (field 9 - 6050562) / (6402150 - 6050562)

    Parameters
    ----------
    line : str
        A single record with at least ten ';'-separated fields.

    Returns
    -------
    list of float
        The seven features in the order listed above.

    Raises
    ------
    IndexError
        If ``line`` has fewer than ten fields.
    ValueError
        If a numeric field cannot be parsed as a float, or field 6 is not
        present in ``soiltypeindex``.
    """
    fields = line.split(';')

    # An empty field 2 is replaced by the fixed value 1.0 instead of being parsed.
    if fields[2] == '':
        first = 1.0
    else:
        first = float(fields[2]) / 10.0

    # NOTE(review): the other features look like they are scaled to about [0, 1];
    # the 2.0 used for the non-'CON' case may be deliberate or a typo for 1.0 -
    # confirm before changing, since it affects the fitted model.
    category = 0.0 if fields[5] == 'CON' else 2.0

    # Derive the divisor from the list itself (31 for the 32-entry list) so it
    # cannot go stale if codes are added or removed; max() avoids a division by
    # zero for a one-entry list.
    soil_divisor = float(max(len(soiltypeindex) - 1, 1))

    return [
        first,
        (float(fields[4]) - 1.0) / 4.0,
        category,
        float(soiltypeindex.index(fields[6])) / soil_divisor,
        float(fields[7]) / 48991.0,
        (float(fields[8]) - 441994.0) / (892641.0 - 441994.0),
        (float(fields[9]) - 6050562.0) / (6402150.0 - 6050562.0),
    ]

# --- Read positive data ------------------------------------------------------
# The first line of the file is skipped (the original loop started at index 1);
# every remaining line becomes one feature vector.
with open(observationFile, 'r') as f:
    lines_pos = f.read().splitlines()
pos_fea = [convert2fea(line) for line in lines_pos[1:]]

# --- Split positive data into training and testing ---------------------------
# The last `testSetSize` positive rows are held out; everything before is training.
dim = len(pos_fea)
testSetSize = 100
trainSetSize = dim - testSetSize
print ('number of positive observations: '+str(dim))
print ('observations for testing: '+str(testSetSize))
print ('observations for training: '+str(trainSetSize))

# np.cov needs at least two samples; with fewer (or a negative size) the slices
# below would silently select the wrong rows or yield a NaN covariance.
if trainSetSize < 2:
    raise ValueError(
        'need at least %d positive observations to hold out %d for testing, got %d'
        % (testSetSize + 2, testSetSize, dim))

pos_train = pos_fea[:trainSetSize]
pos_train_lab = np.ones((trainSetSize, 1))
pos_test = pos_fea[trainSetSize:trainSetSize + testSetSize]
pos_test_lab = np.ones((testSetSize, 1))

# --- Read negative data ------------------------------------------------------
with open(observationFileNegative, 'r') as f:
    lines_neg = f.read().splitlines()
neg_fea = [convert2fea(line) for line in lines_neg[1:]]

# --- Split negative data into training and testing ---------------------------
# The same index ranges as for the positives are used. If the negative file is
# shorter, neg_test will contain fewer than `testSetSize` rows.
neg_train = neg_fea[:trainSetSize]
neg_train_lab = np.zeros((trainSetSize, 1))
neg_test = neg_fea[trainSetSize:trainSetSize + testSetSize]
neg_test_lab = np.zeros((testSetSize, 1))

# --- Estimate mean and covariance matrix -------------------------------------
# Only the positive training rows are used to fit the Gaussian.
train_fea = np.asarray(pos_train)
mean_val = np.mean(train_fea, axis=0)
# Despite its name, std_val holds the full covariance matrix (columns are variables).
std_val = np.cov(train_fea, rowvar=0)

model = multivariate_normal(mean=mean_val, cov=std_val)
numToPrint = 100  # not referenced in this cell

# --- Evaluate ----------------------------------------------------------------
# A sample is classified as positive when its density under the fitted Gaussian
# exceeds this threshold.
densityThreshold = 5
posPredicted = model.pdf(pos_test)
negPredicted = model.pdf(neg_test)
numCorrect = np.sum(posPredicted > densityThreshold) + np.sum(negPredicted <= densityThreshold)
# Divide by the number of samples actually scored rather than a fixed 200, so
# the figure stays right if testSetSize changes or the negative file is short.
numEvaluated = len(pos_test) + len(neg_test)
print ('Accuracy:')
print (numCorrect / float(numEvaluated))


number of positive observations: 559
observations for testing: 100
observations for training: 459
Accuracy:
0.94
