In [56]:
import pandas
import numpy

import matplotlib
from matplotlib import pyplot

import seaborn

from scipy.stats import mode

from sklearn import metrics

%matplotlib inline

In [3]:
#read csv
data_folder = 'data/zoo_data/'
class_data_frame = pandas.read_csv(data_folder + 'class.csv')
zoo_data_frame = pandas.read_csv(data_folder + 'zoo.csv')

In [4]:
class_data_frame.head()

Unnamed: 0,Class_Number,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
0,1,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
1,2,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."
2,3,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
3,4,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."
4,5,4,Amphibian,"frog, frog, newt, toad"


In [5]:
zoo_data_frame.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [6]:
len(zoo_data_frame)

101

In [7]:
#check null
zoo_data_frame.apply(lambda x: x.isnull()).sum()

animal_name    0
hair           0
feathers       0
eggs           0
milk           0
airborne       0
aquatic        0
predator       0
toothed        0
backbone       0
breathes       0
venomous       0
fins           0
legs           0
tail           0
domestic       0
catsize        0
class_type     0
dtype: int64

In [10]:
zoo_data_frame.dtypes

animal_name    object
hair            int64
feathers        int64
eggs            int64
milk            int64
airborne        int64
aquatic         int64
predator        int64
toothed         int64
backbone        int64
breathes        int64
venomous        int64
fins            int64
legs            int64
tail            int64
domestic        int64
catsize         int64
class_type      int64
dtype: object

In [79]:
#model

#naive bayes

#class probability
#feature|class probability product
#calculate posterior for all classes
#max of posterior as prediction

def get_class_probability(data_frame, class_type):
    return len(data_frame.loc[(data_frame['class_type'] == class_type)]) * 1.0 / len(data_frame)

def get_feature_class_probability(data_frame, features, class_type, row):
    product = 1.0
    for feature in features:
        feature_data_frame = data_frame.loc[(data_frame[feature] == row[feature])]
        product = product * get_class_probability(feature_data_frame, class_type)
                                             
    return product

def get_max_posterior(class_data_frame, data_frame, features, row):
    posterior_array = []
    for class_type in class_data_frame['Class_Number']:
        posterior_array.append(get_class_probability(data_frame, class_type)
             * get_feature_class_probability(data_frame, features, class_type, row))
    
#     print(posterior_array)
    return numpy.argmax(posterior_array)

def get_prediction(class_data_frame, data_frame, features, test_data_frame):
    return test_data_frame.apply(lambda row: \
        get_max_posterior(class_data_frame, data_frame, features, row), axis=1)\
        .apply(lambda x: x + 1)

    
#unit testing
get_class_probability(zoo_data_frame, 1)

test_data_frame = zoo_data_frame.sample(n=1)
test_data_frame.apply(lambda row: get_feature_class_probability(zoo_data_frame, ['hair'], 1, row), axis=1)
# test_data_frame.apply(lambda row: \
#         get_max_posterior(class_data_frame, zoo_data_frame, ['hair'], row), axis=1)


68    0.906977
dtype: float64

In [87]:
#train data
train_data_frame = zoo_data_frame.copy(deep=True)

# features = zoo_data_frame.columns.values.tolist()
# features.remove('class_type')
# features.remove('animal_name')

features = ['hair', 'legs']

accuracies = []
for i in range(10):
    test_data_frame = train_data_frame.sample(n=10)
    test_data_frame['prediction'] = get_prediction(class_data_frame, train_data_frame, features, test_data_frame)
#     print(test_data_frame[['class_type', 'prediction']])
    accuracy = metrics.accuracy_score(test_data_frame['class_type'], test_data_frame['prediction'])
    print('accuracy ' + str(accuracy))
    accuracies.append(accuracy)
    
print('mean accuracy ' + str(numpy.mean(accuracies)))

accuracy 0.9
accuracy 1.0
accuracy 0.8
accuracy 0.9
accuracy 0.9
accuracy 0.9
accuracy 1.0
accuracy 0.9
accuracy 0.7
accuracy 0.9
mean accuracy 0.89
