In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
iris = load_iris()

In [3]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
df = pd.DataFrame(np.c_[iris.data,iris.target], columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'target'])

In [5]:
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [6]:
for c in df.columns.values[:-1]:
    df[c] = np.around(df[c].values)

In [7]:
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.0,4.0,1.0,0.0,0.0
1,5.0,3.0,1.0,0.0,0.0
2,5.0,3.0,1.0,0.0,0.0
3,5.0,3.0,2.0,0.0,0.0
4,5.0,4.0,1.0,0.0,0.0


In [8]:
train, test = train_test_split(df, test_size = 0.2, random_state = 1)

In [9]:
features = ['sepal length', 'sepal width', 'petal length', 'petal width']

In [10]:
marginal_prob = pd.DataFrame(train.groupby(features).size()/len(train))
marginal_prob.reset_index(inplace = True)
# P(y)
prior_prob = 1/3

In [11]:
#likelihood[class][feature name][feature value]
likelihood = {}
for t in train.target.unique():
    likelihood[t] = {}
    t_num = len(train[train['target'] == t])
    for f in features:
        likelihood[t][f] = {}
        v_counts = pd.DataFrame(train[train['target']==t][f].value_counts())
        for i,v in v_counts.iterrows():
            likelihood[t][f][i] = v.values[0]/t_num
        

In [12]:
# P(x_i|y), sample = x_i, target = y
def get_likelihood(sample, target):
    tmp_prob = 1
    for f in features:
        if target in likelihood and f in likelihood[target] and sample[f] in likelihood[target][f]:
            tmp_prob *= likelihood[target][f][sample[f]]
        else :
            return 0
    return tmp_prob

In [13]:
# P(x_i)
def get_marginal_prob(li):
    tmp = marginal_prob
    for i,v in enumerate(li):
        tmp = tmp[tmp[features[i]]==v]
        if len(tmp) == 0: return 0.00000001
    return tmp[0].values[0]

In [14]:
predict = []
for i,v in test.iterrows():
    prob = []
    marg = get_marginal_prob(v[features].values)
    for c in [0,1,2]:
        like = get_likelihood(v[features], c)
        #predicted y = argmax(likelihood*prior_probability/marginal_probability)
        prob.append(like*prior_prob/marg)
    predict.append(np.argmax(prob))

In [15]:
test['predict'] = predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predict'] = predict


In [16]:
test

Unnamed: 0,sepal length,sepal width,petal length,petal width,target,predict
14,6.0,4.0,1.0,0.0,0.0,0
98,5.0,2.0,3.0,1.0,1.0,1
75,7.0,3.0,4.0,1.0,1.0,1
16,5.0,4.0,1.0,0.0,0.0,0
131,8.0,4.0,6.0,2.0,2.0,2
56,6.0,3.0,5.0,2.0,1.0,2
141,7.0,3.0,5.0,2.0,2.0,2
44,5.0,4.0,2.0,0.0,0.0,0
29,5.0,3.0,2.0,0.0,0.0,0
120,7.0,3.0,6.0,2.0,2.0,2


In [17]:
acc = 0
for i,v in test.iterrows():
    if v['target'] == v['predict']:
        acc+=1
print('정확도 : ',acc/len(test) * 100,'%')

정확도 :  93.33333333333333 %
