In [11]:
from sklearn.datasets import load_iris
dataset = load_iris()
#print("shape:", X.shape)
#print("shape:", y.shape)
X = dataset.data
#print("data:", X)
y = dataset.target
#print("target:", y)
print(type(dataset))
print(dataset.DESCR)

<class 'sklearn.utils.Bunch'>
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSH

In [3]:
import numpy as np

new_col=X[:,2]/X[:,3]
#print(new_col)
#print(np.reshape(new_col,(len(new_col),1)))
new_col=np.reshape(new_col,(len(new_col),1))

X = np.concatenate((X,new_col),axis=1)

attribute_means = X.mean(axis=0)
print("The attribute means:")
print(attribute_means)
X_d = np.array(X >= attribute_means,dtype='int')
print("\nThe discretized dataset:")
print(X_d[-5:])

#print(list(zip(X, y)))

The attribute means:
[5.84333333 3.05733333 3.758      1.19933333 4.31049976 4.31049976]

The discretized dataset:
[[1 0 1 1 0 0]
 [1 0 1 1 0 0]
 [1 0 1 1 0 0]
 [1 1 1 1 0 0]
 [1 0 1 1 0 0]]


In [12]:
from collections import defaultdict
from operator import itemgetter
def train_feature_value(X,y_true,feature_index,value):
    class_counts = defaultdict(int)
    #print("zip(X,y_true):")
    #print(zip(X,y_true))
    #print(X)
    #print(y_true)
    #print(len(X))
    #print(len(y_true))
    #print(list(zip(X, y_true)))
    #print('\n')
    for sample, y in zip(X, y_true):
        #print(sample)
        #print(y)
        if sample[feature_index] == value:
            class_counts[y] += 1
    #print("class_counts:")
    #print(class_counts)
    sorted_class_counts = sorted(class_counts.items(),key=itemgetter(1),reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    #print(sorted_class_counts)
    incorrect_predictions = [class_count for class_value, class_count in class_counts.items()\
                             if class_value != most_frequent_class]
    error = sum(incorrect_predictions)
    return most_frequent_class, error


In [13]:
def train_on_feature(X, y_true, feature_index):
    values = set(X[:,feature_index])
    #print(values)
    predictors = {}
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X,y_true,feature_index,current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)
    total_errors = sum(errors)
    #print(predictors)
    #print(total_errors)
    return predictors, total_errors

In [14]:
from sklearn.model_selection import train_test_split
Xd_train, Xd_test, y_train,y_test = train_test_split(X_d,y,random_state=14)
all_predictors = {}
errors = {}
for feature_index in range(Xd_train.shape[1]):
    predictor, total_error = train_on_feature(Xd_train, y_train, feature_index)
    all_predictors[feature_index] = predictor
    errors[feature_index] = total_error
    best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
    model = {'feature': best_feature,'predictor': all_predictors[best_feature][0]}
print("all_predictors, errors:")
print(all_predictors)
print(errors)

    
    
    

here 6
all_predictors, errors:
{0: {0: 0, 1: 2}, 1: {0: 1, 1: 0}, 2: {0: 0, 1: 2}, 3: {0: 0, 1: 2}, 4: {0: 2, 1: 0}, 5: {0: 2, 1: 0}}
{0: 41, 1: 58, 2: 37, 3: 37, 4: 43, 5: 43}


In [25]:
total_test_samples=Xd_test.shape[0]
count_correct=0
for sample,y in zip(Xd_test, y_test):
    print("for the sample:",sample)
    if sample[2]==0:
        prediction=0
    else:
        prediction=2
    print("we predicted",prediction)
    print("the actual label (flower) was",y)
    if y==prediction:
        count_correct+=1
        print("success!! total successes =",count_correct)
    else:
        print("failure...")
print("accuracy = ",count_correct/total_test_samples)

for the sample: [0 1 0 0 1]
we predicted 0
the actual label (flower) was 0
success!! total successes = 1
for the sample: [0 1 0 0 1]
we predicted 0
the actual label (flower) was 0
success!! total successes = 2
for the sample: [0 1 0 0 1]
we predicted 0
the actual label (flower) was 0
success!! total successes = 3
for the sample: [0 0 1 1 0]
we predicted 2
the actual label (flower) was 1
failure...
for the sample: [0 0 1 1 0]
we predicted 2
the actual label (flower) was 2
success!! total successes = 4
for the sample: [1 0 1 1 0]
we predicted 2
the actual label (flower) was 1
failure...
for the sample: [0 1 0 0 1]
we predicted 0
the actual label (flower) was 0
success!! total successes = 5
for the sample: [0 0 1 1 0]
we predicted 2
the actual label (flower) was 1
failure...
for the sample: [0 1 0 0 0]
we predicted 0
the actual label (flower) was 0
success!! total successes = 6
for the sample: [1 0 1 1 0]
we predicted 2
the actual label (flower) was 1
failure...
for the sample: [0 0 1 1 0