In [531]:
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [532]:
from sklearn import datasets
iris = datasets.load_iris()
list(iris.keys())

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename',
 'data_module']

In [533]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [534]:
X = iris["data"][:,2:]
y = iris["target"]
X = np.c_[np.ones([len(X), 1]), X]

In [535]:
classes = iris["target_names"]
classes

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [536]:
lr = 0.1
m = len(X)
param_matrix = np.random.randn(3,3)

In [537]:
from math import exp
def softmax(scores, cl):
    #print(scores)
    final_scores = np.zeros(3, dtype = np.float64)
    exp_sum = 0 
    exp_func = np.vectorize(exp)
    final_scores = exp_func(scores)
    exp_sum = sum(final_scores)
    final_scores = final_scores / exp_sum
    return final_scores[cl]
def softmax1(scores):
    #print(scores)
    final_scores = np.zeros(3, dtype = np.float64)
    exp_sum = 0 
    exp_func = np.vectorize(exp)
    final_scores = exp_func(scores)
    exp_sum = sum(final_scores)
    final_scores = final_scores / exp_sum
    return final_scores

In [538]:
softmax1(np.asarray([-3, 4, 5])) #softmax works

array([2.45182703e-04, 2.68875482e-01, 7.30879336e-01])

In [539]:
y_prepared = np.zeros(len(y))
y_prepared = [(y == 0), (y == 1), (y == 2)]
y_prepared = np.array(list(zip(y_prepared[0], y_prepared[1], y_prepared[2])))
y_prepared = y_prepared.astype(int)
y_prepared 

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

In [540]:
def predict(p_matrix, x):
    scores = np.zeros(len(classes))
    for class_num in range(len(classes)):
        theta = p_matrix[class_num]
        score = sum(theta * x) + theta[0]
        scores[class_num] = score
    return scores
    
    
    

In [541]:
import math
def cross_entropy(p_matrix, X):
    cost_sum = 0
    for i in range(m):
        for k in range(len(classes)):
            expected = y_prepared[i]
            x = X[i]
            probability = predict(p_matrix, x, k)
            pred = expected[k] * softmax(predictions , k)
            cost_sum += math.log(pred)
    return cost_sum * (-1 / m)

In [542]:
def cross_entropy_gradient(p_matrix, k):
    grad_sum = np.zeros(3)
    for i in range(m):
        x = X[i]
        predictions = predict(p_matrix, x)
        #print(predictions)
        expected = y_prepared[i]
        p = softmax(predictions, k)
        y = expected[k]
        grad_sum += (p - y) * x
    return grad_sum * (1 / m)
    

In [543]:
def softmax2(logits):
    exps = np.exp(logits)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [544]:
def cross_gradient_new(p_matrix):
    logits = X.dot(p_matrix)
    y_proba = softmax2(logits)
    error = y_proba - y_prepared
    gradients = (1 / m) * X.T.dot(error)
    return gradients
    

In [545]:
def train(X, p_matrix):
    for k in range(len(classes)):
        grad = cross_entropy_gradient(p_matrix, k)
        #print(grad)
        theta = p_matrix[k]
        theta -= (lr * grad)
        p_matrix[k] = theta


In [546]:
def train_new(X, p_matrix):
    gradients = cross_gradient_new(p_matrix)
    p_matrix -= (lr) * gradients

In [547]:
param_matrix

array([[ 0.49671415, -0.1382643 ,  0.64768854],
       [ 1.52302986, -0.23415337, -0.23413696],
       [ 1.57921282,  0.76743473, -0.46947439]])

In [548]:
test(param_matrix)

0.3333333333333333

In [552]:
n_iterations = 3000
for _ in range(n_iterations):
    train(X, param_matrix)
    #train_new(X, param_matrix)
param_matrix

array([[ 6.2280141 , -2.13363766, -2.28858921],
       [ 1.98905277,  0.91077146, -1.43381821],
       [-4.55007535,  1.8524178 ,  3.73779699]])

In [550]:
def test(X, p_matrix):
    correct_results = 0
    for i in range(len(X)):
        x = X[i]
        expected = y[i]
        pred = predict(p_matrix, x)
        res = softmax1(pred)
        guess = np.argmax(res)
        if(guess == expected):
            correct_results += 1
    return correct_results / len(X)

In [553]:
test(param_matrix)

0.96