# What are we doing?
Predict who survived (1) or died (0) based on input information.

Raw data is also availble on wikipedia.

In [696]:
import numpy as np
import pandas as pd
import collections
from random import shuffle, randint

from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

Getting errors with sklearn? 

Scikit-learn requires:
<li>Python (>= 2.6 or >= 3.3),
<li>NumPy (>= 1.6.1),
<li>SciPy (>= 0.9).

Try:
conda install scikit-learn
OR
pip install -U scikit-learn

Still got probs?

python -m pip install --upgrade pip
pip install --user numpy scipy matplotlib ipython jupyter pandas sympy nose

In [697]:
#read in csv file
data = pd.read_csv("iris.csv", header=None)
trainPercent = 30 #must be between 0 and 100
lmbda = .00001
numberRowsOfEachClass = 50
data.head() #look at first 5 rows of the data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Useful functions
<li>date_frame.head()
<li>data.columns
<li>data_frame.shape


In [698]:
#Grab desired inputs to test one. Build a dataframe from them.
dataInputs= data.loc[:,0:3]
dataInputs.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [699]:
#Grab the expected outputs. (This is supervised learning)
y1=[]
y2=[]
y3=[]
for i in range(0,numberRowsOfEachClass):
    y1.append([1,0,0]);#setosa
    y2.append([0,1,0]);#versicolor
    y3.append([0,0,1]);#virginica
expectedOutput=np.concatenate([y1,y2,y3])


# Breakdown the input data into test and train

In [700]:
#Hint: use train_test_split
inputTrain, inputTest, expectedOutputTrain, expectedOutputTest = train_test_split(dataInputs, expectedOutput, train_size = trainPercent/100)

#inputTest.head()
print(inputTrain.shape)
print(expectedOutputTrain.shape)
print(inputTest.shape)
print(expectedOutputTest.shape)
d = collections.OrderedDict()
for a in expectedOutputTest:
    t = tuple(a)
    if t in d:
        d[t] += 1
    else:
        d[t] = 1

result = []
for (key, value) in d.items():
    result.append(list(key) + [value])

B = np.asarray(result)
print(B)
for Bi in B:
    if Bi[0] == 1:
        setosaTotal = Bi[3]
    elif Bi[1] == 1:
        versicolorTotal = Bi[3]
    elif Bi[2] == 1:
        virginicaTotal = Bi[3]
    
print(setosaTotal)
print(versicolorTotal)
print(virginicaTotal)

(45, 4)
(45, 3)
(105, 4)
(105, 3)
[[ 0  0  1 31]
 [ 0  1  0 34]
 [ 1  0  0 40]]
40
34
31


# Get weight matrix

In [701]:
#Build our least squares classifier for 2 classes
D = inputTrain.shape[1] + 1 #num of attributes, +1 is for the intercept (column of 1s)
K = expectedOutput.shape[1]

In [702]:
expectedOutputTrain = np.asarray(expectedOutputTrain)
expectedOutputTest = np.asarray(expectedOutputTest)
inputTrain = np.asarray(inputTrain)
inputTest = np.asarray(inputTest)
p = np.append(1, inputTrain[0])
p.shape
p.reshape(1,D)

array([[ 1. ,  4.8,  3. ,  1.4,  0.3]])

In [703]:
sum1 = 0
sum2 = 0
numRows = inputTrain.shape[0]
new_col = np.ones((numRows,1))
x = np.c_[new_col, inputTrain]
sum1 = np.dot(x.T, x) + lmbda
sum2 = np.dot(x.T, expectedOutputTrain)
print(sum1)
print(sum2)

[[   45.00001   267.80001   133.10001   188.60001    62.80001]
 [  267.80001  1621.34001   789.97001  1173.96001   393.30001]
 [  133.10001   789.97001   398.39001   549.46001   182.95001]
 [  188.60001  1173.96001   549.46001   912.74001   312.80001]
 [   62.80001   393.30001   182.95001   312.80001   109.44001]]
[[  10.    16.    19. ]
 [  49.2   94.8  123.8]
 [  33.    44.1   56. ]
 [  14.5   68.4  105.7]
 [   2.6   22.2   38. ]]


In [704]:
# i=0
# sum1=0
# sum2=0
# for xi in inputTrain:
#     xi = np.append(1, xi)
#     xi = xi.reshape(1,D)
#     yi=expectedOutputTrain[i].reshape(1,K)          
#     sum1 += np.dot(xi, xi.T) + lmbda   
#     sum2 += np.dot(xi.T, yi)
#     i += 1
# sum1 = np.asscalar(sum1)
# print(sum1)
# print(sum2)

In [705]:
W= np.dot(np.linalg.inv(sum1), sum2)
#W = np.array(W)[np.newaxis]
W

array([[-0.4380229 ,  2.73799612, -1.30000756],
       [ 0.0704297 , -0.03326769, -0.03715768],
       [ 0.38145307, -0.72143026,  0.33998148],
       [-0.13807884,  0.04448016,  0.09359878],
       [-0.22101551, -0.16986219,  0.39087431]])

In [706]:
#testing
total = expectedOutputTest.shape[0]
predicted = np.zeros((total, K))
i=0
setosaCorrect=versicolorCorrect=virginicaCorrect=totalCorrect=0
correctPredictionCol = [0]*total

In [707]:
for i in range(total):
    x = inputTest[i]    
    x = np.append(1,x)
    x = x.reshape(1,D)
    values = np.dot(W.T,x.T)
    values = values.T
    values = values.flatten()
    maxIndex = np.argmax(values)
    if expectedOutputTest[i][maxIndex] == 1:
        if maxIndex == 0:
            setosaCorrect += 1
        elif maxIndex == 1:
            versicolorCorrect += 1
        elif maxIndex == 2:
            virginicaCorrect += 1
        correctPredictionCol[i] = 1
    predicted[i][maxIndex] = 1
totalCorrect = setosaCorrect + versicolorCorrect + virginicaCorrect
print(totalCorrect)
totalAccuracy=totalCorrect/float(total)*100
setosaAccuracy = setosaCorrect/float(setosaTotal)*100
versicolorAccuracy = versicolorCorrect/float(versicolorTotal)*100
virginicaAccuracy = virginicaCorrect/float(virginicaTotal)*100
print("setosa misclassification error = ", 100 - setosaAccuracy)
print("versicolor misclassification error = ", 100 - versicolorAccuracy)
print("virginica misclassification error = ", 100 - virginicaAccuracy)
print("total misclassification error = ", 100 - totalAccuracy)
print(total)

87
setosa misclassification error =  2.5
versicolor misclassification error =  38.23529411764706
virginica misclassification error =  12.903225806451616
total misclassification error =  17.14285714285714
105


In [708]:
actual = pd.DataFrame(expectedOutputTest)
predictedDF = pd.DataFrame(predicted)
correct = pd.DataFrame(correctPredictionCol)
frames = [predictedDF, actual, correct]
modelTest = pd.concat(frames, axis=1)
modelTest.columns = ["Predicted","Predicted", "Predicted", "Actual", "Actual", "Actual", "Correct"]
modelTest

Unnamed: 0,Predicted,Predicted.1,Predicted.2,Actual,Actual.1,Actual.2,Correct
0,0.0,0.0,1.0,0,0,1,1
1,0.0,0.0,1.0,0,0,1,1
2,0.0,0.0,1.0,0,0,1,1
3,0.0,1.0,0.0,0,1,0,1
4,0.0,0.0,1.0,0,1,0,0
5,1.0,0.0,0.0,1,0,0,1
6,1.0,0.0,0.0,1,0,0,1
7,0.0,1.0,0.0,0,1,0,1
8,0.0,1.0,0.0,0,1,0,1
9,0.0,0.0,1.0,0,0,1,1


In [709]:
actualAsClassNumber = [0]*expectedOutputTest.shape[0]
predictedAsClassNumber = [0]*expectedOutputTest.shape[0]
for i in range(expectedOutputTest.shape[0]):
    actualAsClassNumber[i] = expectedOutputTest[i].tolist().index(1)
    predictedAsClassNumber[i] = predicted[i].tolist().index(1.0)
confusion_matrix(actualAsClassNumber, predictedAsClassNumber)

array([[39,  1,  0],
       [ 0, 21, 13],
       [ 0,  4, 27]])