# What are we doing?
Predict who survived (1) or died (0) based on input information.

Raw data is also availble on wikipedia.

In [1]:
import numpy as np
import pandas as pd
from random import shuffle, randint

from sklearn.cross_validation import train_test_split



Getting errors with sklearn? 

Scikit-learn requires:
<li>Python (>= 2.6 or >= 3.3),
<li>NumPy (>= 1.6.1),
<li>SciPy (>= 0.9).

Try:
conda install scikit-learn
OR
pip install -U scikit-learn

Still got probs?

python -m pip install --upgrade pip
pip install --user numpy scipy matplotlib ipython jupyter pandas sympy nose

In [2]:
#read in csv file
data = pd.read_csv("iris.csv", header=None)
trainPercent = 50 #must be between 0 and 100
lmbda = .00001
numberRowsOfEachClass = 50
data.head() #look at first 5 rows of the data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Useful functions
<li>date_frame.head()
<li>data.columns
<li>data_frame.shape


In [3]:
#Grab desired inputs to test one. Build a dataframe from them.
dataInputs= data.loc[:,0:3]
dataInputs.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
#Grab the expected outputs. (This is supervised learning)
y1=[]
y2=[]
y3=[]
for i in range(0,numberRowsOfEachClass):
    y1.append([1,0,0]);#setosa
    y2.append([0,1,0]);#versicolor
    y3.append([0,0,1]);#virginica
expectedOutput=np.concatenate([y1,y2,y3])


# Breakdown the input data into test and train

In [5]:
#Hint: use train_test_split
inputTrain, inputTest, expectedOutputTrain, expectedOutputTest = train_test_split(dataInputs, expectedOutput, train_size = trainPercent/100)

#inputTest.head()
print(inputTrain.shape)
print(expectedOutputTrain.shape)
print(inputTest.shape)
print(expectedOutputTest.shape)

(75, 4)
(75, 3)
(75, 4)
(75, 3)


# Get weight matrix

In [6]:
#Build our least squares classifier for 2 classes
D = inputTrain.shape[1] + 1 #num of attributes, +1 is for the intercept (column of 1s)
K = expectedOutput.shape[1]

In [7]:
expectedOutputTrain = np.asarray(expectedOutputTrain)
expectedOutputTest = np.asarray(expectedOutputTest)
inputTrain = np.asarray(inputTrain)
inputTest = np.asarray(inputTest)
p = np.append(1, inputTrain[0])
p.shape
p.reshape(1,D)

array([[ 1. ,  5.6,  3. ,  4.5,  1.5]])

In [8]:
sum1 = 0
sum2 = 0
numRows = inputTrain.shape[0]
new_col = np.ones((numRows,1))
x = np.c_[new_col, inputTrain]
sum1 = np.dot(x.T, x) + lmbda
sum2 = np.dot(x.T, expectedOutputTrain)
print(sum1)
print(sum2)

[[   75.00001   442.30001   227.20001   288.00001    93.30001]
 [  442.30001  2663.29001  1334.52001  1797.16001   590.88001]
 [  227.20001  1334.52001   700.32001   848.08001   273.67001]
 [  288.00001  1797.16001   848.08001  1327.38001   450.94001]
 [   93.30001   590.88001   273.67001   450.94001   158.31001]]
[[  23.    25.    27. ]
 [ 113.6  149.3  179.4]
 [  78.3   70.2   78.7]
 [  33.8  105.2  149. ]
 [   5.9   32.6   54.8]]


In [9]:
# i=0
# sum1=0
# sum2=0
# for xi in inputTrain:
#     xi = np.append(1, xi)
#     xi = xi.reshape(1,D)
#     yi=expectedOutputTrain[i].reshape(1,K)          
#     sum1 += np.dot(xi, xi.T) + lmbda   
#     sum2 += np.dot(xi.T, yi)
#     i += 1
# sum1 = np.asscalar(sum1)
# print(sum1)
# print(sum2)

In [10]:
W= np.dot(np.linalg.inv(sum1), sum2)
#W = np.array(W)[np.newaxis]
W

array([[ 0.06520306,  1.48459698, -0.54981684],
       [ 0.0879039 , -0.0435849 , -0.04431655],
       [ 0.23334987, -0.38461488,  0.15126643],
       [-0.24423751,  0.24847705, -0.00423918],
       [-0.03694333, -0.54923977,  0.58618033]])

In [11]:
#testing
total = expectedOutputTest.shape[0]
predicted = np.zeros((total, K))
i=0
numCorrect=0
correctPredictionCol = [0]*total

In [12]:
for i in range(total):
    x = inputTest[i]    
    x = np.append(1,x)
    x = x.reshape(1,D)
    values = np.dot(W.T,x.T)
    values = values.T
    values = values.flatten()
    maxIndex = np.argmax(values)
    if expectedOutputTest[i][maxIndex] == 1:
        numCorrect += 1     
        correctPredictionCol[i] = 1
    predicted[i][maxIndex] = 1
print(numCorrect)
accuracy=numCorrect/float(total)*100
print(accuracy)
print(total)

65
86.66666666666667
75


In [13]:
actual = pd.DataFrame(expectedOutputTest)
predictedDF = pd.DataFrame(predicted)
correct = pd.DataFrame(correctPredictionCol)
frames = [predictedDF, actual, correct]
modelTest = pd.concat(frames, axis=1)
modelTest.columns = ["Predicted","Predicted", "Predicted", "Actual", "Actual", "Actual", "Correct"]
modelTest

Unnamed: 0,Predicted,Predicted.1,Predicted.2,Actual,Actual.1,Actual.2,Correct
0,0.0,0.0,1.0,0,0,1,1
1,1.0,0.0,0.0,1,0,0,1
2,0.0,1.0,0.0,0,1,0,1
3,1.0,0.0,0.0,1,0,0,1
4,1.0,0.0,0.0,1,0,0,1
5,0.0,1.0,0.0,0,1,0,1
6,0.0,0.0,1.0,0,0,1,1
7,1.0,0.0,0.0,1,0,0,1
8,0.0,1.0,0.0,0,1,0,1
9,1.0,0.0,0.0,1,0,0,1
