In [3]:
from numpy import *
import matplotlib.pyplot as plt



In [4]:
def sigmoid(inX):
    return 1.0/(1 + exp(-inX))

In [20]:
def trainLogReg(xtrain, ytrain, param):
    numSample, numFeature = shape(xtrain)
    alpha = params['alpha']
    maxIter = params['maxIter']
    weights = ones((numFeature, 1))
    
    for i in range(maxIter):
        if param['optimizeType'] == 'gradDescent': #gradient descent
            output = sigmoid(xtrain * weights)
            error = ytrain - output
            weights = weights + alpha * xtrain.transpose() * error
        elif param['optimizeType'] == 'stocGradDescent': #stochastic gradient descent
            for j in range(numSample):
                output = sigmoid(xtrain[j,:] * weights)
                error = ytrain[j] - output
                weights = weights + alpha * xtrain[j, :].transpose() * error
        elif param['optimizeType'] == 'smoothStocGradDescent': #smooth stochastic gradient descent
            dataIndex = range(numSample)
            for j in range(numSample):
                alpha = 4.0/(1.0 + i + j) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                print weights
                output = sigmoid(xtrain[randIndex, :] * weights)
                error = ytrain[randIndex] - output
                weights = weights + alpha * xtrain[randIndex,:].transpose() * error
                del(dataIndex[randIndex]) #after one iteration, delete the sample
        else:
            raise NameError('Optimize type not valid!')
            
    return weights

In [16]:
def testLogReg(weights, xtest, ytest):
    numSample, numFeature = shape(xtest)
    correct = 0
    for i in xrange(numSample):
        predict = sigmoid(xtest[i,:] * weights) > 0.5
        if predict == ytest[i]:
            correct += 1
    accuracy = float(correct) / numSample
    return accuracy

In [17]:
def plotLogReg(weights, xtrain, ytrain):
    numSample, numFeature = shape(xtrain)
    #only works for 2D array
    #draw all dots
    for i in range(numSample):
        if (ytrain[i] == 0):
            plt.plot(xtrain[i, 1], xtrain[i, 2], 'or')
        elif (ytrain[i] == 1):
            plt.plot(xtrain[i, 1], xtrain[i, 2], 'ob')
            
    min_x = min(xtrain[:, 1])
    max_x = max(xtrain[:, 1])
    weights = weights.getA() #convert mat to array
    y_min = float(-weights[0] - weights[1] * min_x) / weights[2]
    y_max = float(-weights[0] - weights[1] * max_x) / weights[2]
    plt.plot([min_x, max_x], [y_min, y_max], '-g')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

In [22]:
#Load data
print "Loading data..."
xdata = []
ydata = []
readFile = open('watermellon.txt')
for row in readFile.readlines():
    rowArray = row.strip().split()
    xdata.append([1.0, float(rowArray[0]), float(rowArray[1])])
    ydata.append(float(rowArray[2]))

trainx = mat(xtrain)
print trainx
trainy = mat(ytrain).transpose()
print trainy

testx = trainx
testy = trainy

print "Train the model..."
params = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}
trainedWeights = trainLogReg(trainx, trainy, params)


print "Test the model..."
accuracy = testLogReg(trainedWeights, testx, testy)
print "Accuracy = ", accuracy

print "Make plots..."
plotLogReg(trainedWeights, trainx, trainy)


Loading data...
[['1.0' '0.697' '0.460']
 ['1.0' '0.774' '0.376']
 ['1.0' '0.634' '0.264']
 ['1.0' '0.608' '0.318']
 ['1.0' '0.556' '0.215']
 ['1.0' '0.403' '0.237']
 ['1.0' '0.481' '0.149']
 ['1.0' '0.437' '0.211']
 ['1.0' '0.666' '0.091']
 ['1.0' '0.243' '0.267']
 ['1.0' '0.245' '0.057']
 ['1.0' '0.343' '0.099']
 ['1.0' '0.639' '0.161']
 ['1.0' '0.657' '0.198']
 ['1.0' '0.360' '0.370']
 ['1.0' '0.593' '0.042']
 ['1.0' '0.719' '0.103']]
[['1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0']]
Train the model...
[[ 1.]
 [ 1.]
 [ 1.]]


TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'