In [16]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

def hypothesis(x, t0, t1):
    return t0 + t1 * x

def cost_function(X, fx, h, t0, t1):
    soma = 0.
    N = len(X)
    
    for i in range(N):
        soma += (h(X[i], t0, t1) - fx[i]) ** 2.
    
    return (1./(2. * float(N))) * soma

def update_t0(X, fx, h, t0, t1, alpha):
    """
    Atualiza t0 com base nos N valores passados para esta função.
    """
    
    N = len(X)
    soma = 0.
    
    for i in range(N):
        soma += (h(X[i], t0, t1) - fx[i])
    
    return t0 - ((alpha * (1./float(N))) * soma)


def update_t1(X, fx, h, t0, t1, alpha):
    """
    Atualiza t1 com base nos N valores passados para esta função.
    """
    N = len(X)
    
    soma = 0.
    for i in range(N):
        soma += (h(X[i], t0, t1) - fx[i]) * X[i]
    
    return t1 - ((alpha * (1./float(N))) * soma)

def main():
    dataset = pd.read_csv('ex1data1.csv', nrows=None, header=0, index_col=None)
    trainset, testset, = train_test_split(dataset, test_size=0.5)
    
    t0 = 0.5  # theta 0
    t1 = 0.  # theta 1
    alpha = 0.01  # taxa de aprendizado
#     X, fx = np.array([0, 1, 2, 3], dtype=float), np.array([1, 2, 4, 9], dtype=float)  # X e fx
    X, fx = trainset['populacao'].values, trainset['lucro'].values
    threshold = 0.001  # diferença aceitável entre custos
    batch_size = 8  # tamanho do batch
    epoch = 0
    max_epoch = 10  # máximo número de iterações permitido
    
    prev = np.inf  # custo anterior
    curr = cost_function(X, fx, hypothesis, t0, t1)  # custo atual
    while (abs(curr - prev) > threshold) and (epoch < max_epoch):
        bc = 0  # contador de quantas instâncias passaram pelo batch

        for i in range(batch_size):
            X_local = X[bc:(bc + batch_size)]
            fx_local = fx[bc:(bc + batch_size)]

            temp0 = update_t0(X_local, fx_local, hypothesis, t0, t1, alpha)
            temp1 = update_t1(X_local, fx_local, hypothesis, t0, t1, alpha)

            t0 = temp0
            t1 = temp1

            bc += 1
            
#     plt.plot(t1s, [cost_function(X, fx, hypothesis, a, b) for a, b in zip(t0s, t1s)], c='green')
#     plt.xlabel(r'$\theta_1$')
#     plt.ylabel(r'$J(\theta_1)$')
#     plt.title(u'Custo associado ao parâmetro ' + r'$\theta_1$')
#     plt.show()

        prev = curr
        curr = cost_function(X, fx, hypothesis, t0, t1)
        print 'custo na época %d: %f' % (epoch, curr)
        epoch += 1
    print 't0:', t0
    print 't1:', t1
    
main()

custo na época 0: 7.334416
custo na época 1: 7.282304
custo na época 2: 7.231723
custo na época 3: 7.182626
custo na época 4: 7.134965
custo na época 5: 7.088693
custo na época 6: 7.043768
custo na época 7: 7.000145
custo na época 8: 6.957784
custo na época 9: 6.916646
t0: 0.20105876428812047
t1: 0.9563867171890845


In [36]:
from sklearn import linear_model

dataset = pd.read_csv('ex1data1.csv', nrows=None, header=0, index_col=None)
trainset, testset, = train_test_split(dataset, test_size=0.1)

clf = linear_model.LinearRegression(normalize=True)
clf.fit(trainset.iloc[:, 0:-1], trainset.iloc[:, -1])
predictions = clf.predict(testset.iloc[:, 0:-1])
print predictions
score = clf.score(testset.iloc[:, 0:-1], testset.iloc[:, -1])
print score #best 1.0

[ 3.45236022 20.3067826   4.09721789  9.30557461  3.32218379 20.39065307
 10.18326145 12.18434019  2.48052581  2.71689881]
0.9062289380302906
