In [77]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

def hypothesis(X, thetas):
    return np.dot(X, thetas.T)

def cost_function(X, fx, h, thetas):
    soma = 0.
    N = len(X)
    
    for i in range(N):
        soma += (h(X[i], thetas) - fx[i]) ** 2.
    
    return (1./(2. * float(N))) * soma

def update_t0(X, fx, h, thetas, alpha):
    """
    Atualiza t0 com base nos N valores passados para esta função.
    """
    N = len(X)
    soma = 0.
    
    for i in range(N):
        soma += (h(X[i], thetas) - fx[i])
    
    return thetas[0][0] - ((alpha * (1./float(N))) * soma)


def update_t1(X, fx, h, thetas, alpha):
    """
    Atualiza t1 com base nos N valores passados para esta função.
    """
    N = len(X)
    
    soma = 0.
    for i in range(N):
        soma += (h(X[i], thetas) - fx[i]) * X[i]
    
    return thetas[0][1] - ((alpha * (1./float(N))) * soma)

def update_t2(X, fx, h, thetas, alpha):
    """
    Atualiza t1 com base nos N valores passados para esta função.
    """
    N = len(X)
    
    soma = 0.
    for i in range(N):
        soma += (h(X[i], thetas) - fx[i]) * X[i]
    
    return thetas[0][2] - ((alpha * (1./float(N))) * soma)

def main():
    dataset = pd.read_csv('ex1data2.csv', nrows=None, header=0, index_col=None)
    x0 = np.ones(dataset.shape[0]).reshape(dataset.shape[0], 1)  # define vetor-coluna x0
    
    dataset = pd.DataFrame(np.hstack((x0, dataset.values)))  # adiciona x0 ao começo da matriz de instâncias
        
    trainset, testset, = train_test_split(dataset, test_size=0.5)
    
    thetas = np.array([50000., 120., 6000.]).reshape(1,-1)
    
#     t0 = 0.5  # theta 0
#     t1 = 0.  # theta 1
#     t2 = 1.
    alpha = 0.0001 # taxa de aprendizado
#     X, fx = np.array([0, 1, 2, 3], dtype=float), np.array([1, 2, 4, 9], dtype=float)  # X e fx
    X, fx = trainset.iloc[:, 0:-1].values, trainset.iloc[:, -1].values
    threshold = 0.001  # diferença aceitável entre custos
    batch_size = 8  # tamanho do batch
    epoch = 0
    max_epoch = 10  # máximo número de iterações permitido
    
    prev = np.inf  # custo anterior
    curr = cost_function(X, fx, hypothesis, thetas)  # custo atual
#     print 'custo na época %d: %f' % (0, curr)
    while (abs(curr - prev) > threshold) and (epoch < max_epoch):
        bc = 0  # contador de quantas instâncias passaram pelo batch

        for i in range(batch_size):
            X_local = X[bc:(bc + batch_size)]
            fx_local = fx[bc:(bc + batch_size)]

            temp0 = update_t0(X_local, fx_local, hypothesis, thetas, alpha)
#             print temp0 
            temp1 = update_t1(X_local, fx_local, hypothesis, thetas, alpha)
#             print temp1
            temp2 = update_t2(X_local, fx_local, hypothesis, thetas, alpha)
#             print temp2

            thetas[0][0] = temp0[0]
            thetas[0][1] = temp1[0]
            thetas[0][2] = temp2[0]

            bc += 1
            
#     plt.plot(t1s, [cost_function(X, fx, hypothesis, a, b) for a, b in zip(t0s, t1s)], c='green')
#     plt.xlabel(r'$\theta_1$')
#     plt.ylabel(r'$J(\theta_1)$')
#     plt.title(u'Custo associado ao parâmetro ' + r'$\theta_1$')
#     plt.show()

        prev = curr
        curr = cost_function(X, fx, hypothesis, thetas)
        print 'custo na época %d: %f' % (epoch, curr)
        epoch += 1
    print 't0:', thetas[0][0]
    print 't1:', thetas[0][1]
    print 't2:', thetas[0][2]
    
main()

custo na época 0: 2435056730.536877
custo na época 1: 2428191455.292618
custo na época 2: 2428751828.188519
custo na época 3: 2428893055.979080
custo na época 4: 2428917974.356231
custo na época 5: 2428922173.599515
custo na época 6: 2428922876.042500
custo na época 7: 2428922993.401924
custo na época 8: 2428923013.005527
custo na época 9: 2428923016.279982
t0: 50013.8866001
t1: 133.886600144
t2: 6013.88660014


In [72]:
from sklearn import linear_model

dataset = pd.read_csv('ex1data2.csv', nrows=None, header=0, index_col=None)
trainset, testset, = train_test_split(dataset, test_size=0.5)

clf = linear_model.LinearRegression(normalize=True)
clf.fit(trainset.iloc[:, 0:-1], trainset.iloc[:, -1])
predictions = clf.predict(testset.iloc[:, 0:-1])
print predictions
score = clf.score(testset.iloc[:, 0:-1], testset.iloc[:, -1])
print score #best 1.0
print clf.coef_
print clf.intercept_

[ 381882.56548757  392999.13838311  449243.21665152  242969.38941946
  229549.71849138  318123.11469191  292460.65948558  398090.19719189
  322451.60917575  340270.31500647  366609.38906124  229393.23259605
  252424.21292148  241186.42434002  343906.78558417  342366.25187657
  329118.47190153  306329.92294795  269636.84032259  455945.34958671
  289879.85987178  445485.5303879   304789.38924035]
0.437979521869
[  121.21568592  6146.72977268]
57251.5535273
