In [272]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#jt -t oceans16 -fs 15 -nfs 15 -ofs 10 -dfs 10 -cellw 1500 -T -N


In [273]:
# Load the dataset from Datas/Data-Cleaning-2.csv under the current working directory.
data_path = os.path.join(os.getcwd(), 'Datas', 'Data-Cleaning-2.csv')
data = pd.read_csv(data_path)
# NOTE(review): the preview shows an 'Unnamed: 0' column (0, 1, 2, ...), which looks
# like a row index that was written into the CSV. Confirm whether the file should be
# read with index_col=0 instead of keeping that column as a model feature.
data.head()

Unnamed: 0,symboling,normalized-losses,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,highway-mpg,volume,...,num-of-cylinders_three,num-of-cylinders_twelve,fuel-system_1bbl,fuel-system_2bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi,price
0,1.78685,1.336575,0.045215,0.513027,-1.808186,-0.288273,0.198569,-0.213359,-0.557058,-1.168294,...,0,0,0,0,0,0,1,0,0,13495.0
1,1.78685,1.336575,0.045215,0.513027,-1.808186,-0.288273,0.198569,-0.213359,-0.557058,-1.168294,...,0,0,0,0,0,0,1,0,0,16500.0
2,0.16397,0.16715,0.575559,-2.394827,0.702918,-0.288273,1.334283,-0.213359,-0.704134,-0.422041,...,0,0,0,0,0,0,1,0,0,16500.0
3,0.97541,1.200962,-0.461021,-0.517605,0.480415,-0.036204,-0.039139,0.856208,-0.115832,0.169527,...,0,0,0,0,0,0,1,0,0,13950.0
4,0.97541,1.200962,0.189854,-0.517605,0.480415,-0.540341,0.304217,0.856208,-1.292436,0.193551,...,0,0,0,0,0,0,1,0,0,17450.0


In [274]:
# Separate the predictors from the regression target: every column except
# 'price' is a feature, and 'price' is what the model has to predict.
features = data.drop(columns='price')
target = data['price']

In [275]:
# Split the data here: 30% of the rows are held out for testing, and the fixed
# seed makes the split reproducible from run to run.
seed = 123
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=seed,
)


然后就要开始做梯度的公式了，先看一下目标函数:<br>
X代表一个数据矩阵，每一行代表一个数据点，用$x^a$表示a代表第几行，$x^a_b$代表第a行数据的第b个特征<br>
M代表数据的行数，j代表特征的个数，有多少个特征就有多少个参数<br>
$\theta$是参数，他是一个行向量,$\theta^T$就代表列向量,$\theta_a$代表$\theta$的第a个参数<br>
<br>
$J_{(\theta)}=\frac{1}{2M}\bigg\{\left[(x^0_0*\theta_0+x^0_1*\theta_1+x^0_2*\theta_2+....+x^0_j*\theta_j)-y^0\right]^2+\left[(x^1_0*\theta_0+x^1_1*\theta_1+x^1_2*\theta_2+....+x^1_j*\theta_j)-y^1\right]^2\\\quad
+\left[(x^2_0*\theta_0+x^2_1*\theta_1+x^2_2*\theta_2+....+x^2_j*\theta_j)-y^2\right]^2+......+\left[(x^m_0*\theta_0+x^m_1*\theta_1+x^m_2*\theta_2+....+x^m_j*\theta_j)-y^m\right]^2\bigg\}\\\quad=\frac{1}{2M}\left[(x^0*\theta^T-y^0)^2+(x^1*\theta^T-y^1)^2+(x^2*\theta^T-y^2)^2+......+(x^m*\theta^T-y^m)^2\right]\\\quad
=\frac{1}{2M}\sum\limits_{i=0}^{m}\big[(x^i*\theta^T)-y^i\big]^2$<br>（这里$m=M-1$，是最后一行数据的下标）<br><br>
然后就要开始求梯度了，梯度是由各个偏导数组成的向量，它指向函数值增长最快的方向，所以要沿着梯度的反方向更新参数才能让$J(\theta)$变小<br>
$\frac{\partial J(\theta)}{\partial \theta_j}=\frac{1}{M}\sum\limits_{i=0}^{M-1}\big(x^i*\theta^T-y^i\big)x^i_j$<br>
上面这一行是单个$\theta_j$的偏导，每一次修改$\theta$的值的时候，是需要所有参数一起修改的，所以每一个 $\theta_j$ 都要改<br>
$\theta_j:=\theta_j - \alpha\cdot\frac{1}{M}\sum\limits_{i=0}^{M-1}\big(x^i*\theta^T-y^i\big)x^i_j$，其中$\alpha$是学习率（对应下面代码里的rate参数）<br>


In [276]:
# Start with one weight per feature column, all set to 1, stored as a column
# vector so that x.dot(theta) yields one prediction per row.
n_features = features.shape[1]
theta = np.ones((n_features, 1))
theta.shape

(66, 1)

In [277]:
# Sanity check: the number of columns here has to match the number of rows in theta.
x_train.shape

(135, 66)

每一个 $\theta_j$ 的偏导都要用更新之前的同一个 $\theta$ 来计算，全部算完之后再一起更新

In [278]:
def gradient_descent(rate,theta,x_train,y_train):
    """Run one batch gradient-descent step for least-squares linear regression.

    All parameters are updated simultaneously from the same pre-update
    ``theta``::

        theta <- theta - rate * (1/M) * X^T (X . theta - y)

    The residual ``X . theta - y`` is computed once and a single matrix
    product yields every partial derivative, instead of recomputing the
    predictions once per parameter.

    Parameters
    ----------
    rate : float
        Learning rate (step size).
    theta : numpy.ndarray
        Current parameters, one entry (row) per column of ``x_train``.
        It is updated IN PLACE.
    x_train : pandas.DataFrame
        Training features, one row per sample.
    y_train : pandas.Series
        Training targets; matched to the rows of ``x_train`` by index label.

    Returns
    -------
    numpy.ndarray
        The same ``theta`` object that was passed in, after the update.
    """
    m = 1/x_train.shape[0]  # 1/M, where M is the number of training rows
    # Prediction error of every sample under the current (pre-update) theta.
    residual = x_train.dot(theta).sub(y_train,axis=0)
    # residual^T . X holds the error-weighted sum of each feature column, i.e.
    # every partial derivative at once; transpose it back to theta's layout.
    gradient = rate*m*residual.T.dot(x_train).to_numpy().T
    theta -= gradient
    return theta
# theta = gradient_descent(0.01,theta,x_train,y_train)


然后写一个函数计算$J(\theta)$

In [279]:
def cost_function(x_train,y_train,theta):
    """Return the least-squares cost J(theta), truncated to an integer.

    Computes ``(1 / (2M)) * sum((X . theta - y) ** 2)`` where ``M`` is the
    number of rows of ``x_train``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Features, one row per sample.
    y_train : pandas.Series
        Targets; matched to the rows of ``x_train`` by index label.
    theta : numpy.ndarray
        Parameters, one entry (row) per column of ``x_train``.

    Returns
    -------
    int
        The cost with its fractional part dropped by ``int()``.
    """
    m = 1/(x_train.shape[0]*2)  # 1/(2M)
    squared_error = (x_train.dot(theta).sub(y_train,axis=0))**2
    total = squared_error.sum(axis=0)
    # `total` is a one-element Series for a column-vector theta and a scalar for
    # a 1-D theta. Pull out the plain number explicitly: calling int() directly
    # on a one-element Series is deprecated in pandas.
    cost = m*np.asarray(total).item()
    return int(cost)

In [280]:
%%time
cost = 5000
times = 1000000
while times:
    theta = gradient_descent(0.01,theta,x_train,y_train)
    cost = cost_function(x_train,y_train,theta)
    times -= 1
print(theta)
print('cost:',cost,'times:',times)

[[-5.33570779e+02]
 [ 5.14635021e+01]
 [ 3.83820426e+03]
 [ 8.61409421e+01]
 [-5.41970676e+02]
 [-1.05771563e+03]
 [ 1.29830298e+03]
 [ 1.27836475e+03]
 [ 9.77417211e+02]
 [ 1.53111737e+03]
 [ 4.19985518e+03]
 [ 2.10667747e+03]
 [ 3.07415585e+03]
 [ 4.15349050e+03]
 [-2.38462666e+03]
 [-1.95621029e+03]
 [ 5.33837603e+02]
 [ 1.00000000e+00]
 [ 7.05504342e+03]
 [-4.70226297e+02]
 [ 4.84625530e+03]
 [-4.78519879e+03]
 [-1.93108133e+03]
 [-1.95036521e+03]
 [-1.87716870e+03]
 [-2.69498636e+03]
 [ 4.32787167e+03]
 [ 1.95971519e+03]
 [-1.17960207e+03]
 [-1.72310168e+03]
 [-3.95415443e+02]
 [-2.49020897e+03]
 [ 3.21785167e+03]
 [ 9.83003507e+02]
 [ 8.19449833e+02]
 [ 3.38140535e+03]
 [ 2.19327673e+03]
 [ 2.00757845e+03]
 [ 3.91379093e+03]
 [ 1.24016124e+03]
 [ 1.85172012e+02]
 [-1.26135773e+02]
 [-1.00913323e+03]
 [ 1.67625818e+03]
 [ 1.37086302e+02]
 [ 2.38851070e+03]
 [ 1.08557228e+03]
 [ 3.11528290e+03]
 [-7.63582708e+02]
 [ 1.64332840e+03]
 [ 3.14748282e+03]
 [ 1.93468083e+03]
 [-1.7580541

In [359]:
# Signed prediction error on the test split: predicted value minus actual price.
ret = x_test.dot(theta).iloc[:,0]  # take the single prediction column as a Series
ret -= y_test
# Summary statistics of the errors (count, mean, spread and quartiles).
ret.describe()


count      58.000000
mean      431.792765
std      1896.556423
min     -3104.039401
25%      -854.644594
50%       143.575967
75%      1475.380739
max      5757.502563
Name: 0, dtype: float64

In [360]:
# Interactive check of the container type that holds the learned parameters.
type(theta)

numpy.ndarray

In [367]:
# Persist the learned parameters to Datas/theta_1.csv under the current working
# directory; the DataFrame index is not written to the file.
theta_path = os.path.join(os.getcwd(), 'Datas', 'theta_1.csv')
save_theta = pd.DataFrame(theta)
save_theta.to_csv(theta_path, index=False)