# Data Normalization

In [1]:
import numpy as np
from sklearn import datasets

In [3]:
# Load the Boston housing data bundled with scikit-learn.
# NOTE(review): load_boston is not available in recent scikit-learn releases;
# this cell needs an older scikit-learn — confirm the environment's version.
boston = datasets.load_boston()

# Keep only the samples whose target is strictly below 50.0 (presumably the
# dataset's capped values — confirm). The mask is computed once from the
# unfiltered targets and applied to both arrays so they stay aligned.
keep = boston.target < 50.0
X = boston.data[keep]
y = boston.target[keep]

In [4]:
from LR.model_selection import train_test_split

# Partition the features and targets into train/test sets with the
# project-local helper; seed=333 presumably makes the split repeatable —
# confirm against LR.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=333)

In [6]:
from LR.LinearRegression import LinearRegression

# Baseline: fit with the normal-equation solver, time the fit, and display
# the score on the held-out test set as the cell's output.
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
lin_reg1.score(X_test, y_test)

Wall time: 241 ms


0.8704428049515445

### 1. 使用梯度下降法

In [7]:
# Fit the same model by gradient descent with default hyperparameters on the
# raw (unscaled) features. The recorded output shows runtime warnings and a
# score of nan.
lin_reg2 = LinearRegression()
%time lin_reg2.fit_gd(X_train, y_train)
lin_reg2.score(X_test, y_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):


Wall time: 746 ms


nan

In [8]:
# Inspect the fitted coefficients (every entry is nan in the recorded output).
lin_reg2.coef_

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [9]:
# The default eta is too large for this data, so gradient descent did not
# converge and the parameters overflowed to nan.

In [15]:
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001)
# With a much smaller eta the computation no longer overflows.

Wall time: 660 ms


LinearRegression()

In [16]:
lin_reg2.score(X_test, y_test)
# The score is too low. The hypothesis is that n_iters is too small: with the
# default number of iterations the best parameters have not been reached yet.

0.3575012170949491

In [17]:
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)
# The result improves a lot, but the fit takes far too long; the remedy is to
# normalize the data.

Wall time: 1min 1s


LinearRegression()

In [18]:
# Test-set score after the long run with the small eta and n_iters=1e6.
lin_reg2.score(X_test, y_test)

0.8127098474176682

### 2. 数据归一化

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Create the scaler with its default settings; it is fitted separately below.
standardScaler = StandardScaler()

In [21]:
# Fit the scaler on the training features only (the test set is not used here).
standardScaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [25]:
# Apply the scaler fitted on X_train to both the training and the test
# features, so both are transformed with the same fitted scaler.
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)

In [26]:
# Gradient descent with default hyperparameters again, this time on the
# standardized training features.
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)

Wall time: 240 ms


LinearRegression()

In [27]:
lin_reg3.score(X_test_standard, y_test)
# After preprocessing, the model's score is almost identical to the
# normal-equation result, and the fit is fast.

0.8704587151914651

### 3. 梯度下降法的优势

In [29]:
# Generate a random synthetic regression problem with more features than
# samples (m = 1000 rows, n = 5000 columns) for the solver comparison below.
m = 1000
n = 5000

# Seed NumPy's global generator so the synthetic data, and every result
# derived from it, is reproducible on Restart & Run All.
np.random.seed(666)

big_X = np.random.normal(size=(m, n))
# true_theta[0] is the intercept; true_theta[1:] are the per-feature coefficients.
true_theta = np.random.uniform(0.0, 100.0, size=n+1)
# Linear response plus Gaussian noise with standard deviation 10.
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)

In [30]:
# Normal equation: time a fit on the wide synthetic data set.
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)

Wall time: 12.7 s


LinearRegression()

In [31]:
# Gradient descent: time a fit on the same data with default hyperparameters.
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)

Wall time: 5.12 s


LinearRegression()

- 当前数据中，样本量少于属性数量，这是因为在梯度下降算法过程中，每一次计算梯度都要用到所有样本
- 如果样本量太大，每一次梯度的计算也会比较耗时
- 解决方法是使用**随机梯度下降法**