# Project1 - 多元线性回归

本部分分别使用梯度下降和最小二乘法实现多元线性回归，以 RMSE 和 R² 作为评估指标，并与 sklearn 的结果进行对比。所用数据集为 CCPP（Combined Cycle Power Plant，联合循环发电厂）数据集。

In [9]:
import numpy as np
import pandas as pd

# 用于对比sklearn结果所需的库
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
# Standardize the data (z-score)
def normalized(data):
    """Return ``data`` shifted by its mean and scaled by its standard deviation.

    The statistics come from ``data.mean()`` and ``data.std()``, so their
    meaning depends on the container: a pandas DataFrame is standardized
    column by column using the sample standard deviation (ddof=1), while a
    NumPy array uses a single global mean and population standard deviation
    (ddof=0).

    Args:
        data: Object supporting ``.mean()``, ``.std()`` and arithmetic
            broadcasting (the notebook passes a pandas DataFrame).

    Returns:
        An object of the same shape as ``data`` holding the standardized values.
    """
    centered = data - data.mean()
    spread = data.std()
    return centered / spread

In [11]:
# Split the data into a training set and a test set
def split_data(data, m=7000, f=4):
    """Split a 2-D array into train/test design matrices and target vectors.

    The first ``f`` columns are taken as features and the last column as the
    target. Rows are split in their existing order (no shuffling): the first
    ``m`` rows form the training set and all remaining rows form the test set.
    A leading column of ones is prepended to both feature matrices so that the
    intercept is learned as an ordinary coefficient.

    Args:
        data: 2-D NumPy array with at least ``f`` columns; the last column is
            the regression target.
        m: Number of leading rows used for training. Defaults to 7000, the
            value previously hard-coded here.
        f: Number of leading columns used as features. Defaults to 4, the
            value previously hard-coded here.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` arrays
        have ``f + 1`` columns (ones column first).
    """
    X = data[:, :f]
    y = data[:, -1]

    X_train = X[:m]
    X_test = X[m:]

    # Prepend the intercept (bias) column of ones to each design matrix.
    X_train = np.c_[np.ones(len(X_train), dtype='int64'), X_train]
    X_test = np.c_[np.ones(len(X_test), dtype='int64'), X_test]

    y_train = y[:m]
    y_test = y[m:]

    return X_train, X_test, y_train, y_test

In [12]:
# Gradient-descent hyperparameters: learning rate and number of iterations
alpha = 5e-3
iters = 2_000

以下部分分别用了梯度下降和最小二乘法两种方式训练模型。

In [13]:
# Gradient descent
def gradient_descent(X, Y, B, iterations, alpha):
    """Fit linear-regression coefficients with batch gradient descent.

    Each iteration computes the residuals ``X.dot(B) - Y``, averages the
    gradient ``X.T.dot(residuals)`` over the number of rows in ``X`` and moves
    the coefficients one step of size ``alpha`` against it.

    Args:
        X: Design matrix (rows are samples).
        Y: Target values, one per row of ``X``.
        B: Initial coefficient vector; it is not modified in place.
        iterations: Number of update steps to perform.
        alpha: Learning rate (step size).

    Returns:
        The coefficient vector after ``iterations`` updates.
    """
    n_samples = len(X)
    weights = B

    for _ in range(iterations):
        # Prediction error of the current coefficients on every sample.
        residuals = X.dot(weights) - Y
        # Mean gradient of the squared-error cost, then one descent step.
        weights = weights - alpha * (X.T.dot(residuals) / n_samples)

    return weights

In [14]:
# Ordinary least squares
def least_square(X, Y):
    """Return the least-squares coefficients ``B`` minimizing ``||X.dot(B) - Y||``.

    Uses ``np.linalg.lstsq`` instead of forming ``inv(X.T @ X)`` explicitly.
    For a full-rank ``X`` the result equals the normal-equation solution, but
    it is numerically more stable, and when the columns of ``X`` are collinear
    it returns the minimum-norm solution instead of raising ``LinAlgError``.

    Args:
        X: 2-D design matrix (rows are samples).
        Y: Target values, one per row of ``X``.

    Returns:
        Coefficient array with one entry per column of ``X``.
    """
    # lstsq also returns residuals, rank and singular values; only the
    # coefficients are needed here.
    coefficients = np.linalg.lstsq(X, Y, rcond=None)[0]
    return coefficients

### 运行

分别使用上述两种方式在训练集上训练模型，最后再使用 sklearn 的 LinearRegression 进行训练，以便对比。

In [15]:
# Run
# Load the dataset
data = pd.read_excel('Folds5x2_pp.xlsx')

# Gradient descent: standardize, split, then fit starting from all-zero coefficients
gradient_data = normalized(data)
(
    gradient_X_train,
    gradient_X_test,
    gradient_y_train,
    gradient_y_test,
) = split_data(gradient_data.values)
gradient_B = np.zeros(gradient_X_train.shape[1])
gradient_newB = gradient_descent(
    gradient_X_train,
    gradient_y_train,
    gradient_B,
    iters,
    alpha,
)

# Least squares: same preprocessing, closed-form fit
least_data = normalized(data)
(
    least_X_train,
    least_X_test,
    least_y_train,
    least_y_test,
) = split_data(least_data.values)
least_B = np.zeros(least_X_train.shape[1])
least_newB = least_square(least_X_train, least_y_train)

# sklearn reference: StandardScaler for preprocessing, LinearRegression for the fit
sc = StandardScaler()
sk_data = sc.fit_transform(data)
(
    sk_X_train,
    sk_X_test,
    sk_y_train,
    sk_y_test,
) = split_data(sk_data)
regression = LinearRegression()
regression.fit(sk_X_train, sk_y_train)

LinearRegression()

In [16]:
# RMSE
def RMSE(X, y, B):
    """Return the root-mean-square error of the linear model ``X.dot(B)`` against ``y``.

    Computed as ``sqrt(mean((X.dot(B) - y) ** 2))``. The previous version
    divided the squared-error sum by ``2 * len(X)`` (the half-MSE cost used in
    gradient-descent derivations), which under-reported the RMSE by a factor
    of sqrt(2) and made it incomparable with ``sqrt(mean_squared_error(...))``.

    Args:
        X: Design matrix (rows are samples).
        y: True target values, one per row of ``X``.
        B: Coefficient vector.

    Returns:
        The RMSE as a scalar.
    """
    residuals = X.dot(B) - y
    return np.sqrt(np.sum(residuals ** 2) / len(X))

# R^2 (coefficient of determination)
def r_square(y_pred, y):
    """Return the coefficient of determination of ``y_pred`` against ``y``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    prediction errors and ``SS_tot`` is the sum of squared deviations of ``y``
    from its own mean.

    Args:
        y_pred: Predicted target values.
        y: True target values (must provide ``.mean()``).

    Returns:
        The R^2 score.
    """
    total_ss = np.sum((y - y.mean()) ** 2)
    residual_ss = np.sum((y_pred - y) ** 2)
    return 1 - (residual_ss / total_ss)

### 评估

分别对梯度下降、最小二乘法和 sklearn 三种方式进行评估，评估指标为 RMSE 和 R² 两种。

In [None]:
# Predictions and evaluation on the test set
# Gradient descent
gradient_y_pred = gradient_X_test.dot(gradient_newB)
gradient_rmse = RMSE(gradient_X_test, gradient_y_test, gradient_newB)
print("梯度下降RMSE：%s" % gradient_rmse)
gradient_r2 = r_square(gradient_y_pred, gradient_y_test)
print("梯度下降r2：%s" % gradient_r2)

# Least squares
least_y_pred = least_X_test.dot(least_newB)
least_rmse = RMSE(least_X_test, least_y_test, least_newB)
print("最小二乘法RMSE：%s" % least_rmse)
least_r2 = r_square(least_y_pred, least_y_test)
print("最小二乘法r2：%s" % least_r2)

# sklearn reference model, scored with sklearn's own metrics
sk_y_pred = regression.predict(sk_X_test)
sk_rmse = np.sqrt(mean_squared_error(sk_y_test, sk_y_pred))
print("sklearn的RMSE：%s" % sk_rmse)
sk_r2 = r2_score(sk_y_test, sk_y_pred)
print("sklearn的r2：%s" % sk_r2)
