In [1]:
#!python 3.6
import os
import numpy as np
import pandas as pd
from pub_func import read_raw_data

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

### 1. 训练集预处理

In [2]:
# Load the raw training CSV through the project helper `read_raw_data`.
# The recorded run below reports a resulting table of shape (5652, 163).
# NOTE(review): presumably `hour_range=9` is the number of consecutive hours
# per sample (columns are suffixed _0.._8) — confirm in pub_func.
training_set = read_raw_data(
    f_path='./given/train.csv',
    f_type='train',
    hour_range=9,
    hours_each_day=24,
)

This is training set...
Start to convert raw data to each hour x raw features...
Start to create real feature...
The shape of data table:  (5652, 163)


In [3]:
# Preview the first rows of the training table; per the recorded output the
# columns are per-hour features suffixed _0.._8 followed by the target `y`.
training_set.head()

Unnamed: 0,AMB_TEMP_0,CH4_0,CO_0,NMHC_0,NO_0,NO2_0,NOx_0,O3_0,PM10_0,PM2.5_0,...,PM2.5_8,RAINFALL_8,RH_8,SO2_8,THC_8,WD_HR_8,WIND_DIREC_8,WIND_SPEED_8,WS_HR_8,y
0,14.0,1.8,0.51,0.2,0.9,16.0,17.0,16.0,56.0,26.0,...,19.0,0.0,66.0,5.1,2.1,124.0,232.0,0.6,0.5,30.0
1,14.0,1.8,0.41,0.15,0.6,9.2,9.8,30.0,50.0,39.0,...,30.0,0.0,56.0,15.0,2.0,46.0,153.0,0.8,0.3,41.0
2,14.0,1.8,0.39,0.13,0.5,8.2,8.7,27.0,48.0,36.0,...,41.0,0.0,45.0,4.5,2.0,241.0,283.0,1.6,0.8,44.0
3,13.0,1.8,0.37,0.12,1.7,6.9,8.6,23.0,35.0,35.0,...,44.0,0.0,37.0,2.7,2.0,280.0,269.0,1.9,1.2,33.0
4,12.0,1.8,0.35,0.11,1.8,6.8,8.5,24.0,25.0,31.0,...,33.0,0.0,40.0,3.5,1.9,297.0,290.0,2.1,2.0,37.0


#### 测试集

In [None]:
# Load the raw test CSV through the same project helper, in 'test' mode.
# Unlike the training call, `hours_each_day` is 9 here.
# NOTE(review): presumably each test id supplies exactly 9 hours — confirm in pub_func.
test_set = read_raw_data(
    f_path='./given/test_X.csv',
    f_type='test',
    hour_range=9,
    hours_each_day=9,
)

This is test set...
Start to convert raw data to each hour x raw features...


In [None]:
# Preview the first rows of the test table.
test_set.head()

In [None]:
# Cast every column of the test table to float.
# The builtin `float` is used instead of the `np.float` alias: the alias was
# deprecated in NumPy 1.20 and removed in 1.24, while both mean float64, so
# the result is unchanged on older NumPy versions.
test_set = test_set.astype(float)

In [None]:
# Inspect the per-column dtypes of the test table (expected to be float after
# the cast in the previous cell).
test_set.dtypes

In [None]:
# Split the training table into features and target by position: every column
# except the last is a feature, and the last column is the target `y`.
# Slicing with `:-1` avoids hard-coding the feature count (162 for the
# 163-column table loaded earlier) and yields the same columns.
X_train_total = training_set.iloc[:, :-1].copy()
y_train_total = training_set.iloc[:, -1].copy()

In [None]:
# Sanity-check the shapes of the feature matrix and the target vector.
X_train_total.shape, y_train_total.shape

In [None]:
# Preview the first rows of the feature matrix.
X_train_total.head()

In [None]:
# Preview the first values of the target vector.
y_train_total.head()

### 2. ~~将全部训练集划分为训练集和验证集~~
- sklearn 中的 SGDRegressor 在设置 `early_stopping=True` 时，会按 `validation_fraction` 的比例自动从训练集中留出一部分作为验证集，因此这里不需要手动划分

In [None]:
# split to training set and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_total, y_train_total, 
                                                  test_size=0.20, random_state=42)

In [None]:
# Shapes of the two feature partitions produced by the 80% / 20% split.
X_train.shape, X_val.shape

### 3. 数据的缩放（scaling）

In [None]:
# Standardize the features: fit the scaler on the full training feature
# matrix, then wrap the transformed array back into a DataFrame that keeps
# the original row index and column names.
scaler = StandardScaler().fit(X_train_total)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_total),
    index=X_train_total.index,
    columns=X_train_total.columns,
)
X_train_scaled.head()

In [None]:
# Scaling must not change the shape; confirm rows x features are unchanged.
X_train_scaled.shape

In [None]:
# Persist the fitted scaler to a pickle file in the working directory.
joblib.dump(scaler, 'X_dataset_scaler.pkl')

### 4. 正规方程直接求解

In [None]:
# Linear regression with scikit-learn's default settings, solved directly
# (no iterative optimisation), in contrast to the SGD model built later.
lr_model = LinearRegression()

In [None]:
# Fit on the standardized features of the full training set (no hold-out).
lr_model.fit(X_train_scaled, y_train_total)

In [None]:
# Persist the fitted linear model to a pickle file in the working directory.
joblib.dump(lr_model, 'lr_model_solved_by_normal_equation.pkl')

#### 正规方程解出来的参数

In [None]:
# Show the first five coefficients, the coefficient vector's shape, and the
# fitted intercept.
lr_model.coef_[:5], lr_model.coef_.shape, lr_model.intercept_

In [None]:
# In-sample predictions on the same data the model was fitted on; used by the
# training-set diagnostics that follow.
y_train_total_pred = lr_model.predict(X_train_scaled)

#### 在训练集中的表现

In [None]:
# Actual vs. predicted target on the training data for the linear model.
# Points close to the diagonal indicate a good in-sample fit. Axis labels and
# a title are set so the figure can be read without the surrounding code.
plt.scatter(y_train_total, y_train_total_pred)
plt.xlabel('actual y (training set)')
plt.ylabel('predicted y')
plt.title('LinearRegression: actual vs. predicted')
plt.show()

In [None]:
# R^2 of the linear model on the training data itself: this measures
# in-sample fit, not generalization to unseen data.
r2_score(y_pred=y_train_total_pred, y_true=y_train_total)

#### 在测试集上的预测（此处未使用标签，仅生成预测并写出结果文件）

In [None]:
# Take the first 162 columns of the test table as the feature matrix; 162 is
# the same hard-coded feature count used for the training features.
# NOTE(review): assumes the test columns are in the same order as the
# training feature columns — confirm against read_raw_data.
X_test = test_set.iloc[:, range(162)].copy()
X_test.shape

In [None]:
# Apply the scaler fitted on the training features to the test features
# (transform only, no refit) and keep the test column names.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
)
X_test_scaled.head()

In [None]:
# Predict the target for every row of the standardized test features.
y_test_pred = lr_model.predict(X_test_scaled)

In [None]:
# Peek at the first 20 predictions and the shape of the prediction array.
y_test_pred[:20], y_test_pred.shape

In [None]:
# Wrap the predictions in a DataFrame indexed by 'id_0', 'id_1', ...
# The number of ids is derived from the prediction array instead of a
# hard-coded 240, so the cell still works if the test set size changes.
y_test_pred_df = pd.DataFrame(
    data=y_test_pred,
    index=['id_' + str(i) for i in range(len(y_test_pred))],
)
y_test_pred_df.head()

In [None]:
# Write the predictions as a header-less CSV (id, value per row).
# The output directory is created first so a fresh checkout does not fail,
# and `header=False` is the documented way to suppress the header row.
os.makedirs('./result', exist_ok=True)
y_test_pred_df.to_csv('./result/normal_equation_20190414.csv', header=False)

### 5. 梯度下降求解

In [None]:
# Linear regression trained by stochastic gradient descent:
#   - constant learning rate of 0.001, squared-error loss, L1 penalty;
#   - at most 1000 epochs, with early stopping monitored on an internal
#     validation split of 20% of the training data.
# `random_state` is fixed because both the sample shuffling and the internal
# validation split are random; without a seed every run gives different
# coefficients and predictions.
# NOTE(review): newer scikit-learn releases renamed 'squared_loss' to
# 'squared_error'; the old name is kept to match the scikit-learn version this
# notebook imports from (`sklearn.externals.joblib`).
sgd_lr_model = SGDRegressor(
    learning_rate='constant',
    eta0=0.001,
    loss='squared_loss',
    max_iter=1000,
    penalty='l1',
    early_stopping=True,
    validation_fraction=0.2,
    random_state=42,
)

In [None]:
# Fit the SGD model on the standardized features of the full training set;
# the estimator was configured with early_stopping=True, so it holds out its
# own validation fraction internally.
sgd_lr_model.fit(X_train_scaled, y_train_total)

In [None]:
# Persist the fitted SGD model to a pickle file in the working directory.
joblib.dump(sgd_lr_model, 'sgd_lr_model.pkl')

#### 参数

In [None]:
# Show the first 20 coefficients, the coefficient vector's shape, and the
# fitted intercept of the SGD model.
sgd_lr_model.coef_[:20], sgd_lr_model.coef_.shape, sgd_lr_model.intercept_

In [None]:
# In-sample predictions of the SGD model on the standardized training features.
y_train_total_pred2 = sgd_lr_model.predict(X_train_scaled)

#### 在训练集中的表现

In [None]:
# Actual vs. predicted target on the training data for the SGD model.
# Points close to the diagonal indicate a good in-sample fit. Axis labels and
# a title are set so the figure can be read without the surrounding code.
plt.scatter(y_train_total, y_train_total_pred2)
plt.xlabel('actual y (training set)')
plt.ylabel('predicted y')
plt.title('SGDRegressor: actual vs. predicted')
plt.show()

In [None]:
# R^2 of the SGD model on the training data itself: this measures in-sample
# fit, not generalization to unseen data.
r2_score(y_pred=y_train_total_pred2, y_true=y_train_total)

#### 在测试集上的预测（此处未使用标签，仅生成预测并写出结果文件）

In [None]:
# Predict the target for every row of the standardized test features with the
# SGD model.
y_test_pred2 = sgd_lr_model.predict(X_test_scaled)

In [None]:
# Peek at the first 20 SGD predictions and the shape of the prediction array.
y_test_pred2[:20], y_test_pred2.shape

In [None]:
# Wrap the SGD predictions in a DataFrame indexed by 'id_0', 'id_1', ...
# The number of ids is derived from the prediction array instead of a
# hard-coded 240, so the cell still works if the test set size changes.
y_test_pred2_df = pd.DataFrame(
    data=y_test_pred2,
    index=['id_' + str(i) for i in range(len(y_test_pred2))],
)
y_test_pred2_df.head()

In [None]:
# Write the SGD predictions as a header-less CSV (id, value per row).
# The output directory is created first so a fresh checkout does not fail,
# and `header=False` is the documented way to suppress the header row.
os.makedirs('./result', exist_ok=True)
y_test_pred2_df.to_csv('./result/sgd_20190414.csv', header=False)