In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold #cross_validation

from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet  # 批量导入要实现的回归算法
from sklearn.svm import LinearSVR  # SVM中的回归算法
from sklearn.svm import SVR  # SVM中的回归算法
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor  # 集成算法
from sklearn.neighbors import KNeighborsRegressor #  knn
from sklearn.model_selection import cross_val_score  # 交叉检验
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score  # 批量导入指标算法
from sklearn.model_selection import GridSearchCV 
import matplotlib.pyplot as plt  # 导入图形展示库
from sklearn.preprocessing import StandardScaler #引入标准化模块
import seaborn as sns
from sklearn.neural_network import MLPRegressor
from sklearn.utils.validation import check_array as check_arrays
from sklearn.model_selection import train_test_split

### Combine reconstructed LongLat data with full table created by step 1.

In [2]:
#read reconstructed longitude and latitude
re_Long = pd.read_csv('reconstructed_Longitude.csv',encoding='utf-8',header=None)
re_Lat = pd.read_csv('reconstructed_Lat.csv',encoding='utf-8',header=None)

In [3]:
# read original yet noise-filtered table
original_table = pd.read_csv('noise_filtered_Route11Bus51938440.csv',encoding='utf-8',header=None)

re_Long and re_lat don't have  the same length, so one needs to find the minimum of them before concat them to original table

In [4]:
min_len = min([len(re_Long),len(re_Lat)])

### prepare for the prediction dataset
From above, we know the min_len is the length of re_Lat

In [5]:
# original table should start here, at (len(original_table) - min_len)
table_1 = original_table.iloc[(len(original_table) - min_len):,:]

In [6]:
# slice re_Long in order to get the same length as re_Lat
re_Long = re_Long.tail(min_len) # get the last min_len rows of re_Long


## test prection based on unrecontructed data

In [None]:
# Do not run this one
def prediction_process(X,y):
    # 训练回归模型
    X_train_no_cv, X_test_no_cv, y_train_no_cv, y_test_no_cv = train_test_split(
    X, y, test_size=0.3, shuffle=False)
    
    n_folds = 6  # 设置交叉检验的次数
    model_knn = KNeighborsRegressor()  # 建立贝叶斯岭回归模型对象
    model_lr = LinearRegression()  # 建立普通线性回归模型对象
    model_ann = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)  # 建立ANN
    model_svr = LinearSVR(max_iter=3000)  # 建立支持向量机回归模型对象
    model_gbr = GradientBoostingRegressor(learning_rate=0.06, n_estimators=600,max_depth=5, min_samples_leaf =60, 
                   min_samples_split =1000, subsample=0.8)  # 建立梯度增强回归模型对象
    model_names = ['KNN', 'KF', 'ANN', 'LS-SVM', 'GBRT']  # 不同模型的名称列表
    model_dic = [model_knn, model_lr, model_ann, model_svr, model_gbr]  # 不同回归模型对象的集合

    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    cv_score_list = []  # 交叉检验结果列表
    pre_y_list = []  # 各个回归模型预测的y值列表
    for model in model_dic:  # 读出每个回归模型对象
        
        kf = KFold(n_splits=n_folds)
        scores = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            y_pred = model.fit(X_train,y_train).predict(X_test)
            
            y_diff = abs(y_test-y_pred)
            y_no_noise_loc = np.where(y_diff < 0.1)[0]
            y_test_filtered = y_test[y_no_noise_loc]
            y_pred_filtered = y_pred[y_no_noise_loc]
            scores.append(mean_absolute_error(y_test_filtered,y_pred_filtered))
        scores = np.array(scores)
        scores = scores.reshape([n_folds,])
        
        y_pred = model.fit(X_train_no_cv, y_train_no_cv).predict(X_test_no_cv)
        y_diff = abs(y_test_no_cv-y_pred)
        y_no_noise_loc = np.where(y_diff < 0.1)[0]
        y_test_filtered = y_test_no_cv[y_no_noise_loc]
        y_pred_filtered = y_pred[y_no_noise_loc]

        cv_score_list.append(scores)  # 将交叉检验结果存入结果列表
        pre_y_list.append(y_pred_filtered)  # 将回归训练中得到的预测y存入列表，要分训练集和预测集
    # 模型效果指标评估
    n_samples, n_features = X.shape  # 总样本量,总特征数
    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # 回归评估指标对象集
    model_metrics_list = []  # 回归评估指标列表
    for i in range(5):  # 循环每个模型索引
        tmp_list = []  # 每个内循环的临时结果列表
        for m in model_metrics_name:  # 循环每个指标对象
            tmp_score = m(y_test_filtered, pre_y_list[i])  # 计算每个回归指标结果
            tmp_list.append(tmp_score)  # 将结果存入每个内循环的临时结果列表
        model_metrics_list.append(tmp_list)  # 将结果存入回归评估指标列表
    df1 = pd.DataFrame(cv_score_list, index=model_names)  # 建立交叉检验的数据框
    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # 建立回归指标的数据框
    print ('samples: %d \t features: %d' % (n_samples, n_features))  # 打印输出样本量和特征数量
    print (70 * '-')  # 打印分隔线
    print ('cross validation result:')  # 打印输出标题
    print (df1)  # 打印输出交叉检验的数据框
    print (70 * '-')  # 打印分隔线
    print ('regression metrics:')  # 打印输出标题
    print (df2)  # 打印输出回归指标的数据框
    print (70 * '-')  # 打印分隔线
    print ('short name \t full name')  # 打印输出缩写和全名标题
    print ('ev \t explained_variance')
    print ('mae \t mean_absolute_error')
    print ('mse \t mean_squared_error')
    print ('r2 \t r2')
    print (70 * '-')  # 打印分隔线
    
    return pre_y_list, y_test_filtered

In [7]:
def prediction_process(X,y):
    # 训练回归模型
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False) # don't shuffle the data
    
    model_knn = KNeighborsRegressor()  # 建立贝叶斯岭回归模型对象
    model_lr = LinearRegression()  # 建立普通线性回归模型对象
    model_ann = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)  # 建立ANN
    model_svr = SVR(max_iter=3000)  # 建立支持向量机回归模型对象
    model_gbr = GradientBoostingRegressor(learning_rate=0.06, n_estimators=600,max_depth=5, min_samples_leaf =60, 
                   min_samples_split =1000, subsample=0.8)  # 建立梯度增强回归模型对象
    model_names = ['KNN', 'KF', 'ANN', 'LS-SVM', 'GBRT']  # 不同模型的名称列表
    model_dic = [model_knn, model_lr, model_ann, model_svr, model_gbr]  # 不同回归模型对象的集合

    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    pre_y_list = []  # 各个回归模型预测的y值列表
    for model in model_dic:  # 读出每个回归模型对象
        pre_y_list.append(model.fit(X_train, y_train).predict(X_test))  # 将回归训练中得到的预测y存入列表，要分训练集和预测集
    # 模型效果指标评估
    n_samples, n_features = X.shape  # 总样本量,总特征数
    model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]  # 回归评估指标对象集
    model_metrics_list = []  # 回归评估指标列表
    for i in range(5):  # 循环每个模型索引
        tmp_list = []  # 每个内循环的临时结果列表
        for m in model_metrics_name:  # 循环每个指标对象
            tmp_score = m(y_test, pre_y_list[i])  # 计算每个回归指标结果
            tmp_list.append(tmp_score)  # 将结果存入每个内循环的临时结果列表
        model_metrics_list.append(tmp_list)  # 将结果存入回归评估指标列表
    df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2'])  # 建立回归指标的数据框
    print ('samples: %d \t features: %d' % (n_samples, n_features))  # 打印输出样本量和特征数量
    print (70 * '-')  # 打印分隔线
    print ('regression metrics:')  # 打印输出标题
    print (df2)  # 打印输出回归指标的数据框
    print (70 * '-')  # 打印分隔线
    print ('short name \t full name')  # 打印输出缩写和全名标题
    print ('ev \t explained_variance')
    print ('mae \t mean_absolute_error')
    print ('mse \t mean_squared_error')
    print ('r2 \t r2')
    print (70 * '-')  # 打印分隔线
    
    return pre_y_list, y_test

### one-step prediction

#### one-step for long

In [53]:
prediction_step = 1

In [54]:
X_Long = np.array(re_Long.iloc[:,-1].head(min_len-prediction_step)) # cause y has to shift prediction_step units用这一个reconstructed space中的state variable 预测下一个状态的original space中的state variable

need to reshap X, an array of dimension (number,)doesn't work for sklearn predictor

In [55]:
X_Long = X_Long.reshape([min_len-prediction_step,1]) 

In [56]:
y_Long = np.array(re_Long.iloc[:,-1].tail(min_len-prediction_step))
y_Long = y_Long.reshape([min_len-prediction_step,1]) 

In [57]:
pre_y_list, y_test = prediction_process(X_Long,y_Long)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


samples: 96261 	 features: 1
----------------------------------------------------------------------
regression metrics:
              ev       mae       mse        r2
KNN    -0.220273  0.042621  3.648316 -0.220292
KF     -0.203962  0.062589  3.599656 -0.204016
ANN    -0.203962  0.062590  3.599657 -0.204016
LS-SVM -0.000005  0.114636  2.993926 -0.001411
GBRT   -0.141086  0.080732  3.412313 -0.141354
----------------------------------------------------------------------
short name 	 full name
ev 	 explained_variance
mae 	 mean_absolute_error
mse 	 mean_squared_error
r2 	 r2
----------------------------------------------------------------------


In [58]:
np.set_printoptions(edgeitems=100)
y_test

array([[119.97971 ],
       [119.979496],
       [119.9792  ],
       [119.979004],
       [119.979004],
       [119.978961],
       [119.97858 ],
       [119.977875],
       [119.977731],
       [119.977255],
       [119.977255],
       [119.977143],
       [119.977055],
       [119.976691],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976433],
       [119.975818],
       [119.975076],
       [119.974245],
       [119.973343],
       [119.973158],
       [119.973158],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.972905],
       [119.972401],
       [119.971726],
       [119.971043],
       [119.971248],
       [119.971736],
       [119.972191],
       [119.972353],
       [119.972498],
       [119.972576],
       [119.972635],
       [119.97289 ],
       [119.9

In [59]:
pre_y_list[0]

array([[119.9797684],
       [119.9797282],
       [119.9794778],
       [119.9789054],
       [119.9792684],
       [119.9792684],
       [119.9788268],
       [119.9781854],
       [119.9777768],
       [119.9777386],
       [119.9772654],
       [119.9772654],
       [119.9771072],
       [119.9771382],
       [119.9766094],
       [119.9765506],
       [119.9765506],
       [119.9765506],
       [119.9765506],
       [119.9765506],
       [119.9765506],
       [119.976384 ],
       [119.9758676],
       [119.975048 ],
       [119.973987 ],
       [119.9734166],
       [119.972948 ],
       [119.972948 ],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9731154],
       [119.9729216],
       [119.9724148],
       [119.9716346],
       [119.9710992],
       [119.9712964],
       [119.97168  ],
       [119.9721646],
       [119.9724184],
       [119.972485 ],
       [11

In [60]:
y_test-pre_y_list[0]

array([[-5.84000000e-05],
       [-2.32200000e-04],
       [-2.77800000e-04],
       [ 9.86000000e-05],
       [-2.64400000e-04],
       [-3.07400000e-04],
       [-2.46800000e-04],
       [-3.10400000e-04],
       [-4.58000000e-05],
       [-4.83600000e-04],
       [-1.04000000e-05],
       [-1.22400000e-04],
       [-5.22000000e-05],
       [-4.47200000e-04],
       [-6.14000000e-05],
       [-2.60000000e-06],
       [-2.60000000e-06],
       [-2.60000000e-06],
       [-2.60000000e-06],
       [-2.60000000e-06],
       [-1.17600000e-04],
       [-5.66000000e-04],
       [-7.91600000e-04],
       [-8.03000000e-04],
       [-6.44000000e-04],
       [-2.58600000e-04],
       [ 2.10000000e-04],
       [ 1.57000000e-04],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-1.04000000e-05],
       [-2.10400000e-04],
       [-5.20600000e-04],
       [-6.88800000e-04],
       [-5.9

In [61]:
test = abs(y_test-pre_y_list[0])

In [62]:
len(test[test>0.01])

15

In [63]:
len(test[test>0.1])

14

one-step error even for unreconstructed data is small.

output these data for visualization purpose.

In [64]:
y_no_noise_loc = np.where(test < 0.1)[0]
y_test_filtered = y_test[y_no_noise_loc].reshape(-1,1)
y_pred_filtered = pre_y_list[0][y_no_noise_loc]

np.c_[y_test_filtered, y_pred_filtered]

array([[119.97971  , 119.9797684],
       [119.979496 , 119.9797282],
       [119.9792   , 119.9794778],
       [119.979004 , 119.9789054],
       [119.979004 , 119.9792684],
       [119.978961 , 119.9792684],
       [119.97858  , 119.9788268],
       [119.977875 , 119.9781854],
       [119.977731 , 119.9777768],
       [119.977255 , 119.9777386],
       [119.977255 , 119.9772654],
       [119.977143 , 119.9772654],
       [119.977055 , 119.9771072],
       [119.976691 , 119.9771382],
       [119.976548 , 119.9766094],
       [119.976548 , 119.9765506],
       [119.976548 , 119.9765506],
       [119.976548 , 119.9765506],
       [119.976548 , 119.9765506],
       [119.976548 , 119.9765506],
       [119.976433 , 119.9765506],
       [119.975818 , 119.976384 ],
       [119.975076 , 119.9758676],
       [119.974245 , 119.975048 ],
       [119.973343 , 119.973987 ],
       [119.973158 , 119.9734166],
       [119.973158 , 119.972948 ],
       [119.973105 , 119.972948 ],
       [119.973105 ,

In [66]:
long_1_y = y_test
long_1_y_pred = pre_y_list[0]

In [65]:
mean_absolute_error(y_test_filtered,y_pred_filtered)

0.0002475605904671898

#### one-step for Lat

In [67]:
prediction_step = 1
X_Lat = np.array(re_Lat.iloc[:,-1].head(min_len-prediction_step))
X_Lat = X_Lat.reshape([min_len-prediction_step,1]) 
y_Lat = np.array(re_Lat.iloc[:,-1].tail(min_len-prediction_step))
y_Lat = y_Lat.reshape([min_len-prediction_step,1]) 

In [68]:
pre_y_list, y_test = prediction_process(X_Lat,y_Lat)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


samples: 96261 	 features: 1
----------------------------------------------------------------------
regression metrics:
              ev       mae       mse        r2
KNN    -0.577673  0.018112  0.332413 -0.577775
KF     -0.202264  0.020985  0.253309 -0.202313
ANN    -0.202199  0.020633  0.253295 -0.202249
LS-SVM  0.001848  0.091588  0.216442 -0.027329
GBRT   -0.160047  0.020974  0.244443 -0.160235
----------------------------------------------------------------------
short name 	 full name
ev 	 explained_variance
mae 	 mean_absolute_error
mse 	 mean_squared_error
r2 	 r2
----------------------------------------------------------------------


In [69]:
np.set_printoptions(edgeitems=100)
y_test

array([[31.785443],
       [31.785275],
       [31.785091],
       [31.784968],
       [31.784968],
       [31.784931],
       [31.784673],
       [31.784148],
       [31.784039],
       [31.783698],
       [31.783698],
       [31.783615],
       [31.783503],
       [31.783188],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783   ],
       [31.782546],
       [31.781975],
       [31.781321],
       [31.780689],
       [31.780573],
       [31.780573],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780398],
       [31.780031],
       [31.779636],
       [31.779298],
       [31.778806],
       [31.778085],
       [31.777403],
       [31.777141],
       [31.776921],
       [31.776838],
       [31.776778],
       [31.776474],
       [31.776293],
       [31.776293],
       [31.776126],


In [70]:
pre_y_list[0]

array([[31.7857406],
       [31.7856668],
       [31.7851404],
       [31.7851132],
       [31.784952 ],
       [31.784952 ],
       [31.7848202],
       [31.7844028],
       [31.7844116],
       [31.7841234],
       [31.7838118],
       [31.7838118],
       [31.7837906],
       [31.7834258],
       [31.783194 ],
       [31.7830586],
       [31.7830586],
       [31.7830586],
       [31.7830586],
       [31.7830586],
       [31.7830586],
       [31.7827034],
       [31.7823306],
       [31.7816266],
       [31.7815098],
       [31.7808168],
       [31.780557 ],
       [31.780557 ],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7805434],
       [31.7802716],
       [31.7800554],
       [31.7794886],
       [31.7789408],
       [31.778788 ],
       [31.7781506],
       [31.7774626],
       [31.7770284],
       [31.7769014],
       [31.7770168],
       [31.7765698],
       [31.77

In [71]:
y_test-pre_y_list[0]

array([[-2.97600000e-04],
       [-3.91800000e-04],
       [-4.94000000e-05],
       [-1.45200000e-04],
       [ 1.60000000e-05],
       [-2.10000000e-05],
       [-1.47200000e-04],
       [-2.54800000e-04],
       [-3.72600000e-04],
       [-4.25400000e-04],
       [-1.13800000e-04],
       [-1.96800000e-04],
       [-2.87600000e-04],
       [-2.37800000e-04],
       [-1.11000000e-04],
       [ 2.44000000e-05],
       [ 2.44000000e-05],
       [ 2.44000000e-05],
       [ 2.44000000e-05],
       [ 2.44000000e-05],
       [-5.86000000e-05],
       [-1.57400000e-04],
       [-3.55600000e-04],
       [-3.05600000e-04],
       [-8.20800000e-04],
       [-2.43800000e-04],
       [ 1.60000000e-05],
       [-1.90000000e-05],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-5.40000001e-06],
       [-1.45400000e-04],
       [-2.40600000e-04],
       [-4.19400000e-04],
       [-1.9

In [72]:
test = abs(y_test-pre_y_list[0])

In [73]:
len(test[test>0.01])

30

In [74]:
len(test[test>0.1])

30

In [75]:
y_no_noise_loc = np.where(test < 0.1)[0]
y_test_filtered = y_test[y_no_noise_loc].reshape(-1,1)
y_pred_filtered = pre_y_list[0][y_no_noise_loc]

np.c_[y_test_filtered, y_pred_filtered]

array([[31.785443 , 31.7857406],
       [31.785275 , 31.7856668],
       [31.785091 , 31.7851404],
       [31.784968 , 31.7851132],
       [31.784968 , 31.784952 ],
       [31.784931 , 31.784952 ],
       [31.784673 , 31.7848202],
       [31.784148 , 31.7844028],
       [31.784039 , 31.7844116],
       [31.783698 , 31.7841234],
       [31.783698 , 31.7838118],
       [31.783615 , 31.7838118],
       [31.783503 , 31.7837906],
       [31.783188 , 31.7834258],
       [31.783083 , 31.783194 ],
       [31.783083 , 31.7830586],
       [31.783083 , 31.7830586],
       [31.783083 , 31.7830586],
       [31.783083 , 31.7830586],
       [31.783083 , 31.7830586],
       [31.783    , 31.7830586],
       [31.782546 , 31.7827034],
       [31.781975 , 31.7823306],
       [31.781321 , 31.7816266],
       [31.780689 , 31.7815098],
       [31.780573 , 31.7808168],
       [31.780573 , 31.780557 ],
       [31.780538 , 31.780557 ],
       [31.780538 , 31.7805434],
       [31.780538 , 31.7805434],
       [31

In [76]:
lat_1_y = y_test
lat_1_y_pred = pre_y_list[0]

In [77]:
mean_absolute_error(y_test_filtered,y_pred_filtered)

0.00025874052957397704

### Multi-step prediction

In [78]:
prediction_step = 60

In [104]:
X_Long = np.array(re_Long.iloc[:,-1].head(min_len-prediction_step)) # cause y has to shift prediction_step units用这一个reconstructed space中的state variable 预测下一个状态的original space中的state variable
X_Long = X_Long.reshape([min_len-prediction_step,1]) 
y_Long = np.array(re_Long.iloc[:,-1].tail(min_len-prediction_step))
y_Long = y_Long.reshape([min_len-prediction_step,1]) 

In [105]:
multi_pre_y_list, multi_y_test = prediction_process(X_Long,y_Long)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


samples: 96202 	 features: 1
----------------------------------------------------------------------
regression metrics:
                  ev       mae       mse        r2
KNN    -2.002555e-01  0.055625  3.590650 -0.200256
KF      9.403496e-09  0.073121  2.992113 -0.000181
ANN     9.258032e-09  0.073122  2.992113 -0.000181
LS-SVM -1.119784e-06  0.107101  2.994853 -0.001097
GBRT   -5.668308e-02  0.079016  3.161450 -0.056786
----------------------------------------------------------------------
short name 	 full name
ev 	 explained_variance
mae 	 mean_absolute_error
mse 	 mean_squared_error
r2 	 r2
----------------------------------------------------------------------


In [106]:
multi_y_test

array([[119.977055],
       [119.976691],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976548],
       [119.976433],
       [119.975818],
       [119.975076],
       [119.974245],
       [119.973343],
       [119.973158],
       [119.973158],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.973105],
       [119.972905],
       [119.972401],
       [119.971726],
       [119.971043],
       [119.971248],
       [119.971736],
       [119.972191],
       [119.972353],
       [119.972498],
       [119.972576],
       [119.972635],
       [119.97289 ],
       [119.973029],
       [119.973029],
       [119.97316 ],
       [119.97341 ],
       [119.97341 ],
       [119.97341 ],
       [119.97341 ],
       [119.97341 ],
       [119.973428],
       [119.973239],
       [119.972671],
       [119.972219],
       [119.9

In [107]:
multi_pre_y_list[0]

array([[119.968642 ],
       [119.9737716],
       [119.9737716],
       [119.970773 ],
       [119.9756564],
       [119.9675276],
       [119.9727206],
       [119.9682176],
       [119.9710722],
       [119.9637596],
       [119.9637596],
       [119.9637596],
       [119.9637596],
       [119.9768434],
       [119.967054 ],
       [119.9730012],
       [119.979222 ],
       [119.9767342],
       [119.9739268],
       [119.971404 ],
       [119.971294 ],
       [119.9778262],
       [119.9788062],
       [119.9788062],
       [119.9788062],
       [119.9788062],
       [119.967717 ],
       [119.9673696],
       [119.9686482],
       [119.9724116],
       [119.9729864],
       [119.970073 ],
       [119.970073 ],
       [119.9715914],
       [119.9715914],
       [119.9686988],
       [119.9710676],
       [119.9699478],
       [119.9701824],
       [119.973984 ],
       [119.9716624],
       [119.9708072],
       [119.9708072],
       [119.9708072],
       [119.9708072],
       [11

In [108]:
len(multi_pre_y_list[0])

19241

In [109]:
multi_y_test-multi_pre_y_list[0]

array([[ 0.008413 ],
       [ 0.0029194],
       [ 0.0027764],
       [ 0.005775 ],
       [ 0.0008916],
       [ 0.0090204],
       [ 0.0038274],
       [ 0.0083304],
       [ 0.0053608],
       [ 0.0120584],
       [ 0.0113164],
       [ 0.0104854],
       [ 0.0095834],
       [-0.0036854],
       [ 0.006104 ],
       [ 0.0001038],
       [-0.006117 ],
       [-0.0036292],
       [-0.0008218],
       [ 0.001701 ],
       [ 0.001811 ],
       [-0.0047212],
       [-0.0057012],
       [-0.0059012],
       [-0.0064052],
       [-0.0070802],
       [ 0.003326 ],
       [ 0.0038784],
       [ 0.0030878],
       [-0.0002206],
       [-0.0006334],
       [ 0.002425 ],
       [ 0.002503 ],
       [ 0.0010436],
       [ 0.0012986],
       [ 0.0043302],
       [ 0.0019614],
       [ 0.0032122],
       [ 0.0032276],
       [-0.000574 ],
       [ 0.0017476],
       [ 0.0026028],
       [ 0.0026028],
       [ 0.0026208],
       [ 0.0024318],
       [ 0.0018638],
       [ 0.0014118],
       [-0.00

In [110]:
test = abs(multi_y_test-multi_pre_y_list[0])

In [111]:
len(test[test>0.1])

24

In [112]:
len(test[test>0.01])

3850

In [113]:
y_no_noise_loc = np.where(test < 0.1)[0]
y_test_filtered = multi_y_test[y_no_noise_loc].reshape(-1,1)
y_pred_filtered = multi_pre_y_list[0][y_no_noise_loc]

In [114]:
long_multi_y = multi_y_test
long_multi_y_pred = multi_pre_y_list[0]

In [115]:
mean_absolute_error(y_test_filtered,y_pred_filtered)

0.005750184836342627

#### prediction test for latitude

In [92]:
prediction_step = 60
X_Lat = np.array(re_Lat.iloc[:,-1].head(min_len-prediction_step))
X_Lat = X_Lat.reshape([min_len-prediction_step,1]) 
y_Lat = np.array(re_Lat.iloc[:,-1].tail(min_len-prediction_step))
y_Lat = y_Lat.reshape([min_len-prediction_step,1]) 

In [93]:
multi_pre_y_list_Lat, multi_y_test_Lat = prediction_process(X_Lat,y_Lat)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


samples: 96202 	 features: 1
----------------------------------------------------------------------
regression metrics:
                  ev       mae       mse        r2
KNN    -4.280815e-01  0.028231  0.301101 -0.428266
KF      5.764970e-07  0.027390  0.210851 -0.000165
ANN     1.792645e-06  0.027384  0.210850 -0.000164
LS-SVM  7.011514e-04  0.069192  0.213803 -0.014171
GBRT   -1.170266e-02  0.026173  0.213300 -0.011785
----------------------------------------------------------------------
short name 	 full name
ev 	 explained_variance
mae 	 mean_absolute_error
mse 	 mean_squared_error
r2 	 r2
----------------------------------------------------------------------


In [94]:
multi_pre_y_list_Lat[0]

array([[31.789951 ],
       [31.8128132],
       [31.8128132],
       [31.7999574],
       [31.7824204],
       [31.7940972],
       [31.7927562],
       [31.798676 ],
       [31.7893386],
       [31.7933736],
       [31.7933736],
       [31.7933736],
       [31.7933736],
       [31.7789222],
       [31.8011168],
       [31.786832 ],
       [25.4235764],
       [31.787185 ],
       [31.7879764],
       [31.779205 ],
       [31.7836576],
       [31.7961042],
       [25.4346164],
       [25.4346164],
       [25.4346164],
       [25.4346164],
       [31.7839298],
       [31.7873968],
       [31.7994614],
       [31.7988268],
       [31.7829956],
       [31.7765472],
       [31.7765472],
       [31.7754794],
       [31.7754794],
       [31.782231 ],
       [31.7796592],
       [31.7855828],
       [31.7942288],
       [31.7778744],
       [31.7843388],
       [31.7737848],
       [31.7737848],
       [31.7737848],
       [31.7737848],
       [31.7737848],
       [31.7737848],
       [31.78

In [95]:
multi_y_test_Lat

array([[31.783503],
       [31.783188],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783083],
       [31.783   ],
       [31.782546],
       [31.781975],
       [31.781321],
       [31.780689],
       [31.780573],
       [31.780573],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780538],
       [31.780398],
       [31.780031],
       [31.779636],
       [31.779298],
       [31.778806],
       [31.778085],
       [31.777403],
       [31.777141],
       [31.776921],
       [31.776838],
       [31.776778],
       [31.776474],
       [31.776293],
       [31.776293],
       [31.776126],
       [31.775826],
       [31.775826],
       [31.775826],
       [31.775826],
       [31.775826],
       [31.775798],
       [31.775473],
       [31.775024],
       [31.774563],
       [31.774443],
       [31.774173],
       [31.774101],


In [96]:
abs(multi_pre_y_list_Lat[0]-multi_y_test_Lat)

array([[6.4480000e-03],
       [2.9625200e-02],
       [2.9730200e-02],
       [1.6874400e-02],
       [6.6260000e-04],
       [1.1014200e-02],
       [9.6732000e-03],
       [1.5593000e-02],
       [6.3386000e-03],
       [1.0827600e-02],
       [1.1398600e-02],
       [1.2052600e-02],
       [1.2684600e-02],
       [1.6508000e-03],
       [2.0543800e-02],
       [6.2940000e-03],
       [6.3569616e+00],
       [6.6470000e-03],
       [7.4384000e-03],
       [1.3330000e-03],
       [3.1196000e-03],
       [1.5566200e-02],
       [6.3459216e+00],
       [6.3457816e+00],
       [6.3454146e+00],
       [6.3450196e+00],
       [4.6318000e-03],
       [8.5908000e-03],
       [2.1376400e-02],
       [2.1423800e-02],
       [5.8546000e-03],
       [3.7380000e-04],
       [2.9080000e-04],
       [1.2986000e-03],
       [9.9460000e-04],
       [5.9380000e-03],
       [3.3662000e-03],
       [9.4568000e-03],
       [1.8402800e-02],
       [2.0484000e-03],
       [8.5128000e-03],
       [2.041200

In [97]:
test_Lat = abs(multi_pre_y_list_Lat[0]-multi_y_test_Lat)

In [98]:
len(test_Lat[test_Lat>0.1])

41

In [99]:
len(test_Lat[test_Lat>0.01])

6880

In [100]:
y_no_noise_loc = np.where(test_Lat < 0.1)[0]
y_test_filtered = multi_y_test_Lat[y_no_noise_loc].reshape(-1,1)
y_pred_filtered = multi_pre_y_list_Lat[0][y_no_noise_loc]

In [102]:
lat_multi_y = multi_y_test_Lat
lat_multi_y_pred = multi_pre_y_list_Lat[0]

In [103]:
mean_absolute_error(y_test_filtered,y_pred_filtered)

0.008738788520833379

误差主要存在于mae再0.1-0.01这个范围，但是embedding以后的效果比这个好很多

### Output for Visualization

In [117]:
pd.DataFrame(np.c_[long_1_y,long_1_y_pred,lat_1_y,lat_1_y_pred]).to_csv("unrecontrstructed_prediction_output_1step.csv",index=False)

In [118]:
pd.DataFrame(np.c_[long_multi_y,long_multi_y_pred,lat_multi_y,lat_multi_y_pred]).to_csv("unrecontrstructed_prediction_output_multi_step.csv",index=False)