**Import statements**


In [1]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
import numpy as np
import pykrige.kriging_tools as kt
from pykrige.ok import OrdinaryKriging
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

import constants
import functions5 as functions

%matplotlib inline

**Load Data**

In [2]:
all_static_data = functions.load_static_sensors_calibrated_2()
all_static_data = functions.transform_to_grid_coordinates(all_static_data)
mobile_sensor_data = functions.load_mobile_sensors_2()
mobile_sensor_data = functions.transform_to_grid_coordinates(mobile_sensor_data)

**Compute predictions for stationary sensors**

In [3]:
def correct(pred_value, error_dict, time=None):
    if len(error_dict) < 10:
        return pred_value
    if pred_value + np.mean([y for x,y in error_dict]) < 0:
    #if pred_value + max(error_dict, key=lambda item:item[0])[1] < 0:
        return 0.0
    return pred_value + np.mean([y for x,y in error_dict])
    #return pred_value + max(error_dict, key=lambda item:item[0])[1]

In [4]:
mae = []
mse = []
squares = []

start_time = '2018-07-23 00:00:00'
start_window = '2018-07-23 13:45:00'
end_window = '2018-07-23 16:00:00'

number_of_windows = 20
#Length of window
window = 15
par_grid = functions.create_par(c=0.00002, epsilon=0.001)

# Load the 1st dataset pickle file
# grid_of_errors = pickle.load( open( "tmp_error_grid/grid_of_errors.p", "rb" ) )
# Empty error grid
grid_of_errors = [[[] for x in range(20)] for y in range(20)]

timeint_on_first_window = 0
tmp_to_test_filename = 'tmp_to_test/with_mobile_data_{}_{}.csv'.format(start_window, number_of_windows)

for i in range(1, number_of_windows+1):
    end_time = start_window
    # treinar sem dados moveis
    # testar em dados fixos em vez de móveis
    print(start_window)
    print(end_window)
    timeint, par_grid, grid_of_errors = functions.train(all_static_data, mobile_sensor_data, start_time, end_time, par_grid, window, grid_of_errors, timeint_on_first_window)
    to_test = functions.test_mobile(par_grid, timeint, mobile_sensor_data, start_window, end_window, 1)

    to_test.head()
    
    # Calcular o erro numa das colunas do to_test
    # Calculate the error in one of the to_test columns
    to_test['error_PM2.5'] = to_test['PM2.5'] - to_test['pred_PM2.5']
    to_test['start_window'] = start_window
    
    # Guardar os erros raw numa coluna pq depois vou descontar os bias na pred_PM2.5
    to_test['pred_PM2.5_raw'] = to_test['pred_PM2.5']
    
    # Guardar o tempo em que aconteceu
    to_test['Timestamp'] = start_window
    
    # Corrigir as previsões com os erros
    for index, row in to_test.iterrows():
        # aplicar a função correct que aplica a função mean
        # Fazer aqui o alargamento dos erros.
        errors = grid_of_errors[row['lat_grid']][row['long_grid']]
        
        to_test.at[index, 'pred_PM2.5'] = correct(row['pred_PM2.5'], errors)
        
    # Meter os erros na grid
    for index, row in to_test.iterrows():
        #O uso de index deve ser mudado para o timestamp
        grid_of_errors[int(row['lat_grid'])][int(row['long_grid'])].append((row['start_window'], row['error_PM2.5']))
        if len(grid_of_errors[int(row['lat_grid'])][int(row['long_grid'])]) > 10:
            grid_of_errors[int(row['lat_grid'])][int(row['long_grid'])].pop(0)
    
    
    squares.append(to_test.shape[0])
    mae.append(mean_absolute_error(to_test['PM2.5'], to_test['pred_PM2.5']))
    mse.append(mean_squared_error(to_test['PM2.5'], to_test['pred_PM2.5']))

    start_time = end_time
    start_window = (datetime.strptime(start_window, '%Y-%m-%d %H:%M:%S') + timedelta(minutes=15)).strftime("%Y-%m-%d %H:%M:%S")
    end_window = (datetime.strptime(end_window, '%Y-%m-%d %H:%M:%S') + timedelta(minutes=15)).strftime("%Y-%m-%d %H:%M:%S")
    timeint_on_first_window = timeint+1
    

hello
hello
hello
hello
hello
hello
2018-07-23 13:45:00
2018-07-23 14:00:00
                    Timestamp       PM1     PM2.5      PM10  temperature  \
0  2018-07-23 13:59:25.516000  0.528595  0.750815  2.633142         23.6   
1  2018-07-23 13:59:35.885000  0.619950  0.771881  0.781830         23.6   
2  2018-07-23 13:59:46.915000  0.551724  0.819529  4.493389         23.6   
3  2018-07-23 13:59:56.024000  0.611184  0.854221  0.933319         23.6   

    humidity        lat      long  lat_grid  long_grid  hour  
0  60.900002  55.944708 -3.187169         4         12    13  
1  60.900002  55.944708 -3.187169         4         12    13  
2  60.900002  55.944702 -3.187170         4         12    13  
3  60.900002  55.944702 -3.187170         4         12    13  
13
2018-07-23 14:00:00
2018-07-23 14:15:00
                     Timestamp       PM1     PM2.5        PM10  temperature  \
4   2018-07-23 14:00:06.388000  0.752892  2.224889   35.208408    23.700001   
5   2018-07-23 14:00:16.097

                      Timestamp       PM1      PM2.5       PM10  temperature  \
93   2018-07-23 14:15:06.052000  1.200547   1.576934   2.409930    24.500000   
94   2018-07-23 14:15:23.563000  1.456661   2.262476   7.551739    24.500000   
95   2018-07-23 14:15:30.737000  1.637878   2.379382   3.117072    24.500000   
96   2018-07-23 14:15:39.140000  1.112655   2.148991  10.510995    24.500000   
97   2018-07-23 14:15:48.211000  1.265826   2.074138  10.206800    24.500000   
98   2018-07-23 14:16:03.823000  0.894437   1.502059  10.287084    24.500000   
99   2018-07-23 14:16:12.875000  0.687240   1.155703   2.652196    24.500000   
100  2018-07-23 14:16:21.992000  1.208372   2.123311   4.498982    24.500000   
101  2018-07-23 14:16:30.391000  0.961288   1.847505  66.007027    24.500000   
102  2018-07-23 14:16:38.823000  1.040229   1.328835   1.603833    24.500000   
103  2018-07-23 14:16:46.608000  0.894291   1.275765   3.434155    24.500000   
104  2018-07-23 14:17:03.476000  1.17458

                      Timestamp       PM1     PM2.5       PM10  temperature  \
182  2018-07-23 14:30:00.684000  1.731278  2.431964   3.313440    24.600000   
183  2018-07-23 14:30:09.114000  1.612133  2.705272   6.207786    24.600000   
184  2018-07-23 14:30:17.546000  1.538367  2.686589  16.000565    24.600000   
185  2018-07-23 14:30:25.363000  1.325882  1.841724   3.766316    24.600000   
186  2018-07-23 14:30:41.552000  1.158594  1.770619  27.090498    24.600000   
187  2018-07-23 14:30:49.356000  1.383762  1.822909   1.905572    24.600000   
188  2018-07-23 14:30:58.456000  1.173685  1.999181  26.116425    24.600000   
189  2018-07-23 14:31:06.205000  1.440342  2.583135  21.248945    24.600000   
190  2018-07-23 14:32:05.886000  0.856731  1.620804  12.749604    25.000000   
191  2018-07-23 14:32:22.109000  0.852074  1.991443   6.278054    25.200001   
192  2018-07-23 14:32:31.190000  1.242380  3.395884   8.812357    25.200001   
193  2018-07-23 14:32:39.622000  0.714024  1.179614 

KeyError: 'pred_PM2.5'

In [None]:
#grid_of_errors[5][12]


In [None]:
print(mae)
print(squares)

In [None]:
lens=[]
maxi = 1
max_i = 0
max_j = 0
for i in range(20):
    for j in range(20):
        lens.append(len(grid_of_errors[i][j]))
        if maxi <= len(grid_of_errors[i][j]):
            max_i = i
            max_j = j
            maxi = len(grid_of_errors[i][j])
            print("-----------")
            print ("i: {}\nj: {}\nmax: {} ".format(max_i,max_j, maxi))
            print("-----------")

In [None]:
bins = range(18)
plt.hist(lens, align='mid', bins = bins)
def bins_labels(bins, **kwargs):
    bin_w = (max(bins) - min(bins)) / (len(bins) - 1)
    plt.xticks(np.arange(min(bins)+bin_w/2, max(bins), bin_w), bins, **kwargs)
    plt.xlim(bins[0], bins[-1])
bins_labels(bins, fontsize=12)
plt.xlabel("Number of 15 minute windows with mobile data")
plt.ylabel("Number of grid cells")
# plt.savefig("graphs/histogram_mobile.eps")


Previsao

tiro lhe a media do erro

essa e a minha real previsao

guardar na estrutura o erro obtido ANTES da subtração da media do erro



ver a noçao de locality

olhar para a previsao de celulas adjacentes e erro de celulas adjacentes

In [None]:
x_hist=[]
y_hist=[]
for i in range(20):
    for j in range(20):
        for k in range(len(grid_of_errors[i][j])):
            x_hist.append(19-i)
            y_hist.append(j)


In [None]:
hist = plt.hist2d(y_hist, x_hist, bins=[20,20])
plt.colorbar(label="Number 15 minute windows with mobile data")
x_static = [19-x for x,y in constants.gridStaticCoords.values()]
y_static = [y for x,y in constants.gridStaticCoords.values()]
plt.scatter(x=y_static, y=x_static, c='r')
plt.xlabel("Grid columns")
plt.ylabel("Grid rows")
plt.xticks(color='w')
plt.yticks(color='w')
# plt.savefig("graphs/2D_mobile_data.eps")
