# Generate new weather features (2nd version)

## Find 10 closet stations, after removing outliers, calculate weighted mean for current station

In [23]:
# data engineer
import pandas as pd
import numpy as np
import queue
from pprint import pprint
from sklearn.ensemble import IsolationForest
import datetime
from tqdm import tqdm
import math
FILE_FOLDER = '../prediction_data/%s'
AFTER_FILE_FOLDER = '../prediction_data/afterAnalysis/%s'

In [24]:
def point_dis_square(x1, y1, x2, y2):
    dis = (x1 * 1.0 - x2) * (x1 * 1.0 - x2) + (y1 * 1.0 - y2) * (y1 * 1.0 - y2)
    return math.sqrt(dis)

def get_nearest_stations(df_grid_stations=pd.read_csv(AFTER_FILE_FOLDER % 'grid_stations.csv'),closest_num=10):
    df_aq_stations = pd.read_csv(AFTER_FILE_FOLDER % 'aq_stations.csv')
#     df_grid_stations = pd.read_csv(AFTER_FILE_FOLDER % 'grid_stations.csv')

    aq_len = len(df_aq_stations.stationId)
    grid_len = len(df_grid_stations)

#     closest_num = 10
    aq_closest_dict = {}

    for i in range(aq_len):

        aq_stationId = df_aq_stations.stationId[i]

        aq_la = df_aq_stations.latitude[i]
        aq_lo = df_aq_stations.longitude[i]
        aq_closest_dict[aq_stationId] = []
        aq_q = queue.PriorityQueue()

        for j in range(grid_len):
            grid_la = df_grid_stations.latitude[j]
            grid_lo = df_grid_stations.longitude[j]
            stationId = df_grid_stations.gridStationId[j]

            y = (point_dis_square(aq_la, aq_lo, grid_la, grid_lo))
            if (aq_q.qsize() < closest_num):
                aq_q.put((-1.0 * y, stationId))
            else:
                if (aq_q.queue[0][0] < -1.0 * y):
                    aq_q.get()
                    aq_q.put((-1.0 * y, stationId))

        while (not aq_q.empty()):
            current_ele = aq_q.get()
            aq_closest_dict[aq_stationId].append((-1*current_ele[0],current_ele[1]))
        # pprint(aq_closest_dict[aq_stationId])

    return aq_closest_dict,df_aq_stations

In [25]:
def tupleArry_to_list(tupleArry, i_ele):
    if(i_ele == 0):
        w_list = list(map(lambda x:1.0 / x[i_ele],tupleArry))
        return w_list
    return list(map(lambda x:x[i_ele],tupleArry))

def get_mean_without_outliers(df, feature_name):
    df_sub = df.copy()
    pre_mean = df_sub[feature_name].mean()
    """remove abs(num - mean) / mean > 50%"""
    df_sub = df_sub[np.abs((df_sub[feature_name] - df_sub[feature_name].mean())/pre_mean) <= 0.5]
    df_nearest_3 = df_sub.tail(3)
    
    weight_sum = df_nearest_3['weight'].sum()
    df_nearest_3['weight'] = df_nearest_3['weight'] / weight_sum
    df_nearest_3['after_weighted'] = df_nearest_3['weight'] * df_sub[feature_name]
    return np.around(df_nearest_3['after_weighted'].sum(), decimals=2)


def generate_features_for_aqStations(aq_closest_dict,df_grid, start_time, end_time):
    starttime = datetime.datetime.now()
    
    df_grid_stations = df_grid.set_index(['station_id'])
    dict_making = {
        'aq_station_id':[],
        'temperature':[],
        'humidity':[],
        'pressure':[],
        'wind_speed':[],
        'time':[]
    }
    
    time_span = list(pd.date_range(start = start_time, end = end_time, closed=None, freq='H'))
    time_span = list(map(lambda x:str(x),time_span))
    
    for aq_station_id,aq_station_value in tqdm(aq_closest_dict.items(),total=len(aq_closest_dict)):
        stationId_list = tupleArry_to_list(aq_station_value, 1)
        weight_dis = tupleArry_to_list(aq_station_value, 0)
        
        df_grid_temp = df_grid_stations.loc[stationId_list].reset_index().set_index(['time'])
        for t_hour in time_span:
#             print(df_grid_inOneHour)
            try:
                df_grid_inOneHour = df_grid_temp.loc[t_hour]
            
                df_merged = df_grid_inOneHour
                df_merged.station_id = df_merged.station_id.astype("category")
                df_merged.station_id.cat.set_categories(stationId_list, inplace=True)
                df_merged = df_merged.sort_values(['station_id']).reset_index()
                df_merged['weight'] = pd.Series(weight_dis)
                for f_name in ['temperature', 'pressure', 'humidity', 'wind_speed']:
                    try:
                        new_f = get_mean_without_outliers(df_merged[[f_name,'weight']],f_name)
#                         new_f = df_merged[f_name].mean()
                        dict_making[f_name].append(new_f)
                    except:
                        print(aq_station_id,'at',t_hour,'is NAN')
                        dict_making[f_name].append(np.nan)
            except:
                print(aq_station_id,'at',t_hour,'is NAN')
                for f_name in ['temperature', 'pressure', 'humidity', 'wind_speed']:
                    dict_making[f_name].append(np.nan)
                
            dict_making['aq_station_id'].append(aq_station_id)
            dict_making['time'].append(t_hour)
#             break
#         break
            
    endtime = datetime.datetime.now()

    print('cost time:',(endtime - starttime).seconds)
    new_df = pd.DataFrame.from_dict(dict_making)
    return new_df


In [6]:
selected_features = ['station_id', 'temperature', 'pressure','humidity', 'wind_speed', 'time', 'datetime']
df_ow_stations_04 = pd.read_csv(FILE_FOLDER%'observedWeather_201804.csv')
# print(df_ow_stations_04)
df_ow_stations_04['datetime'] = pd.to_datetime(df_ow_stations_04['time'])
# df_ow_stations_04
df_grid_stations_04 = pd.read_csv(FILE_FOLDER%'gridWeather_201804.csv')
df_grid_stations_04['datetime'] = pd.to_datetime(df_grid_stations_04['time'])

df_ow_stations_04.columns = ['id', 'station_id', 'time', 'weather', 'temperature', 'pressure',
       'humidity', 'wind_speed', 'wind_direction', 'datetime']
df_ow_stations_04.drop('id',axis=1,inplace=True)
print(df_ow_stations_04.columns)

df_grid_stations_04.columns = ['id', 'station_id', 'time', 'weather', 'temperature', 'pressure',
       'humidity', 'wind_direction', 'wind_speed', 'datetime']
df_grid_stations_04.drop('id',axis=1,inplace=True)
print(df_grid_stations_04.columns)

df_ow_stations_04 = df_ow_stations_04[selected_features]
print(df_ow_stations_04.columns)

df_grid_stations_04 = df_grid_stations_04[selected_features]
print(df_grid_stations_04.columns)

Index(['station_id', 'time', 'weather', 'temperature', 'pressure', 'humidity',
       'wind_speed', 'wind_direction', 'datetime'],
      dtype='object')
Index(['station_id', 'time', 'weather', 'temperature', 'pressure', 'humidity',
       'wind_direction', 'wind_speed', 'datetime'],
      dtype='object')
Index(['station_id', 'temperature', 'pressure', 'humidity', 'wind_speed',
       'time', 'datetime'],
      dtype='object')
Index(['station_id', 'temperature', 'pressure', 'humidity', 'wind_speed',
       'time', 'datetime'],
      dtype='object')


In [21]:
# start_date = '2017-03-01 00:00:00'
df_grid_stations_1701_1803 = pd.read_csv(FILE_FOLDER%'gridWeather_201701-201803.csv')
df_ow_stations_1701_1801 = pd.read_csv(FILE_FOLDER%'observedWeather_201701-201801.csv')
# start_date_18 = '2018-03-01 00:00:00'
selected_features = ['station_id', 'temperature', 'pressure','humidity', 'wind_speed', 'time', 'datetime']
df_grid_stations_1701_1803['datetime'] = pd.to_datetime(df_grid_stations_1701_1803['utc_time'])
# df_grid_0103 = df_grid_stations_1701_1803[df_grid_stations_1701_1803['datetime'] >= start_date]
# df_grid_0103.columns = ['station_id', 'longitude', 'latitude', 'time', 'temperature',
#        'pressure', 'humidity', 'wind_direction', 'wind_speed', 'datetime']
# print(df_grid_0103.columns)

# df_grid_0103 = df_grid_0103[selected_features]
# print(df_grid_0103.columns)

In [29]:
start_date = '2017-01-01 00:00:00'
end_date = '2018-01-01 00:00:00'
df_grid_2017_0212 = df_grid_stations_1701_1803[
    (df_grid_stations_1701_1803['datetime'] >= start_date) &
    (df_grid_stations_1701_1803['datetime'] < end_date) 
]
df_grid_2017_0212.columns = ['station_id', 'longitude', 'latitude', 'time', 'temperature',
       'pressure', 'humidity', 'wind_direction', 'wind_speed', 'datetime']


In [30]:
df_grid_2017_0212_neeeded  = df_grid_2017_0212[
    df_grid_2017_0212['station_id'].isin(needed_grid_list)
]
len(df_grid_2017_0212_neeeded)
# df_new_0103

508080

In [31]:
# df_grid_0103
df_new_2017 = generate_features_for_aqStations(aq_nearest_dict,df_grid_2017_0212_neeeded,
                                          '2017-01-01 00:00:00','2017-12-31 23:00:00')

100%|██████████| 35/35 [07:20<00:00, 12.63s/it]


cost time: 440


In [35]:
df_new_2017.head()

Unnamed: 0,aq_station_id,temperature,humidity,pressure,wind_speed,time
0,dongsi_aq,-5.943333,70.966667,1021.273333,4.553333,2017-01-01 00:00:00
1,dongsi_aq,-3.466667,60.483333,1021.153333,4.066667,2017-01-01 01:00:00
2,dongsi_aq,-0.993333,50.006667,1021.036667,3.67,2017-01-01 02:00:00
3,dongsi_aq,1.483333,39.526667,1020.913333,3.39,2017-01-01 03:00:00
4,dongsi_aq,2.64,36.936667,1020.22,3.996667,2017-01-01 04:00:00


In [26]:
df_aq_stations = pd.read_csv(AFTER_FILE_FOLDER % 'grid_stations.csv')
aq_nearest_dict, df_aq_stations = get_nearest_stations(df_aq_stations,10)
aq_nearest_dict
# grid_set = set()
grid_n_list = []
for k,v in aq_nearest_dict.items():
    cc = list(map(lambda x:x[1],v))
    grid_n_list += cc
needed_grid_list = list(set(grid_n_list))

In [19]:
# pprint(needed_grid_list)

In [27]:
test_grid_weather = pd.read_csv(FILE_FOLDER % 'gridWeather_20180501-20180502.csv')
test_grid_weather = test_grid_weather[['station_id', 'time', 'temperature', 'pressure','humidity', 'wind_speed']]

In [29]:
df_grid_test_neeeded = test_grid_weather[test_grid_weather['station_id'].isin(needed_grid_list)]
# df_grid_test_neeeded 
df_test = generate_features_for_aqStations(aq_nearest_dict,df_grid_test_neeeded,
                                          '2018-05-01 00:00:00','2018-05-02 23:00:00')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
100%|██████████| 35/35 [08:01<00:00, 14.42s/it]

cost time: 481





In [32]:
df_test.to_csv(AFTER_FILE_FOLDER % 'test_weather.csv',index=False)