In [13]:
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d


from lib.io import load_pickle_data
from lib.noglobal import noglobal

In [44]:
@noglobal()
def apply_gauss_smoothing(df,params):
    
    SZ_1 = params["sz_1"]
    SZ_2 = params["sz_2"]
    SZ_CRIT = params['sz_crit']    
    
    
    pd_list = [];
    for key,each_df in df.groupby("phone"):
        tmp_df = each_df.copy();
        
        data = each_df[["latDeg","lngDeg"]].to_numpy();
        
        lat_g1 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_1))
        lon_g1 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_1))
        lat_g2 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_2))
        lon_g2 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_2))
        
        lat_dif = data[1:,0] - data[:-1,0]
        lon_dif = data[1:,1] - data[:-1,1]
        
        lat_crit = np.append(np.abs(gaussian_filter1d(lat_dif, np.sqrt(SZ_CRIT)) / (1e-9 + gaussian_filter1d(np.abs(lat_dif), np.sqrt(SZ_CRIT)))),[0])
        lon_crit = np.append(np.abs(gaussian_filter1d(lon_dif, np.sqrt(SZ_CRIT)) / (1e-9 + gaussian_filter1d(np.abs(lon_dif), np.sqrt(SZ_CRIT)))),[0])           
            
        tmp_df["latDeg"] = lat_g1 * lat_crit + lat_g2 * (1.0 - lat_crit)
        tmp_df['lngDeg'] = lon_g1 * lon_crit + lon_g2 * (1.0 - lon_crit)    
        
        pd_list.append(tmp_df);
                       
    
    return pd.concat(pd_list).sort_index()
    
def mean_with_other_phones(df):
    collections_list = df[['collectionName']].drop_duplicates().to_numpy()

    for collection in collections_list:
        phone_list = df[df['collectionName'].to_list() == collection][['phoneName']].drop_duplicates().to_numpy()

        phone_data = {}
        corrections = {}
        for phone in phone_list:
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            phone_data[phone[0]] = df[cond][['millisSinceGpsEpoch', 'latDeg', 'lngDeg']].to_numpy()

        for current in phone_data:
            correction = np.ones(phone_data[current].shape, dtype=np.float)
            correction[:,1:] = phone_data[current][:,1:]
            
            # Telephones data don't complitely match by time, so - interpolate.
            for other in phone_data:
                if other == current:
                    continue

                loc = interp1d(phone_data[other][:,0], 
                               phone_data[other][:,1:], 
                               axis=0, 
                               kind='linear', 
                               copy=False, 
                               bounds_error=None, 
                               fill_value='extrapolate', 
                               assume_sorted=True)
                
                start_idx = 0
                stop_idx = 0
                for idx, val in enumerate(phone_data[current][:,0]):
                    if val < phone_data[other][0,0]:
                        start_idx = idx
                    if val < phone_data[other][-1,0]:
                        stop_idx = idx

                if stop_idx - start_idx > 0:
                    correction[start_idx:stop_idx,0] += 1
                    correction[start_idx:stop_idx,1:] += loc(phone_data[current][start_idx:stop_idx,0])                    

            correction[:,1] /= correction[:,0]
            correction[:,2] /= correction[:,0]
            
            corrections[current] = correction.copy()
        
        for phone in phone_list:
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            
            df.loc[cond, ['latDeg', 'lngDeg']] = corrections[phone[0]][:,1:]            
            
    return df

In [39]:
from lib.kalman_filter import generate_kalmanfilter,apply_kalmanfilter
from tqdm.notebook import tqdm

train_path =  "/work/data/input/selfmade_dataset/baseline_with_derived_data_v5/train.pkl"
train_df = load_pickle_data(train_path);


s_list = [];
for key,each_df in tqdm(train_df.groupby("phone")):
    
    tmp_df = each_df.copy()
    kf = generate_kalmanfilter()
    s = apply_kalmanfilter(tmp_df[["latDeg","lngDeg"]].to_numpy(),kf)
    tmp_df[["latDeg","lngDeg"]] = s
        
    s_list.append(tmp_df)

after_kalman = pd.concat(s_list).sort_index()

after_kalman_gauss = apply_gauss_smoothing(after_kalman,{'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})


  0%|          | 0/73 [00:00<?, ?it/s]

In [31]:
after_gauss = apply_gauss_smoothing(train_df,{'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})

In [34]:
s_list = [];
for key,each_df in tqdm(after_gauss.groupby("phone")):
    
    tmp_df = each_df.copy()
    kf = generate_kalmanfilter()
    s = apply_kalmanfilter(tmp_df[["latDeg","lngDeg"]].to_numpy(),kf)
    tmp_df[["latDeg","lngDeg"]] = s
        
    s_list.append(tmp_df)

afuter_gauss_kalman = pd.concat(s_list).sort_index()




  0%|          | 0/73 [00:00<?, ?it/s]

In [45]:
from external_lib.evaluation_function import evaluate_function,calc_haversine

train_df["dist"] = calc_haversine(train_df["latDeg_gt"],train_df["lngDeg_gt"],train_df["latDeg"],train_df["lngDeg"]);
print("default",evaluate_function(train_df,"dist"))
after_gauss["dist"] = calc_haversine(after_gauss["latDeg_gt"],after_gauss["lngDeg_gt"],after_gauss["latDeg"],after_gauss["lngDeg"]);
print("after_gauss",evaluate_function(after_gauss,"dist"))
after_kalman["dist"] = calc_haversine(after_kalman["latDeg_gt"],after_kalman["lngDeg_gt"],after_kalman["latDeg"],after_kalman["lngDeg"]);
print("after_kalman",evaluate_function(after_kalman,"dist"))


afuter_gauss_kalman["dist"] = calc_haversine(afuter_gauss_kalman["latDeg_gt"],afuter_gauss_kalman["lngDeg_gt"],afuter_gauss_kalman["latDeg"],afuter_gauss_kalman["lngDeg"]);
print("after_gauss_kalman",evaluate_function(afuter_gauss_kalman,"dist"))

#s["dist"] = calc_haversine(s["latDeg_gt"],s["lngDeg_gt"],s["latDeg"],s["lngDeg"]);
#print("after_kalman_gauss",evaluate_function(s,"dist"))


c = mean_with_other_phones(afuter_gauss_kalman.copy())
c["dist"] = calc_haversine(c["latDeg_gt"],c["lngDeg_gt"],c["latDeg"],c["lngDeg"]);
print("after_kalman_gauss",evaluate_function(c,"dist"))


default 5.287859611805194
after_gauss 4.651036920160282
after_kalman 4.603418543273516
after_gauss_kalman 4.618812787583824
after_kalman_gauss 3.855470261772816


In [47]:
c[["latDeg","lngDeg"]].isnull().sum()

latDeg    0
lngDeg    0
dtype: int64