In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from lib.io import load_pickle_data
from lib.noglobal import noglobal
from lib.kalman_filter import generate_kalmanfilter,apply_kalmanfilter


from external_lib.visualize import visualize_trafic
from external_lib.evaluation_function import calc_haversine
from external_lib.outlier_correlation import outlier_correlation


In [2]:
# Input locations for the self-made train/test frames (pickled DataFrames).
train_df_path = "/work/data/input/selfmade_dataset/baseline_with_derived_data_v4/train.pkl"
test_df_path = "/work/data/input/selfmade_dataset/baseline_with_derived_data_v4/test.pkl"

# Load each frame and rename the timestamp column to the lower-camel spelling
# ("millisSinceGpsEpoch") that the functions below look up.
train_df = load_pickle_data(train_df_path).rename(
    columns={"MillisSinceGpsEpoch": "millisSinceGpsEpoch"}
)
test_df = load_pickle_data(test_df_path).rename(
    columns={"MillisSinceGpsEpoch": "millisSinceGpsEpoch"}
)

# Previously written submission file, loaded as-is.
best_submission = "/work/data/submission_list/baseline_with_merge_poitns_with_kalman/to_csv.csv"
best_sub = pd.read_csv(best_submission)



## Reject outliers

In [3]:
@noglobal()
def add_distance_diff(arg_df):
    """
    Return a copy of ``arg_df`` with neighbouring-row coordinates and distances.

    For each of ``latDeg``, ``lngDeg`` and ``phone`` a ``<name>_prev`` and
    ``<name>_next`` column is added, holding the value of the row directly
    above / below. ``dist_prev`` and ``dist_next`` are ``calc_haversine`` of a
    row's coordinates against those neighbouring coordinates.

    Wherever the neighbouring row has a different ``phone`` (this includes the
    first and last row, whose neighbour is missing), the coordinate and
    distance columns for that side are set to NaN.

    Rows are compared in their existing order; the input frame is not modified.
    """
    out = arg_df.copy()

    # Neighbour columns, created in the order prev/next per source column.
    for name in ('latDeg', 'lngDeg', 'phone'):
        out[name + '_prev'] = out[name].shift(1)
        out[name + '_next'] = out[name].shift(-1)

    # Distances are computed for every row first; cross-phone pairs are blanked below.
    for side in ('prev', 'next'):
        out['dist_' + side] = calc_haversine(
            out['latDeg'], out['lngDeg'], out['latDeg_' + side], out['lngDeg_' + side]
        )

    for side in ('prev', 'next'):
        crosses_phone = out['phone'] != out['phone_' + side]
        out.loc[crosses_phone, ['latDeg_' + side, 'lngDeg_' + side, 'dist_' + side]] = np.nan

    return out



## Phone mean prediction

In [4]:
@noglobal()
def make_lerp_data(df_arg):
    """
    Build linearly interpolated lat/lng rows for timestamps a phone is missing.

    Every (collectionName, millisSinceGpsEpoch) pair present in ``df_arg`` is
    combined with every phoneName of the same collection. Combinations whose
    ``latDeg`` is missing are filled by linear interpolation in time between
    the rows immediately before and after them (ordered by phone, then time),
    as long as both of those rows belong to the same phone. Rows whose
    interpolated ``latDeg`` is still missing are discarded.

    Returns only the interpolated rows, restricted to the columns of
    ``df_arg``. The input frame is not modified.
    """
    src = df_arg.copy()
    wanted_columns = src.columns

    # Full time x phone grid per collection, with the known rows joined onto it.
    times = src[["collectionName", "millisSinceGpsEpoch"]].drop_duplicates()
    phones = src[["collectionName", "phoneName"]].drop_duplicates()
    grid = times.merge(phones, on="collectionName", how="outer")
    grid = grid.merge(src, on=["collectionName", "millisSinceGpsEpoch", "phoneName"], how="left")

    grid["phone"] = grid["collectionName"] + "_" + grid["phoneName"]
    grid = grid.sort_values(["phone", "millisSinceGpsEpoch"])

    # Values of the row directly before / after, in sorted order.
    neighbour_sources = (
        ("latDeg", "latDeg"),
        ("lngDeg", "lngDeg"),
        ("phone", "phone"),
        ("time", "millisSinceGpsEpoch"),
    )
    for stem, source in neighbour_sources:
        grid[stem + "_prev"] = grid[source].shift(1)
        grid[stem + "_next"] = grid[source].shift(-1)

    # Keep only the gaps that sit between two rows of the same phone.
    is_gap = grid["latDeg"].isnull()
    inside_phone = (grid["phone"] == grid["phone_prev"]) & (grid["phone"] == grid["phone_next"])
    grid = grid[is_gap & inside_phone].copy()

    # Fraction of the way from the previous timestamp to the next one.
    frac = (grid["millisSinceGpsEpoch"] - grid["time_prev"]) / (grid["time_next"] - grid["time_prev"])
    grid["latDeg"] = grid["latDeg_prev"] + ((grid["latDeg_next"] - grid["latDeg_prev"]) * frac)
    grid["lngDeg"] = grid["lngDeg_prev"] + ((grid["lngDeg_next"] - grid["lngDeg_prev"]) * frac)

    # A missing neighbour coordinate leaves NaN here; drop those rows.
    grid = grid[grid["latDeg"].notnull()]

    return grid[wanted_columns]
    
@noglobal()
def calc_mean_pred(df, lerp_df):
    '''
    Predict each row's position as the mean over phones in the same collection.

    ``df`` and ``lerp_df`` are stacked, ``latDeg`` / ``lngDeg`` are averaged per
    (collectionName, millisSinceGpsEpoch), and the averages are joined back onto
    the rows of ``df``.

    Returns a frame with one row per row of ``df`` and the columns
    collectionName, phoneName, millisSinceGpsEpoch, latDeg, lngDeg.
    '''
    keys = ['collectionName', 'millisSinceGpsEpoch']
    coords = ['latDeg', 'lngDeg']

    pooled = pd.concat([df, lerp_df])
    per_epoch_mean = pooled.groupby(keys)[coords].mean().reset_index()

    # Only the identifying columns of df are kept; coordinates come from the mean.
    skeleton = df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].copy()
    return skeleton.merge(per_epoch_mean[keys + coords], on=keys, how='left')



In [5]:
target_dataset = train_df
target_columns_all = target_dataset.columns

# Outlier handling is delegated to external_lib.outlier_correlation.
train_ro = outlier_correlation(train_df)

# Run the Kalman filter on each phone's rows separately, with a fresh filter per phone.
pd_list = []
for key, each_df in tqdm(train_ro.groupby("phone")):
    # Frames yielded by groupby are slices of the grouped data; take an explicit
    # copy before assigning columns so the write is unambiguous and does not
    # raise SettingWithCopyWarning.
    each_df = each_df.copy()
    kf_ = generate_kalmanfilter()
    num = each_df[["latDeg", "lngDeg"]].to_numpy()
    each_df[["latDeg", "lngDeg"]] = apply_kalmanfilter(num, kf_)
    pd_list.append(each_df)

# Stitch the per-phone frames back together and restore the original index order.
train_ro = pd.concat(pd_list, axis=0).sort_index()

# "phone" is "<collectionName>_<phoneName>"; take the first two "_"-separated parts.
phone_parts = train_ro["phone"].str.split("_")
train_ro["collectionName"] = phone_parts.str[0]
train_ro["phoneName"] = phone_parts.str[1]

100%|██████████| 73/73 [00:01<00:00, 51.21it/s]


  0%|          | 0/73 [00:00<?, ?it/s]

In [6]:
from external_lib.phone_mean_prediction import phone_mean_prediction

train_tmp = train_df

# Phone-mean prediction on the raw train frame (implementation lives in external_lib).
train_mean_pread = phone_mean_prediction(train_tmp)

# Build the phone key as "<collectionName>_<phoneName>". The "_" separator is
# required: make_lerp_data builds the key the same way and the Kalman cell splits
# "phone" on "_", so a key without it would not match the rest of the notebook.
train_mean_pread["phone"] = train_mean_pread["collectionName"] + "_" + train_mean_pread["phoneName"]

In [8]:
from external_lib.evaluation_function import evaluate_function


# Score the raw positions first, then the phone-mean predictions, against the
# ground truth. "err" on train_df is overwritten on each pass, so after this
# cell it holds the error of the phone-mean predictions.
# Printed labels: "適用前" = before applying, "適用後" = after applying.
# NOTE(review): the second pass relies on train_mean_pread and train_df sharing
# the same index / row order — confirm.
for label, pred in (("適用前", train_df), ("適用後", train_mean_pread)):
    train_df["err"] = calc_haversine(
        pred["latDeg"], pred["lngDeg"], train_df["latDeg_gt"], train_df["lngDeg_gt"]
    )
    print(label, evaluate_function(train_df, "err"))


# Cell output: number of missing predicted coordinates per column.
train_mean_pread[["latDeg", "lngDeg"]].isnull().sum()

適用前 5.287859611805194
適用後 4.771118181425187


latDeg    0
lngDeg    0
dtype: int64

sample_sub = pd.read_csv("/work/data/input/google-smartphone-decimeter-challenge/sample_submission.csv")

# The competition's sample submission lists test-set rows, so it has to be filled
# from predictions on test_df. Index-aligning the *train* predictions into it
# (as was done before) silently writes positions that belong to unrelated rows.
test_mean_pred = phone_mean_prediction(test_df)

# NOTE(review): assumes phone_mean_prediction(test_df) returns one row per
# submission row, in the same order and with the same index — confirm.
assert len(test_mean_pred) == len(sample_sub), (
    f"prediction rows ({len(test_mean_pred)}) != submission rows ({len(sample_sub)})"
)

sample_sub["latDeg"] = test_mean_pred["latDeg"]
sample_sub["lngDeg"] = test_mean_pred["lngDeg"]

# Index misalignment would show up as NaN; fail loudly instead of writing it out.
assert not sample_sub[["latDeg", "lngDeg"]].isnull().any().any(), "submission contains missing coordinates"

sample_sub.to_csv("./test.csv", index=False)