In [1]:
import math
import numpy as np
import pandas as pd
from math import sin, cos, atan2, sqrt
from pyproj import Proj, transform

from cv2 import Rodrigues
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score
import lightgbm as lgb

from tqdm import tqdm
import pyproj

from lib.noglobal import noglobal




from external_lib.Predict_with_IMU import execute
from external_lib.Predict_with_IMU  import generate_train_data,generate_test_data,training,ECEF_to_WGS84,WGS84_to_ECEF,prepare_imu_data

from lib.io import load_pickle_data
from external_lib.evaluation_function import calc_haversine

## データ取得

In [2]:
# Root directory of the competition data; every input path below is derived from it.
dir_ = "/work/data/input/google-smartphone-decimeter-challenge"

# Baseline location CSVs for the train and test splits.
bl_trn_fname = f"{dir_}/baseline_locations_train.csv"
bl_tst_fname = f"{dir_}/baseline_locations_test.csv"

# Sample submission file.
sample_fname = f"{dir_}/sample_submission.csv"

In [3]:
# Read the baseline location tables for the train and test splits.
bl_trn_df, bl_tst_df = (
    pd.read_csv(csv_path) for csv_path in (bl_trn_fname, bl_tst_fname)
)


## Modeling

### LightGBM

In [4]:
# Collections whose phones are used for training.
train_collection_names = ['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2']
in_train_collections = bl_trn_df['collectionName'].str.contains("|".join(train_collection_names))

# `tgt_cns` ends up holding the unique `phone` values of those collections
# (a later cell reads it again to pick the rows to visualize).
tgt_cns = bl_trn_df[in_train_collections]["phone"].unique()

# Single held-out target, identified as "<collection>_<phone model>".
cname_test = '2021-04-29-US-SJC-3'
pname_test = 'SamsungS20Ultra'
test_target_list = ["_".join([cname_test, pname_test])]

train_collection_list = tgt_cns
test_collection_list = test_target_list

# Parameters handed to `training`.
# NOTE(review): in LightGBM 'subsample' is an alias of 'bagging_fraction'; presumably
# `training` forwards this dict to LightGBM unchanged -- confirm and keep only one of them.
params = {
    'metric':'mse',
    'objective':'regression',
    'seed':2021,
    'boosting_type':'gbdt',
    'early_stopping_rounds':10,
    'subsample':0.7,
    'feature_fraction':0.7,
    'bagging_fraction': 0.7,
    'reg_lambda': 10,
}

window_size = 30
verbose_flag = True
folds = 5

# Build the training dataset (slow: the recorded run took about 8 minutes).
df_train, lat_lng_df_train = generate_train_data(
    bl_trn_df,
    train_collection_list,
    verbose=True,
)

# Build the test dataset for the held-out target.
df_test, lat_lng_df_test = generate_test_data(bl_tst_df, test_collection_list)

100%|██████████| 6/6 [08:02<00:00, 80.49s/it] 


Final Dataset shape： (14333, 846)


In [5]:
# Show how many rows/columns of the training frame belong to each phone.
for phone_name, phone_rows in df_train.groupby("phone"):
    print(phone_name, phone_rows.shape)

2021-04-22-US-SJC-1_Pixel4 (2860, 846)
2021-04-22-US-SJC-1_SamsungS20Ultra (2796, 846)
2021-04-28-US-SJC-1_Pixel4 (1984, 846)
2021-04-28-US-SJC-1_SamsungS20Ultra (2053, 846)
2021-04-29-US-SJC-2_Pixel4 (2300, 846)
2021-04-29-US-SJC-2_SamsungS20Ultra (2340, 846)


In [6]:

# Train one model per ECEF axis, in X, Y, Z order. Each `training` call is unpacked
# as a (validation predictions, test predictions) pair.
axis_predictions = {
    axis: training(df_train, df_test, axis, params, window_size, folds)
    for axis in ('X', 'Y', 'Z')
}
pred_valid_x, pred_test_x = axis_predictions['X']
pred_valid_y, pred_test_y = axis_predictions['Y']
pred_valid_z, pred_test_z = axis_predictions['Z']

# Convert the predicted ECEF coordinates; the result is unpacked as
# (longitude, latitude, <discarded third value>).
lng_pred, lat_pred, _ = ECEF_to_WGS84(pred_valid_x, pred_valid_y, pred_valid_z)
lng_test_pred, lat_test_pred, _ = ECEF_to_WGS84(pred_test_x, pred_test_y, pred_test_z)

Each Fold's MSE：[89.46, 122.55, 187.69, 100.07, 94.83], Average MSE：118.9215
------------------------------------------------------------
Each Fold's MSE：[195.58, 201.21, 374.93, 180.54, 146.74], Average MSE：219.7992
------------------------------------------------------------
Each Fold's MSE：[149.84, 160.64, 424.72, 157.81, 123.47], Average MSE：203.2980
------------------------------------------------------------


In [7]:
# Side-by-side table of ground-truth and predicted ECEF coordinates per training row.
val_columns = {
    "phone": df_train["phone"].values,
    'Xgt': df_train['Xgt'].values,
    'Xpred': pred_valid_x,
    'Ygt': df_train['Ygt'].values,
    'Ypred': pred_valid_y,
    'Zgt': df_train['Zgt'].values,
    'Zpred': pred_valid_z,
}
val_compare_df = pd.DataFrame(val_columns)

# Reference error: distance between the `_gt` and `_bl` latitude/longitude columns.
lat_lng_df_train['dist'] = calc_haversine(
    lat_lng_df_train.latDeg_gt,
    lat_lng_df_train.lngDeg_gt,
    lat_lng_df_train.latDeg_bl,
    lat_lng_df_train.lngDeg_bl,
)

baseline_dist = lat_lng_df_train['dist']
baseline_dist_50 = np.percentile(baseline_dist, 50)
baseline_dist_95 = np.percentile(baseline_dist, 95)

print('dist_50:', baseline_dist_50)
print('dist_95:', baseline_dist_95)
print('avg_dist_50_95:', (baseline_dist_50 + baseline_dist_95)/2)
print('avg_dist:', baseline_dist.mean())

dist_50: 6.129357526334204
dist_95: 33.83701581185293
avg_dist_50_95: 19.983186669093566
avg_dist: 10.508449405316293


In [8]:
# Convert ground-truth and predicted ECEF coordinates of the validation table; each
# result is unpacked as (longitude, latitude, <discarded third value>).
gt_xyz = [val_compare_df[column].values for column in ('Xgt', 'Ygt', 'Zgt')]
pred_xyz = [val_compare_df[column].values for column in ('Xpred', 'Ypred', 'Zpred')]

lng_gt, lat_gt, _ = ECEF_to_WGS84(*gt_xyz)
lng_pred, lat_pred, _ = ECEF_to_WGS84(*pred_xyz)

# Same conversion for the test predictions.
lng_test_pred, lat_test_pred, _ = ECEF_to_WGS84(pred_test_x, pred_test_y, pred_test_z)


In [9]:
# Attach the converted latitude/longitude values to the validation table.
for column_name, column_values in (
    ("latDeg_gt", lat_gt),
    ("lngDeg_gt", lng_gt),
    ("latDeg_pred", lat_pred),
    ("lngDeg_pred", lng_pred),
):
    val_compare_df[column_name] = column_values

# Distance between every prediction and its ground truth.
val_compare_df['dist'] = calc_haversine(
    val_compare_df["latDeg_pred"],
    val_compare_df["lngDeg_pred"],
    val_compare_df["latDeg_gt"],
    val_compare_df["lngDeg_gt"],
)

pred_dist = val_compare_df['dist']
pred_dist_50 = np.percentile(pred_dist, 50)
pred_dist_95 = np.percentile(pred_dist, 95)

print('dist_50:', pred_dist_50)
print('dist_95:', pred_dist_95)
print('avg_dist_50_95:', (pred_dist_50 + pred_dist_95)/2)
print('avg_dist:', pred_dist.mean())

# Row count of the validation table, shown as the cell result.
val_compare_df.shape[0]



dist_50: 6.871014510138368
dist_95: 19.517525375823915
avg_dist_50_95: 13.19426994298114
avg_dist: 8.394499894226666


14333

dist_50: 6.871014510138368
dist_95: 19.517525375823915
avg_dist_50_95: 13.19426994298114
avg_dist: 8.394499894226666

In [10]:
# Per-phone mean validation distance, followed by the mean over all rows pooled together.
# `s` collects every row's distance in group order so the pooled mean can be printed last.
s = []

for phone_name, phone_df in val_compare_df.groupby("phone"):
    print(phone_name, phone_df["dist"].mean())
    # Extend in place instead of rebuilding the list on every iteration.
    s.extend(phone_df["dist"].to_numpy().tolist())
    display(phone_df[["latDeg_pred","lngDeg_pred","latDeg_gt","lngDeg_gt"]])

# Per-phone values noted in the original cell. NOTE(review): most of them differ from
# the output recorded below the cell, so they are presumably from an earlier run.
#2021-04-22-US-SJC-1_Pixel4 8.60813030306382
#2021-04-22-US-SJC-1_SamsungS20Ultra 8.627481809453734
#2021-04-28-US-SJC-1_Pixel4 7.034029055379216
#2021-04-28-US-SJC-1_SamsungS20Ultra 7.185483071296991
#2021-04-29-US-SJC-2_Pixel4 7.239235975952164
#2021-04-29-US-SJC-2_SamsungS20Ultra 7.268193760581256

print(np.mean(s))

2021-04-22-US-SJC-1_Pixel4 8.60813030306382


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
0,37.334554,-121.899431,37.334582,-121.899445
1,37.334569,-121.899533,37.334586,-121.899438
2,37.334609,-121.899443,37.334603,-121.899418
3,37.334597,-121.899417,37.334636,-121.899380
4,37.334635,-121.899402,37.334679,-121.899320
...,...,...,...,...
2855,37.334618,-121.899393,37.334599,-121.899427
2856,37.334527,-121.899385,37.334599,-121.899427
2857,37.334434,-121.899282,37.334599,-121.899427
2858,37.334538,-121.899463,37.334599,-121.899427


2021-04-22-US-SJC-1_SamsungS20Ultra 7.6159091115129804


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
2860,37.334544,-121.899541,37.334595,-121.899426
2861,37.334584,-121.899437,37.334622,-121.899397
2862,37.334605,-121.899434,37.334661,-121.899347
2863,37.334622,-121.899390,37.334707,-121.899273
2864,37.334701,-121.899335,37.334757,-121.899181
...,...,...,...,...
5651,37.334593,-121.899470,37.334600,-121.899424
5652,37.334590,-121.899538,37.334600,-121.899424
5653,37.334602,-121.899484,37.334600,-121.899424
5654,37.334609,-121.899486,37.334600,-121.899424


2021-04-28-US-SJC-1_Pixel4 9.04347569279449


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
5656,37.334168,-121.900202,37.334153,-121.900260
5657,37.334170,-121.900190,37.334153,-121.900260
5658,37.334165,-121.900196,37.334153,-121.900260
5659,37.334178,-121.900216,37.334153,-121.900260
5660,37.334175,-121.900200,37.334153,-121.900260
...,...,...,...,...
7635,37.334195,-121.900160,37.334171,-121.900233
7636,37.334226,-121.900201,37.334171,-121.900233
7637,37.334208,-121.900236,37.334171,-121.900233
7638,37.334230,-121.900167,37.334171,-121.900233


2021-04-28-US-SJC-1_SamsungS20Ultra 8.542695990783283


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
7640,37.334187,-121.900333,37.334152,-121.900258
7641,37.334164,-121.900268,37.334152,-121.900258
7642,37.334163,-121.900318,37.334152,-121.900258
7643,37.334156,-121.900268,37.334152,-121.900258
7644,37.334149,-121.900272,37.334152,-121.900258
...,...,...,...,...
9688,37.334179,-121.900216,37.334169,-121.900231
9689,37.334171,-121.900214,37.334169,-121.900231
9690,37.334173,-121.900197,37.334169,-121.900231
9691,37.334142,-121.900227,37.334169,-121.900231


2021-04-29-US-SJC-2_Pixel4 8.992415527077497


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
9693,37.334576,-121.899468,37.334481,-121.899594
9694,37.334520,-121.899476,37.334489,-121.899579
9695,37.334487,-121.899555,37.334505,-121.899554
9696,37.334490,-121.899441,37.334528,-121.899517
9697,37.334493,-121.899440,37.334559,-121.899468
...,...,...,...,...
11988,37.334492,-121.899616,37.334473,-121.899612
11989,37.334502,-121.899682,37.334473,-121.899612
11990,37.334504,-121.899616,37.334473,-121.899612
11991,37.334513,-121.899670,37.334473,-121.899612


2021-04-29-US-SJC-2_SamsungS20Ultra 7.795754737409283


Unnamed: 0,latDeg_pred,lngDeg_pred,latDeg_gt,lngDeg_gt
11993,37.334523,-121.899492,37.334547,-121.899493
11994,37.334571,-121.899435,37.334581,-121.899436
11995,37.334579,-121.899502,37.334622,-121.899367
11996,37.334611,-121.899369,37.334665,-121.899292
11997,37.334716,-121.899185,37.334708,-121.899216
...,...,...,...,...
14328,37.334490,-121.899604,37.334475,-121.899613
14329,37.334478,-121.899598,37.334475,-121.899613
14330,37.334490,-121.899566,37.334475,-121.899613
14331,37.334514,-121.899569,37.334475,-121.899613


8.394499894226696


In [28]:
def _write_back_window_rows(baseline_df, window_df, columns, skip_rows):
    """Copy `columns` from `window_df` into `baseline_df`, one phone at a time.

    For every `phone` group of `window_df`, the first `skip_rows` rows of that phone in
    `baseline_df` are left untouched and the remaining rows are overwritten, in order,
    with the group's values. Values are written positionally (as a plain array), so
    the index of `window_df` plays no role. `baseline_df` is modified in place.

    Raises:
        ValueError: if a phone's remaining baseline rows and its `window_df` rows
            differ in number.
    """
    for phone_name, phone_df in window_df.groupby("phone"):
        is_phone = (baseline_df["phone"] == phone_name).to_numpy()
        target_rows = baseline_df.index[is_phone][skip_rows:]
        if len(target_rows) != len(phone_df):
            raise ValueError(
                f"{phone_name}: {len(target_rows)} baseline rows to overwrite "
                f"but {len(phone_df)} predicted rows"
            )
        # Label-based write: correct for any unique index, not only a default RangeIndex.
        baseline_df.loc[target_rows, columns] = phone_df[columns].to_numpy()


pred_columns = ["lngDeg_pred", "latDeg_pred"]
gt_columns = ["lngDeg_gt", "latDeg_gt"]

# Copy validation predictions and ground truth onto the training frame by position
# (the two frames were built row-for-row from each other).
for column in pred_columns + gt_columns:
    df_train[column] = val_compare_df[column].to_numpy()

df_test["lngDeg_pred"] = lng_test_pred
df_test["latDeg_pred"] = lat_test_pred

# Start from the baseline coordinates; rows that have a model prediction are overwritten below.
for baseline_df in (bl_trn_df, bl_tst_df):
    baseline_df["latDeg_pred"] = baseline_df["latDeg"]
    baseline_df["lngDeg_pred"] = baseline_df["lngDeg"]

# Sentinel for rows without ground truth; float so that writing float coordinates
# into these columns does not change their dtype.
bl_trn_df["latDeg_gt"] = -1.0
bl_trn_df["lngDeg_gt"] = -1.0

# The first `window_size` rows of each phone keep their baseline values.
# NOTE(review): presumably the generated datasets have no row for those first
# `window_size` epochs -- confirm against generate_train_data / generate_test_data.
_write_back_window_rows(bl_trn_df, df_train, pred_columns, window_size)
_write_back_window_rows(bl_trn_df, df_train, gt_columns, window_size)
_write_back_window_rows(bl_tst_df, df_test, pred_columns, window_size)

In [25]:
from external_lib.visualize import visualize_trafic

# Plot the baseline track ("before") together with the corrected track ("after")
# for the first training phone only.
# `tgt_cns` holds exact `phone` values taken from bl_trn_df, so an exact membership
# test selects the rows without interpreting the names as a regular expression.
target_rows = bl_trn_df[bl_trn_df["phone"].isin(tgt_cns)]

for key, arg_each_df in target_rows.groupby("phone"):
    # `.assign` returns new frames, so the groupby slice is never mutated
    # (the original in-place label assignment triggered SettingWithCopyWarning).
    before_df = arg_each_df.assign(label="before")
    after_df = arg_each_df.assign(
        latDeg=arg_each_df["latDeg_pred"],
        lngDeg=arg_each_df["lngDeg_pred"],
        label="after",
    )

    display(visualize_trafic(pd.concat([before_df, after_df]), "label"))
    break  # only the first phone is visualized


In [13]:
# Build the submission: take the sample submission and replace its coordinates with
# the corrected test-set coordinates. `sample_fname` is the path defined with the
# other input paths near the top of the notebook.
sub = pd.read_csv(sample_fname)

# The coordinates are copied row by row, so both tables must have the same length;
# fail loudly instead of silently writing NaN for unmatched rows.
if len(sub) != len(bl_tst_df):
    raise ValueError(
        f"sample submission has {len(sub)} rows but the test baseline has {len(bl_tst_df)}"
    )

sub["latDeg"] = bl_tst_df["latDeg_pred"].to_numpy()
sub["lngDeg"] = bl_tst_df["lngDeg_pred"].to_numpy()

sub.to_csv("./test.csv", index=False)