# Reference

https://www.kaggle.com/wrrosa/gsdc-position-shift

# Import 

In [1]:
import sys
sys.path.append("/work/src/")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pathlib import Path
#!pip install pyproj
import pyproj

from pyproj import Proj, transform
import glob
from lib.noglobal import noglobal

# 関数定義

In [2]:
@noglobal(excepts=["np"])
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    All arguments are passed through numpy ufuncs, so scalars, arrays and
    pandas Series are accepted and the result has the broadcast shape.

    Args:
        lat1, lon1: latitude / longitude of the first point(s), in degrees.
        lat2, lon2: latitude / longitude of the second point(s), in degrees.

    Returns:
        The distance on a sphere of radius ``RADIUS`` (same unit as
        ``RADIUS``; presumably metres).
    """
    RADIUS = 6_367_000

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    return 2 * RADIUS * np.arcsin(hav**0.5)

@noglobal(excepts=["calc_haversine"])
def compute_dist(oof, gt_df=None):
    """Score predicted positions against ground truth, per phone.

    Args:
        oof: DataFrame of predictions with at least the columns ``phone``,
            ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``.
        gt_df: ground-truth DataFrame with ``collectionName``, ``phoneName``,
            ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``. When ``None``,
            every ``ground_truth.csv`` under the hard-coded training directory
            is read and concatenated. The passed frame is NOT modified.

    Returns:
        A tuple ``(score, per_phone)``:
        * ``score`` - a one-element Series (index ``dst``) holding the average
          of the mean per-phone 50th percentile and the mean per-phone 95th
          percentile of the haversine error.
        * ``per_phone`` - DataFrame with columns ``phone``, ``q50``, ``q95``.
    """
    if gt_df is None:
        files = glob.glob("/work/data/input/google-smartphone-decimeter-challenge/train/*/*/ground_truth.csv")
        gt_df = pd.concat([pd.read_csv(path) for path in files], axis=0)

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a "phone" column as a side effect of scoring.
    gt_df = gt_df.assign(phone=gt_df["collectionName"] + "_" + gt_df["phoneName"])

    # Overlapping columns get the _x (prediction) / _y (ground truth) suffixes.
    df = oof.merge(gt_df, on=['phone', 'millisSinceGpsEpoch'])

    dst_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)

    scores = pd.DataFrame({'phone': df.phone, 'dst': dst_oof})
    scores_grp = scores.groupby('phone')

    # Compute each quantile once and reuse it for both return values.
    q50 = scores_grp.quantile(.50)
    q95 = scores_grp.quantile(.95)

    d50 = q50.reset_index()
    d50.columns = ['phone', 'q50']
    d95 = q95.reset_index()
    d95.columns = ['phone', 'q95']

    return (q50.mean() + q95.mean()) / 2, d50.merge(d95)

def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic WGS84 coordinates to Earth-centred, Earth-fixed x/y/z.

    Only numpy ufuncs and arithmetic are used, so scalars, arrays and pandas
    Series are all accepted.

    Args:
        lat: geodetic latitude in degrees.
        lon: longitude in degrees.
        alt: height above the ellipsoid, in the same unit as the semi-major
            axis constant below.

    Returns:
        Tuple ``(x, y, z)`` of ECEF coordinates.
    """
    SEMI_MAJOR_AXIS = 6378137.0
    INVERSE_FLATTENING = 298.257223563

    deg_to_rad = np.pi / 180.0
    rad_lat = lat * deg_to_rad
    rad_lon = lon * deg_to_rad

    flattening = 1 / INVERSE_FLATTENING
    # Squared first eccentricity of the ellipsoid.
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)

    sin_lat = np.sin(rad_lat)
    cos_lat = np.cos(rad_lat)

    # Radius of curvature in the prime vertical.
    prime_vertical = SEMI_MAJOR_AXIS / np.sqrt(1 - ecc_sq * sin_lat * sin_lat)

    # Distance from the rotation axis, shared by the x and y components.
    axis_distance = (prime_vertical + alt) * cos_lat

    x = axis_distance * np.cos(rad_lon)
    y = axis_distance * np.sin(rad_lon)
    z = (prime_vertical * (1 - ecc_sq) + alt) * sin_lat
    return x, y, z


# Shared ECEF (geocentric) -> geodetic lon/lat transformer on the WGS84 datum,
# built once at import time and reused by ECEF_to_WGS84.
# The PROJ parameter for the ellipsoid is spelled "ellps"; the previous
# "ellips" key is not a PROJ parameter name.
transformer = pyproj.Transformer.from_crs(
    {"proj": "geocent", "ellps": "WGS84", "datum": "WGS84"},
    {"proj": "latlong", "ellps": "WGS84", "datum": "WGS84"},
)

def ECEF_to_WGS84(x,y,z):
    """Convert ECEF coordinates back to geodetic ones.

    Delegates to the module-level ``transformer`` with ``radians=False``, so
    angular outputs are in degrees. Callers in this notebook unpack the result
    as ``(lng, lat, alt)``.
    """
    geodetic = transformer.transform(x, y, z, radians=False)
    return geodetic

# data loader

In [3]:
# Root of the competition data and its train / test sub-directories.
datadir = "/work/data/input/google-smartphone-decimeter-challenge"
train_dir = f"{datadir}/train"
test_dir = f"{datadir}/test"

# Name of the timestamp column (used as a sort key further down).
msge = 'millisSinceGpsEpoch'

# Submission template; its columns define what is written out at the end.
sample_sub = pd.read_csv(f"{datadir}/sample_submission.csv")
sub_columns = sample_sub.columns

# Baseline position estimates for the train and test splits.
baseline_train = pd.read_csv(f"{datadir}/baseline_locations_train.csv")
baseline_test = pd.read_csv(f"{datadir}/baseline_locations_test.csv")

# Every per-collection, per-phone ground-truth file stacked into one frame.
FILES = glob.glob(f"{train_dir}/*/*/ground_truth.csv")
gt_list = [pd.read_csv(path) for path in FILES]
gt = pd.concat(gt_list, axis=0)



# ベースラインの推定誤差

In [8]:
# Error of the unmodified baseline positions against the ground truth.
score, scores = compute_dist(baseline_train, gt)
print(score)

dst    5.287971
dtype: float64


# optuna

In [22]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.7.0-py3-none-any.whl (293 kB)
[K     |████████████████████████████████| 293 kB 3.8 MB/s eta 0:00:01
[?25hCollecting cliff
  Downloading cliff-3.8.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 5.2 MB/s eta 0:00:01
Collecting colorlog
  Downloading colorlog-5.0.1-py2.py3-none-any.whl (10 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting cmd2>=1.0.0
  Downloading cmd2-1.5.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 6.4 MB/s eta 0:00:01
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.6.0-py2.py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 7.0 MB/s eta 0:00:01
[?25hCollecting stevedore>=2.0.1
  Downloading stevedore-3.3.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 3.7 MB/s eta 0:00:011
[?25hCollecting PrettyTable>=0.7.2
  Downloading prettytable-2.1.0-py3-none-any.whl (22 kB)
Collecti

In [51]:
import optuna

#@noglobal
def position_shift(df,a):
    """Pull every fix ``a`` units back along the line to the previous fix.

    Positions are converted to ECEF, and for each row the new position is
    ``previous + (current - previous) * (1 - a / dist)`` where ``dist`` is the
    ECEF distance to the previous fix of the same phone. A positive ``a``
    therefore moves the fix towards the previous one, a negative ``a`` pushes
    it further away. Rows for which no finite shifted position exists (the
    first fix of each phone, or a zero distance to the previous fix) keep
    their original ``latDeg`` / ``lngDeg``.

    Args:
        df: DataFrame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
            ``lngDeg`` columns. It is not modified.
        a: shift distance in ECEF units (metres, given the ellipsoid
            constants used by ``WGS84_to_ECEF``).

    Returns:
        A copy of ``df`` sorted by phone and timestamp, with ``latDeg`` /
        ``lngDeg`` replaced by the shifted values. ``heightAboveWgs84EllipsoidM``
        is overwritten and the intermediate columns (``x``, ``xp``, ``xdiff``,
        ``dist``, ``xnew``, ...) are left in the result.
    """
    time_col = 'millisSinceGpsEpoch'

    d = df.copy()

    # Every fix is given the same fixed ellipsoidal height before conversion
    # (presumably a representative value for the data set - TODO confirm).
    d['heightAboveWgs84EllipsoidM'] = 63.5

    # WGS84_to_ECEF only uses numpy ufuncs, so whole columns can be converted
    # in one call instead of one Python call per row.
    d['x'], d['y'], d['z'] = WGS84_to_ECEF(
        d['latDeg'], d['lngDeg'], d['heightAboveWgs84EllipsoidM']
    )

    d = d.sort_values(["phone", time_col])

    # True where the previous row belongs to the same phone; elsewhere the
    # "previous" position is left as NaN.
    same_phone = d['phone'].eq(d['phone'].shift())
    for axis in ("x", "y", "z"):
        d[axis + 'p'] = d[axis].shift().where(same_phone)
        d[axis + 'diff'] = d[axis] - d[axis + 'p']

    d['dist'] = np.sqrt(d['xdiff']**2 + d['ydiff']**2 + d['zdiff']**2)

    scale = 1 - a / d['dist']
    for axis in ("x", "y", "z"):
        d[axis + 'new'] = d[axis + 'p'] + d[axis + 'diff'] * scale

    lng, lat, _ = ECEF_to_WGS84(d['xnew'].values, d['ynew'].values, d['znew'].values)
    lng = np.asarray(lng, dtype=float)
    lat = np.asarray(lat, dtype=float)

    # Fall back to the input position wherever the shifted one is unusable.
    # isfinite covers NaN as well as the inf that pyproj documents for an
    # invalid transformation.
    d['latDeg'] = np.where(np.isfinite(lat), lat, d['latDeg'].to_numpy())
    d['lngDeg'] = np.where(np.isfinite(lng), lng, d['lngDeg'].to_numpy())

    return d

#@noglobal(excepts=["baseline_train","gt"])
def objective(trial):
    """Optuna objective: baseline-train score after shifting by a sampled ``a``.

    Samples ``a`` from [-1, 1], applies ``position_shift`` to
    ``baseline_train`` and scores the result against ``gt``.

    Returns:
        The score as a plain ``float`` (lower is better).
    """
    # suggest_float is the replacement for the deprecated suggest_uniform.
    a = trial.suggest_float("a", -1, 1)
    score, _ = compute_dist(position_shift(baseline_train, a), gt)
    # compute_dist yields a one-element Series; hand Optuna a real float.
    return float(np.ravel(score)[0])

# Shift distance applied to the test set. Presumably the best value found by
# an earlier run of the study below (kept as a recipe, not executed here):
#     study = optuna.create_study()
#     study.optimize(objective, n_trials=100)
#     a = study.best_params["a"]
a = 0.7884495383712343

# The submission template only supplies the output columns.
sample_sub = pd.read_csv(f"{datadir}/sample_submission.csv")
sub_columns = sample_sub.columns

# Shift the baseline test positions, not the template's coordinates.
sub = position_shift(baseline_test, a)
assert len(sub) == len(sample_sub), "submission must have one row per sample_submission row"




                        

In [52]:
# Keep only the submission columns and write them to CSV without the index.
sub[sub_columns].to_csv("/work/submission/baseline_plus_positionshift.csv",index=False)