In [1]:
%%HTML
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>

## import 

In [2]:
import pandas as pd
import numpy as np
import os
from lib.noglobal import noglobal
import simdkalman
import glob
from tqdm.notebook import tqdm
import pickle

## データファイルパス

In [3]:

data_dir = "/work/data/input/google-smartphone-decimeter-challenge/"
train_file = f"{data_dir}/baseline_locations_train.csv"
test_file = f"{data_dir}/baseline_locations_test.csv"
sample_file = f"{data_dir}/sample_submission.csv"
 

## 関数定義

### GNSS data loader function

In [4]:
# from https://www.kaggle.com/sohier/loading-gnss-logs

@noglobal
def gnss_log_to_dataframes(path):
    print('Loading ' + path, flush=True)
    gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            gnss_map[dataline[0]] = dataline[1:]
        elif not is_header:
            datas[dataline[0]].append(dataline[1:])

    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            results[k][col] = pd.to_numeric(results[k][col])

    return results

### 距離計算

In [5]:
# from https://www.kaggle.com/jpmiller/baseline-from-host-data
# simplified haversine distance
@noglobal
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(a**0.5)
    dist = 6_367_000 * c
    return dist

### カルマンフィルター

In [6]:
# from https://www.kaggle.com/emaerthin/demonstration-of-the-kalman-filter
T = 1.0
state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                             [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
observation_noise = np.diag([5e-5, 5e-5]) + np.ones((2, 2)) * 1e-9

kf = simdkalman.KalmanFilter(
        state_transition = state_transition,
        process_noise = process_noise,
        observation_model = observation_model,
        observation_noise = observation_noise)

@noglobal
def apply_kf_smoothing(df, kf_=kf):
    unique_paths = df[phone_col].unique()
    for phone in tqdm(unique_paths):
        data = df.loc[df[phone_col] == phone][[lat_col, lon_col]].values
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[df[phone_col] == phone, lat_col] = smoothed.states.mean[0, :, 0]
        df.loc[df[phone_col] == phone, lon_col] = smoothed.states.mean[0, :, 1]
    return df

## ファイル読み込み

In [7]:
trn = pd.read_csv(train_file)
tst = pd.read_csv(test_file)
sub = pd.read_csv(sample_file)

## データセット準備

In [8]:
cname_col = 'collectionName'
pname_col = 'phoneName'
phone_col = 'phone'
ts_col = 'millisSinceGpsEpoch'
dt_col = 'datetime'
lat_col = 'latDeg'
lon_col = 'lngDeg'

prev_lat_col = 'prev_lat'
prev_lon_col = 'prev_lng'


added_data = ["correctedPrM","xSatPosM","ySatPosM","zSatPosM"]

#df_tst = sub[[phone_col, ts_col]].merge(tst[[phone_col, ts_col, lat_col, lon_col, 'prev_lat', 'prev_lon']], how='left', on=[phone_col, ts_col], suffixes=('', '_basepred'))

### add shift lat and lng information per path

#### add function

In [9]:
@noglobal()
def add_shift_lat_lng_info(df):            
    phone_col = 'phone'
    ts_col = 'millisSinceGpsEpoch'
    lat_col = 'latDeg'
    lon_col = 'lngDeg'
    prev_lat_col = 'prev_lat'
    prev_lon_col = 'prev_lng'
    prev_lon_col = 'prev_lng'
    
    
    df.sort_values([phone_col, ts_col], inplace=True)
        
    df[[prev_lat_col]] = df[[lat_col]].shift().where(df[phone_col].eq(df[phone_col].shift())).fillna(0)
    df[[prev_lon_col]] = df[[lon_col]].shift().where(df[phone_col].eq(df[phone_col].shift())).fillna(0)
    #df[[prev_height_Wgs84_col]] = df[[height_Wgs84_col]].shift().where(df[phone_col].eq(df[phone_col].shift())).fillna(0)

#### execute function

In [10]:
add_shift_lat_lng_info(trn);
add_shift_lat_lng_info(tst);

In [11]:
display(trn.head(3))
display(tst.head(3))

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,prev_lat,prev_lng
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,0.0,0.0
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,37.423575,-122.094091
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,37.423578,-122.094101


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,prev_lat,prev_lng
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4,0.0,0.0
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.08204,-31.76,2020-05-15-US-MTV-1_Pixel4,37.416628,-122.082053
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416652,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4,37.416646,-122.08204


### add ground truth file

In [12]:
cols = [phone_col, ts_col, lat_col, lon_col]

ground_truth_files = glob.glob(f"/work/data/input/google-smartphone-decimeter-challenge/train/*/*/ground_truth.csv")

df_list = [ pd.read_csv(t,usecols=[cname_col,pname_col,ts_col,lat_col])  for t in tqdm(ground_truth_files,total=73)];

df_label = pd.concat(df_list,ignore_index=True);
df_label[phone_col] = df_label[cname_col] + "_" + df_label[pname_col]
df_trn = df_label.merge(trn[cols + [prev_lat_col,prev_lon_col]],how="inner",on = [phone_col,ts_col],suffixes=('_gt', '')).drop([cname_col, pname_col], axis=1)

  0%|          | 0/73 [00:00<?, ?it/s]

In [13]:
df_trn.head(10)

Unnamed: 0,millisSinceGpsEpoch,latDeg_gt,phone,latDeg,lngDeg,prev_lat,prev_lng
0,1275339493441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416345,-122.080528,0.0,0.0
1,1275339494441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416355,-122.080497,37.416345,-122.080528
2,1275339495441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416346,-122.080499,37.416355,-122.080497
3,1275339496441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416353,-122.080508,37.416346,-122.080499
4,1275339497441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416349,-122.080509,37.416353,-122.080508
5,1275339498441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416364,-122.08051,37.416349,-122.080509
6,1275339499441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416351,-122.080509,37.416364,-122.08051
7,1275339500441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416376,-122.080524,37.416351,-122.080509
8,1275339501441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416356,-122.080504,37.416376,-122.080524
9,1275339502441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416364,-122.080492,37.416356,-122.080504


### add test infomation

In [14]:
df_tst = sub[[phone_col, ts_col]].merge(tst[[phone_col, ts_col, lat_col, lon_col, prev_lat_col,prev_lon_col]], 
                                        how='left', on=[phone_col, ts_col], suffixes=('', '_basepred'))

df_tst.head(3)

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,prev_lat,prev_lng
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416628,-122.082053,0.0,0.0
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416646,-122.08204,37.416628,-122.082053
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416652,-122.082039,37.416646,-122.08204


### add second since GPS time

#### 関数定義

In [15]:
@noglobal()
def add_second_since_GPSepoch(df):
    ts_col = 'millisSinceGpsEpoch'
    df["secondsSinceGpsEpoch"] = df[ts_col]//1000
    return df

#### 関数実行

In [16]:
add_second_since_GPSepoch(df_trn)
add_second_since_GPSepoch(df_tst)

display(df_trn.head(3))
display(df_tst.head(3))

Unnamed: 0,millisSinceGpsEpoch,latDeg_gt,phone,latDeg,lngDeg,prev_lat,prev_lng,secondsSinceGpsEpoch
0,1275339493441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416345,-122.080528,0.0,0.0,1275339493
1,1275339494441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416355,-122.080497,37.416345,-122.080528,1275339494
2,1275339495441,37.416314,2020-06-04-US-MTV-1_Pixel4,37.416346,-122.080499,37.416355,-122.080497,1275339495


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,prev_lat,prev_lng,secondsSinceGpsEpoch
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416628,-122.082053,0.0,0.0,1273608785
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416646,-122.08204,37.416628,-122.082053,1273608786
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416652,-122.082039,37.416646,-122.08204,1273608787


### add Derived Data information

#### 関数定義

In [17]:
@noglobal()
def add_correctedPrm(df):
    df['correctedPrM'] = (df['rawPrM'] + df['satClkBiasM'] - df['isrbM'] - df['ionoDelayM'] - df['tropoDelayM'])
    return df;    

@noglobal()
def load_derived_files(datatype:str,cols):
    data_dir = "/work/data/input/google-smartphone-decimeter-challenge/"
    phone_col = 'phone'
    ts_col = 'millisSinceGpsEpoch'    
    pname_col = 'phoneName'
    cname_col = 'collectionName'

    
    if (datatype=="train"):
        derived_files = glob.glob(f"{data_dir}/train/**/*_derived.csv",recursive=True)
    elif (datatype=="test"):
        derived_files = glob.glob(f"{data_dir}/test/**/*_derived.csv",recursive=True)
    else:
        raise Exception("miss arg (datatype) value: Only specify train or test, not"+datatype);
            
    ### load load derived files;
    df_list = [ add_correctedPrm(pd.read_csv(t))[[cname_col,pname_col] + cols]   for t in tqdm(derived_files)];    
    df_train_derived = pd.concat(df_list,ignore_index=True);    
    
    df_train_derived[phone_col] = df_train_derived[cname_col] + "_" + df_train_derived[pname_col];
    df_train_derived.drop([cname_col,pname_col],axis=1,inplace=True)
    return df_train_derived;
        

@noglobal()
def create_derived_pivot_dataframe_per_svid(df,column,aggrication_func=np.mean):
    data_dir = "/work/data/input/google-smartphone-decimeter-challenge/"
    phone_col = 'phone'
    ts_col = 'millisSinceGpsEpoch'    
    pname_col = 'phoneName'
    cname_col = 'collectionName'
        
    df_train_derived_pivot = pd.pivot_table(df,values=column, index=[phone_col,ts_col],columns=["svid"],aggfunc=aggrication_func)
    df_train__derived_pivot = pd.pivot_table(df,values=column, index=[phone_col,ts_col],columns=["svid"],aggfunc=aggrication_func)
    df_train_derived_pivot.columns = [f'svid_{column}_{x}' for x in df_train_derived_pivot.columns]
    
    
    df_train_derived_pivot.reset_index(inplace=True)
    return df_train_derived_pivot;        


@noglobal(excepts=["create_derived_pivot_dataframe_per_svid"])
def create_derived_pivot_dataframe(df,column_list,aggrication_func=np.mean):
        
    if (len(column_list) == 0):
        raise Exception("size of column_list muse be one or more.");
    elif (len(column_list) == 1):
        return create_derived_pivot_dataframe_per_svid(column_list[0]);
    else:
        ret_df = create_derived_pivot_dataframe_per_svid(df,column_list[0],aggrication_func)
        
        for column in tqdm(column_list[1:]):            
            added_df = create_derived_pivot_dataframe_per_svid(df,column,aggrication_func)
            ret_df = ret_df.merge(added_df,how="left",on = ["phone","millisSinceGpsEpoch"],suffixes=["","_"+column])
        
    
    return ret_df;      

#### load Derived file

In [18]:

cols = [ts_col, 'svid'] + added_data


df_train_derived = load_derived_files("train",cols);
df_test_derived = load_derived_files("test",cols);

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

In [19]:
df_train_derived.head(5)

Unnamed: 0,millisSinceGpsEpoch,svid,correctedPrM,xSatPosM,ySatPosM,zSatPosM,phone
0,1275339494441,5,22597220.0,-1899382.0,-26371410.0,-915369.4,2020-06-04-US-MTV-1_Pixel4
1,1275339494441,29,24678650.0,-26582060.0,-171745.3,8476383.0,2020-06-04-US-MTV-1_Pixel4
2,1275339494441,15,25321040.0,-27302500.0,-10045520.0,5444145.0,2020-06-04-US-MTV-1_Pixel4
3,1275339494441,16,23517050.0,-7305856.0,-21850120.0,-11101900.0,2020-06-04-US-MTV-1_Pixel4
4,1275339494441,20,19922540.0,-16937310.0,-8049983.0,17268690.0,2020-06-04-US-MTV-1_Pixel4


#### derived pivot dataframe

In [20]:
df_train_derived_pivot = create_derived_pivot_dataframe(df_train_derived,added_data)
df_test_derived_pivot = create_derived_pivot_dataframe(df_test_derived,added_data)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
column = ["phone","millisSinceGpsEpoch"] + sum([["svid_correctedPrM_"+str(idx),"svid_xSatPosM_"+str(idx),"svid_ySatPosM_"+str(idx),"svid_zSatPosM_"+str(idx)]  for idx in range(1,38)],[])
df_train_derived_pivot = df_train_derived_pivot[column]
df_test_derived_pivot = df_train_derived_pivot[column]

#### add second since GPS time

In [22]:
df_train_derived_pivot = add_second_since_GPSepoch(df_train_derived_pivot);
df_test_derived_pivot = add_second_since_GPSepoch(df_test_derived_pivot);

In [23]:
df_test_derived_pivot.head(3)

Unnamed: 0,phone,millisSinceGpsEpoch,svid_correctedPrM_1,svid_xSatPosM_1,svid_ySatPosM_1,svid_zSatPosM_1,svid_correctedPrM_2,svid_xSatPosM_2,svid_ySatPosM_2,svid_zSatPosM_2,...,svid_zSatPosM_35,svid_correctedPrM_36,svid_xSatPosM_36,svid_ySatPosM_36,svid_zSatPosM_36,svid_correctedPrM_37,svid_xSatPosM_37,svid_ySatPosM_37,svid_zSatPosM_37,secondsSinceGpsEpoch
0,2020-05-14-US-MTV-1_Pixel4,1273529464442,25072940.0,-16870910.0,-1134636.0,24292170.0,20899910.0,-5503339.32,-18254200.0,19155120.0,...,,,,,,,,,,1273529464
1,2020-05-14-US-MTV-1_Pixel4,1273529465442,25072560.0,-16871480.0,-1137008.0,24291660.0,20900130.0,-5501259.007,-18253210.0,19156710.0,...,,,,,,,,,,1273529465
2,2020-05-14-US-MTV-1_Pixel4,1273529466442,25072160.0,-16872050.0,-1139381.0,24291160.0,20900350.0,-5499178.468,-18252230.0,19158290.0,...,,,,,,,,,,1273529466


### dataframeに結合

In [24]:
#secondsSinceGpsEpoch
df_trn = df_trn.merge(df_train_derived_pivot, how="left",on = [ phone_col,"secondsSinceGpsEpoch"],suffixes=["","_2"])
df_trn.drop(['secondsSinceGpsEpoch', ts_col + '_2'], axis=1, inplace=True)


df_tst = df_tst.merge(df_test_derived_pivot, how="left",on = [ phone_col,"secondsSinceGpsEpoch"],suffixes=["","_2"])
df_tst.drop(['secondsSinceGpsEpoch', ts_col + '_2'], axis=1, inplace=True)

### diff baseline and true

leakのような気がするので、削除

In [25]:
#df_trn['diff_lat'] = df_trn['latDeg_gt'] - df_trn[lat_col]
#df_trn['diff_lng'] = df_trn['lngDeg_gt'] - df_trn[lon_col]
#df_trn[['diff_lat', 'diff_lng']].describe()

## データ保存

In [26]:
df_trn.to_csv("/work/data/input/selfmade_dataset/baseline_with_derived_data_v1/train.csv",index=False)
df_tst.to_csv("/work/data/input/selfmade_dataset/baseline_with_derived_data_v1/test.csv",index=False)

pickle.dump(df_trn,open("/work/data/input/selfmade_dataset/baseline_with_derived_data_v1/train.pkl","wb"))
pickle.dump(df_tst,open("/work/data/input/selfmade_dataset/baseline_with_derived_data_v1/test.pkl","wb"))