This notebook shares EDA on the aggregate and phone level data, then trains a neural network model to predict the residuals of base estimations provided with aggregated features.

The contents of the notebook are organized as follows:
1. Aggregated Data EDA
2. Phone Level Data EDA
3. Feature Generation: generates aggregated features for training. Currently we only use previous lat/long and `correctedPrM` from derived files.
4. Model Training: trains a neural network with a skip connection in Keras on TPU.

Credits to other notebooks:
* [Baseline from host data](https://www.kaggle.com/jpmiller/baseline-from-host-data) by @jpmiller: for the distance calculation with `calc_haversine()`
* [Demonstration of the Kalman filter](https://www.kaggle.com/emaerthin/demonstration-of-the-kalman-filter) by @emaerthin: for Kalman filtering with `apply_kf_smoothing()`
* [Loading GNSS logs](https://www.kaggle.com/sohier/loading-gnss-logs) by organizers: for GNSS log loading with `gnss_log_to_dataframes()`
* [Ѫ Start Here: Simple Folium Heatmap for Geo-Data](https://www.kaggle.com/dannellyz/start-here-simple-folium-heatmap-for-geo-data) by @dannellyz: for geospatial heatmap with `simple_folium()`

Thanks for sharing.

# Load Libraries & Data

In [None]:
!pip install simdkalman

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np # linear algebra
from pathlib import Path
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import simdkalman
import tensorflow as tf
from tensorflow import keras
from tqdm.notebook import tqdm
from warnings import simplefilter

simplefilter('ignore')
plt.style.use('fivethirtyeight')
# Use fully-qualified option names: the bare 'max_columns' / 'max_rows' patterns
# also match the 'styler.render.*' options in newer pandas and raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Tag used in the names of the prediction files written by this run.
model_name = 'nn_v2'

# Competition input files.
data_dir = Path('../input/google-smartphone-decimeter-challenge')
train_file = data_dir / 'baseline_locations_train.csv'
test_file = data_dir / 'baseline_locations_test.csv'
sample_file = data_dir / 'sample_submission.csv'

# Output locations; the build directory is created if it does not exist yet.
build_dir = Path('./build')
build_dir.mkdir(parents=True, exist_ok=True)
predict_val_file = build_dir / f'{model_name}.val.txt'
predict_tst_file = build_dir / f'{model_name}.tst.txt'
submission_file = 'submission.csv'

# Column names referenced throughout the notebook.
cname_col = 'collectionName'
pname_col = 'phoneName'
phone_col = 'phone'
ts_col = 'millisSinceGpsEpoch'
dt_col = 'datetime'
lat_col = 'latDeg'
lon_col = 'lngDeg'

# Training settings: Adam learning rate, fit batch size / epoch cap,
# early-stopping patience, number of CV folds and the KFold random seed.
lrate = .001
batch_size = 1024
epochs = 100
n_stop = 10
n_fold = 5
seed = 42

In [None]:
# from https://www.kaggle.com/sohier/loading-gnss-logs
def gnss_log_to_dataframes(path):
    """Parse a GNSS log text file into one DataFrame per known section.

    Lines starting with ``#`` whose first field is a known section name are
    treated as that section's column header; other ``#`` lines (notes,
    version numbers, ...) are ignored. Non-comment lines are data rows and
    are routed to the section named by their first field.

    Blank lines and rows tagged with an unknown section are skipped instead
    of raising ``KeyError``.

    Parameters
    ----------
    path : str or path-like
        Location of the log file.

    Returns
    -------
    dict
        Maps each section name to a DataFrame. Every column except
        ``CodeType`` is converted with ``pd.to_numeric``.
    """
    print(f'Loading {path}', flush=True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    with open(path) as f_open:
        for dataline in f_open:
            is_header = dataline.startswith('#')
            fields = dataline.strip('#').strip().split(',')
            section = fields[0]
            # Skips notes, version numbers, blank lines and unknown sections.
            if section not in gnss_section_names:
                continue
            if is_header:
                gnss_map[section] = fields[1:]
            else:
                datas[section].append(fields[1:])

    results = {k: pd.DataFrame(v, columns=gnss_map[k]) for k, v in datas.items()}
    # pandas doesn't properly infer types from these lists by default
    for df in results.values():
        for col in df.columns:
            if col == 'CodeType':
                continue
            df[col] = pd.to_numeric(df[col])

    return results

In [None]:
# from https://www.kaggle.com/dannellyz/start-here-simple-folium-heatmap-for-geo-data
import folium
from folium import plugins


def simple_folium(df:pd.DataFrame, lat_col:str, lon_col:str):
    """
    Description
    -----------
        Returns a simple Folium HeatMap with Markers
    ----------
    Parameters
    ----------
        df : pandas DataFrame, required
            The DataFrame with the data to map
        lat_col : str, required
            The name of the column with latitude
        lon_col : str, required
            The name of the column with longitude
    """
    # Keep only the rows where both coordinates are present.
    has_coords = df[lat_col].notnull() & df[lon_col].notnull()
    located = df[has_coords]

    # One [lat, lon] array per row, shared by the heatmap and the markers.
    points = list(located[[lat_col, lon_col]].values)

    # Centre the map on the median coordinate.
    centre = [located[lat_col].median(), located[lon_col].median()]
    fol_map = folium.Map(centre)

    fol_map.add_child(plugins.HeatMap(points))
    fol_map.add_child(plugins.MarkerCluster(locations = points))

    folium.LayerControl().add_to(fol_map)
    return fol_map

In [None]:
# from https://www.kaggle.com/jpmiller/baseline-from-host-data
# simplified haversine distance
def calc_haversine(lat1, lon1, lat2, lon2, radius=6_367_000):
    """Calculate the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like
        Coordinates in decimal degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to the
        value the notebook has always used (6,367,000).

    Returns
    -------
    Distance(s) with the broadcast shape of the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # arcsin return NaN; clamp it (NaN inputs still propagate).
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arcsin(a**0.5)
    dist = radius * c
    return dist

In [None]:
# from https://www.kaggle.com/emaerthin/demonstration-of-the-kalman-filter
# Time step used inside the transition matrix.
T = 1.0
# 6x6 transition: states 0-1 advance by T * (states 2-3) + 0.5 * T**2 * (states 4-5),
# states 2-3 advance by T * (states 4-5), and states 4-5 are carried over unchanged.
# Presumably [lat, lon, their velocities, their accelerations] — confirm against the source notebook.
state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                             [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
# Diagonal process noise plus a small constant added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
# Only the first two state components are observed.
observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
observation_noise = np.diag([5e-5, 5e-5]) + np.ones((2, 2)) * 1e-9

# Shared filter instance; it is the default `kf_` argument of apply_kf_smoothing().
kf = simdkalman.KalmanFilter(
        state_transition = state_transition,
        process_noise = process_noise,
        observation_model = observation_model,
        observation_noise = observation_noise)

def apply_kf_smoothing(df, kf_=kf):
    """Smooth each phone's lat/lon sequence with a Kalman smoother.

    The rows of every distinct ``phone_col`` value are smoothed as one
    sequence, in the order they appear in ``df``. The smoothed values are
    written back into ``df`` in place and ``df`` is returned.
    """
    for phone in tqdm(df[phone_col].unique()):
        rows = df[phone_col] == phone
        track = df.loc[rows][[lat_col, lon_col]].values
        # The smoother receives a single series of shape (1, n_rows, 2).
        smoothed = kf_.smooth(track.reshape(1, len(track), 2))
        means = smoothed.states.mean
        df.loc[rows, lat_col] = means[0, :, 0]
        df.loc[rows, lon_col] = means[0, :, 1]
    return df

In [None]:
# Baseline location estimates for the training collections.
trn = pd.read_csv(train_file)
print(trn.shape)
trn.head()

In [None]:
# Baseline location estimates for the test collections.
tst = pd.read_csv(test_file)
print(tst.shape)
tst.head()

In [None]:
# Sample submission; its (phone, timestamp) rows are later used as the keys of the test frame.
sub = pd.read_csv(sample_file)
print(sub.shape)
sub.head()

# Aggregated Data EDA

## `collectionName`, `phoneName`

In [None]:
# Number of distinct collection names and phone names in each split.
for col in [cname_col, pname_col]:
    print(f'# of unique {col:>14s} in training: {trn[col].nunique():4d}')
    print(f'# of unique {col:>14s}     in test: {tst[col].nunique():4d}')

In [None]:
# Training rows per phone name.
trn[pname_col].value_counts()

In [None]:
# Test rows per phone name.
tst[pname_col].value_counts()

In [None]:
# Number of distinct values of the `phone` column in each split.
print(f'# of unique phone in training: {trn[phone_col].nunique():4d}')
print(f'    # of unique phone in test: {tst[phone_col].nunique():4d}')

In [None]:
# Training rows per `phone` value.
trn[phone_col].value_counts()

In [None]:
# Test rows per `phone` value.
tst[phone_col].value_counts()

Each phone has a fair amount of data points, ranging between 577 and 3,517.

In [None]:
# `x in Series` tests the index labels, not the values, so membership is
# checked against a set of the training phone identifiers instead.
trn_phones = set(trn[phone_col])
overlapping_phones = [x for x in tst[phone_col].unique() if x in trn_phones]
print(len(overlapping_phones))

There's **no** overlapping phone between the training and test data.

## `millisSinceGpsEpoch`

In [None]:
# Smallest and largest raw timestamp in the test data.
tst[ts_col].min(), tst[ts_col].max()

From the data description, `millisSinceGpsEpoch` is "an integer number of milliseconds since the GPS epoch (1980/1/6 midnight UTC). Its value equals …". We can convert it to `datetime64` using `pd.to_datetime()` as follows:

In [None]:
# Origin of the GPS time scale, then the same instant expressed in Unix milliseconds
# (`.value` is in nanoseconds, hence the division by one million).
dt_offset = pd.to_datetime('1980-01-06 00:00:00')
print(dt_offset)
dt_offset_in_ms = int(dt_offset.value // 1_000_000)

In [None]:
# Shift the GPS-epoch milliseconds by the epoch offset and convert them to timestamps.
trn[dt_col] = pd.to_datetime(trn[ts_col] + dt_offset_in_ms, unit='ms')
tst[dt_col] = pd.to_datetime(tst[ts_col] + dt_offset_in_ms, unit='ms')
print(f'Training data range: {trn[dt_col].min()} - {trn[dt_col].max()}')
print(f'    Test data range: {tst[dt_col].min()} - {tst[dt_col].max()}')

## `latDeg` and `lngDeg`

First, let's see what the estimated locations in the training and test data look like. The ground truth for training data is available per `phone` in `{collectionName}/{phoneName}/ground_truth.csv`.

In [None]:
# Bin the training coordinates to 3 decimals and count the rows in each bin.
latlon_trn = (
    trn[[lat_col, lon_col]]
    .round(3)
    .assign(counts=1)
    .groupby([lat_col, lon_col])
    .sum()
    .reset_index()
)
latlon_trn.head()

Let's see the heatmap for the training data.

In [None]:
# Heatmap and marker clusters of the binned training coordinates.
simple_folium(latlon_trn, lat_col, lon_col)

Let's see the heatmap for the test data too.

In [None]:
# Bin the test coordinates to 3 decimals, count the rows in each bin, and map them.
latlon_tst = (
    tst[[lat_col, lon_col]]
    .round(3)
    .assign(counts=1)
    .groupby([lat_col, lon_col])
    .sum()
    .reset_index()
)

simple_folium(latlon_tst, lat_col, lon_col)

# Phone Level Data EDA

## GNSS Logs

In [None]:
# Load the GNSS log of the collection/phone found in the row labelled 0 of the training data.
cname = trn[cname_col][0]
pname = trn[pname_col][0]
dfs = gnss = gnss_log_to_dataframes(str(data_dir / 'train' / cname / pname / f'{pname}_GnssLog.txt'))
print(dfs.keys())

In [None]:
# The 'Raw' section of the parsed log.
df_raw = dfs['Raw']
print(df_raw.shape)
df_raw.head()

In [None]:
# Column dtypes and non-null counts of the raw measurements.
df_raw.info()

From the [post](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/238583) by @sohier and [slides](https://www.kaggle.com/google/android-smartphones-high-accuracy-datasets?select=ION+GNSS+2020+Slides+Android+Raw+GNSS+Measurement+Datasets+for+Precise+Positioning.pdf) by the data provider: 

Measurements from the GNSS chipsets of mobile phones are often noisier and more erroneous. Examples of filters you can apply (to exclude measurements) are:
1. `FullBiasNanos` (GNSS Raw) is zero or invalid
2. `BiasUncertaintyNanos` (GNSS Raw) too large (> 1e6)
3. Arrival time is negative or unrealistically large - can be calculated from `rawPrM` (Derived)
4. Unknown constellation (`constellationType == 0`) (Derived, GNSS Raw)
5. `TimeNanos` is empty (GNSS Raw)
6. `State` is not in (`STATE_TOW_DECODED`, `STATE_TOW_KNOWN`, `STATE_GLO_TOD_DECODED`, `STATE_GLO_TOD_KNOWN`) (GNSS Raw)
7. `ReceivedSvTimeUncertaintyNanos` is high (500 ns) (GNSS Raw)
8. `AccumulatedDeltaRangeState` violating this condition: `ADR_STATE_VALID == 1 & ADR_STATE_RESET == 0 & ADR_STATE_CYCLE_SLIP == 0` (GNSS Raw)
9. `AccumulatedDeltaRangeUncertaintyMeters` is high (GNSS Raw)
10. `Cn0DbHz` is less than 20 db-Hz (GNSS Raw)

In [None]:
# ArrivalTime = TimeNanos - FullBiasNanos - BiasNanos; show its summary statistics and histogram.
df_raw['ArrivalTime'] = df_raw['TimeNanos'] - df_raw['FullBiasNanos'] - df_raw['BiasNanos']
print(df_raw['ArrivalTime'].describe())
df_raw['ArrivalTime'].hist(bins=20)

In [None]:
# Summary statistics and histogram of BiasUncertaintyNanos.
print(df_raw['BiasUncertaintyNanos'].describe())
df_raw['BiasUncertaintyNanos'].hist(bins=20)

In [None]:
# Summary statistics and histogram of ReceivedSvTimeUncertaintyNanos.
print(df_raw['ReceivedSvTimeUncertaintyNanos'].describe())
df_raw['ReceivedSvTimeUncertaintyNanos'].hist(bins=20)

In [None]:
# Summary statistics and histogram of AccumulatedDeltaRangeUncertaintyMeters.
print(df_raw.AccumulatedDeltaRangeUncertaintyMeters.describe())
df_raw.AccumulatedDeltaRangeUncertaintyMeters.hist(bins=20)

In [None]:
# Summary statistics and histogram of Cn0DbHz.
print(df_raw.Cn0DbHz.describe())
df_raw.Cn0DbHz.hist(bins=20)

In [None]:
# Keep only the raw measurements that pass every condition below, then report how many remain.
# NOTE(review): the limits used here (BiasUncertaintyNanos < 100, ReceivedSvTimeUncertaintyNanos < 100)
# are stricter than the ones quoted in the list above this cell (1e6 and 500) — confirm this is intended.
# NOTE(review): the State test drops rows whose State equals 3, 14, 7 or 15; how these integers
# relate to the STATE_* names in the list above is not visible here — verify before relying on it.
df_raw = df_raw.loc[
    ~pd.isnull(df_raw.FullBiasNanos) &
    (df_raw.BiasUncertaintyNanos < 100) &
    (df_raw.ArrivalTime > 0) &
    (df_raw.ConstellationType != 0) &
    ~pd.isnull(df_raw.TimeNanos) &
    (df_raw.State != 3) & (df_raw.State != 14) & (df_raw.State != 7) & (df_raw.State != 15) &
    (df_raw.ReceivedSvTimeUncertaintyNanos < 100) &
    (df_raw.AccumulatedDeltaRangeUncertaintyMeters < 0.3) &
    (df_raw.Cn0DbHz > 20)
]
print(df_raw.shape)

See organizer's [Loading GNSS logs](https://www.kaggle.com/sohier/loading-gnss-logs) notebook for more details.

## Derived Values

Derived values are used to generate baseline location estimates in `baseline_locations_{train|test}.csv`.

In [None]:
# Derived values for the same collection/phone as the GNSS log loaded above.
derived = pd.read_csv(data_dir / 'train' / cname / pname / f'{pname}_derived.csv')
print(derived.shape)
derived.head()

In [None]:
# Column dtypes and non-null counts of the derived values.
derived.info()

In [None]:
# Drop rows with constellationType 0 (listed above as "unknown constellation").
derived = derived.loc[derived.constellationType != 0]
print(derived.shape)

Let's calculate `correctedPrM` as described in the data description:
```
correctedPrM = rawPrM + satClkBiasM - isrbM - ionoDelayM - tropoDelayM
```
"The baseline locations are computed using correctedPrM and the satellite positions, using a standard Weighted Least Squares (WLS) solver, with the phone's position (x, y, z), clock bias (t), and isrbM for each unique signal type as states for each epoch."

In [None]:
# correctedPrM = rawPrM + satClkBiasM - isrbM - ionoDelayM - tropoDelayM
derived['correctedPrM'] = (derived['rawPrM'] + derived['satClkBiasM'] - derived['isrbM'] -
                           derived['ionoDelayM'] - derived['tropoDelayM'])
# seaborn renamed the `size` argument of pairplot to `height`.
sns.pairplot(data=derived, vars=['correctedPrM', 'rawPrM'], height=3)

In [None]:
# Convert the derived timestamps and print the time span they cover.
derived[dt_col] = pd.to_datetime(derived[ts_col] + dt_offset_in_ms, unit='ms')
print(f'Data range for {cname}/{pname}: {derived[dt_col].min()} - {derived[dt_col].max()}')

The data is for 30 minutes or 1,800 seconds. However, we have a lot more samples (55K). This is because, for each second, there are multiple samples with different `constellationType`, `svid`, and `signalType`.

In [None]:
# Rows per (constellationType, svid, signalType) combination.
derived[['constellationType', 'svid', 'signalType']].value_counts()

In [None]:
# Mean/std/count of correctedPrM per (timestamp, constellationType), summarised with describe().
derived[[ts_col, 'constellationType', 'correctedPrM']].groupby([ts_col, 'constellationType']).agg(['mean', 'std', 'count']).describe()

In [None]:
# Same summary restricted to constellationType 1, grouped per (timestamp, svid).
derived.loc[derived.constellationType == 1][[ts_col, 'svid', 'correctedPrM']].groupby([ts_col, 'svid']).agg(['mean', 'std', 'count']).describe()

In each epoch, for constellation type `1` (GPS), `correctedPrM` from the same satellite can differ because of the different signal types.

In [None]:
# Mean/std/count of correctedPrM per (timestamp, svid) for GPS_L1 signals only.
derived.loc[derived.signalType == 'GPS_L1'][[ts_col, 'svid', 'correctedPrM']].groupby([ts_col, 'svid']).agg(['mean', 'std', 'count'])

In [None]:
# Distribution of the per-(timestamp, svid) statistics for GPS_L1 signals.
derived.loc[derived.signalType == 'GPS_L1'][[ts_col, 'svid', 'correctedPrM']].groupby([ts_col, 'svid']).agg(['mean', 'std', 'count']).describe()

Each epoch, given the signal type of `GPS_L1`, from the same satellite, `correctedPrM` is unique.

In [None]:
# Per timestamp, statistics over the distinct GPS_L1 svid values; 'count' is the number of satellites seen.
derived.loc[derived.signalType == 'GPS_L1'][[ts_col, 'svid']].drop_duplicates().groupby([ts_col]).agg(['mean', 'std', 'count']).describe()

Each epoch, given the signal type of `GPS_L1`, there are signals from at least 3 satellites.

In [None]:
# One correctedPrM row per (timestamp, svid) for GPS_L1 signals; the first duplicate is kept.
gps_l1 = derived.loc[derived.signalType == 'GPS_L1'][[ts_col, 'svid', 'correctedPrM']].drop_duplicates([ts_col, 'svid'])
print(gps_l1.shape)
gps_l1.head()

## Ground Truth

In [None]:
# Ground truth for the same collection/phone as the derived data above.
label = pd.read_csv(data_dir / 'train' / cname / pname / 'ground_truth.csv')
print(label.shape)
label.head()

In the `*derived.csv`, we have 55K rows, but in the `ground_truth.csv`, we only have 1,740 rows.

In [None]:
# Convert the ground-truth timestamps and print the time span they cover.
label[dt_col] = pd.to_datetime(label[ts_col] + dt_offset_in_ms, unit='ms')
print(f'Labels range for {cname}/{pname}: {label[dt_col].min()} - {label[dt_col].max()}')

Hmm, this is weird. The label data starts 1 second earlier than the derived data. This means that if we join the derived and label data, the first second will have NaNs for derived columns. Let's check another phone data.

In [None]:
# Pick a row that belongs to a *different* phone than the first one; a fixed
# row label such as 10 can still fall inside the first phone's rows.
first_phone = trn[phone_col].iloc[0]
other_row = trn.loc[trn[phone_col] != first_phone].iloc[0]
cname = other_row[cname_col]
pname = other_row[pname_col]
derived2 = pd.read_csv(data_dir / 'train' / cname / pname / f'{pname}_derived.csv')
label2 = pd.read_csv(data_dir / 'train' / cname / pname / 'ground_truth.csv')
print(f"Derived data starts at: {pd.to_datetime(derived2[ts_col].min() + dt_offset_in_ms, unit='ms')}")
print(f"  Label data starts at: {pd.to_datetime(label2[ts_col].min() + dt_offset_in_ms, unit='ms')}")

It's the same. We don't have the first second data in the derived data. Let's take a note and move on.

# Feature Generation

## Label Data Aggregation

First, let's add previous latitude and longitude estimates as features.

In [None]:
# Add the previous baseline estimate of the same phone as features. Rows are
# ordered by phone and time first; the first row of every phone gets NaN.
# Columns are assigned by plain name: assigning a Series to a one-element
# column list fails on current pandas.
for frame in (trn, tst):
    frame.sort_values([phone_col, ts_col], inplace=True)
    by_phone = frame.groupby(phone_col)
    frame['prev_lat'] = by_phone[lat_col].shift()
    frame['prev_lon'] = by_phone[lon_col].shift()
trn.head()

In [None]:
# from https://www.kaggle.com/jpmiller/baseline-from-host-data
# Collect every ground-truth file and join it with the baseline estimates.
# The file list is materialised so the progress bar gets its length from the
# data instead of a hard-coded count.
label_files = list((data_dir / 'train').rglob('ground_truth.csv'))
cols = [phone_col, ts_col, lat_col, lon_col]

df_list = []
for t in tqdm(label_files):
    label = pd.read_csv(t, usecols=[cname_col, pname_col, ts_col, lat_col, lon_col])
    df_list.append(label)

df_label = pd.concat(df_list, ignore_index=True)
# Same `<collection>_<phone>` key as the `phone` column of the baseline file.
df_label[phone_col] = df_label[cname_col] + '_' + df_label[pname_col]

# Ground-truth coordinates get the `_gt` suffix; baseline coordinates keep their names.
df = df_label.merge(trn[cols + ['prev_lat', 'prev_lon']], how='inner', on=[phone_col, ts_col],
                    suffixes=('_gt', '')).drop([cname_col, pname_col], axis=1)
# Whole-second key used to join the derived features later.
df['sSinceGpsEpoch'] = df[ts_col] // 1000
print(df.shape)
df.head()

In [None]:
# Test frame: the submission keys, left-joined with the baseline estimates and previous-position features.
df_tst = sub[[phone_col, ts_col]].merge(tst[[phone_col, ts_col, lat_col, lon_col, 'prev_lat', 'prev_lon']], 
                                        how='left', on=[phone_col, ts_col], suffixes=('', '_basepred'))
# Whole-second key used to join the derived features later.
df_tst['sSinceGpsEpoch'] = df_tst[ts_col] // 1000
print(df_tst.shape)
df_tst.head()

## Derived Data Aggregation

In [None]:
# Load every training derived file, keep one row per (timestamp, svid) and
# compute correctedPrM. The file list is materialised so the progress bar
# gets its length from the data instead of a hard-coded count.
derived_files = list((data_dir / 'train').rglob('*_derived.csv'))
cols = [ts_col, 'svid', 'correctedPrM']

df_list = []
for t in tqdm(derived_files):
    derived = pd.read_csv(t).drop_duplicates([ts_col, 'svid'])
    derived['correctedPrM'] = (derived['rawPrM'] + derived['satClkBiasM'] - derived['isrbM'] -
                               derived['ionoDelayM'] - derived['tropoDelayM'])
    df_list.append(derived[[cname_col, pname_col, ts_col, 'svid', 'correctedPrM']])

df_derived = pd.concat(df_list, ignore_index=True)
# Same `<collection>_<phone>` key as the `phone` column of the baseline file.
df_derived[phone_col] = df_derived[cname_col] + '_' + df_derived[pname_col]
df_derived.drop([cname_col, pname_col], axis=1, inplace=True)

print(df_derived.shape)
df_derived.head()

In [None]:
# One row per (phone, timestamp) with one `svid_<id>` column per satellite id holding its mean correctedPrM.
df_derived_pivot = pd.pivot_table(df_derived, 
                                  values='correctedPrM', 
                                  index=[phone_col, ts_col],
                                  columns=['svid'],
                                  aggfunc=np.mean)
df_derived_pivot.columns = [f'svid_{x}' for x in df_derived_pivot.columns]
df_derived_pivot.reset_index(inplace=True)
# Whole-second key used for the join with the label frame.
df_derived_pivot['sSinceGpsEpoch'] = df_derived_pivot[ts_col] // 1000

print(df_derived_pivot.shape)
df_derived_pivot.head()

In [None]:
# Attach the satellite features by phone and whole second, then drop the helper columns.
# NOTE(review): if the pivot holds more than one timestamp within the same second for a phone,
# this left join would duplicate label rows — verify.
df = df.merge(df_derived_pivot, how='left', on=[phone_col, 'sSinceGpsEpoch'], suffixes=['', '_2'])
df.drop(['sSinceGpsEpoch', ts_col + '_2'], axis=1, inplace=True)
print(df.shape)
df.head()

In [None]:
# Training targets: ground truth minus baseline estimate, in degrees.
df['d_lat'] = df['latDeg_gt'] - df[lat_col]
df['d_lon'] = df['lngDeg_gt'] - df[lon_col]
df[['d_lat', 'd_lon']].describe()

In [None]:
# Same aggregation for the test derived files, keyed by whole second. The
# file list is materialised so the progress bar gets its length from the
# data instead of a hard-coded count.
derived_files = list((data_dir / 'test').rglob('*_derived.csv'))
cols = [ts_col, 'svid', 'correctedPrM']

df_list = []
for t in tqdm(derived_files):
    derived = pd.read_csv(t)
    derived['sSinceGpsEpoch'] = derived[ts_col] // 1000
    # Keep the first row per (second, svid).
    derived.drop_duplicates(['sSinceGpsEpoch', 'svid'], inplace=True)
    derived['correctedPrM'] = (derived['rawPrM'] + derived['satClkBiasM'] - derived['isrbM'] -
                               derived['ionoDelayM'] - derived['tropoDelayM'])
    df_list.append(derived[[cname_col, pname_col, 'sSinceGpsEpoch', 'svid', 'correctedPrM']])

df_derived = pd.concat(df_list, ignore_index=True)
df_derived[phone_col] = df_derived[cname_col] + '_' + df_derived[pname_col]
df_derived.drop([cname_col, pname_col], axis=1, inplace=True)

# One row per (phone, second) with one `svid_<id>` column per satellite id.
df_derived_pivot = pd.pivot_table(df_derived,
                                  values='correctedPrM',
                                  index=[phone_col, 'sSinceGpsEpoch'],
                                  columns=['svid'],
                                  aggfunc=np.mean)
df_derived_pivot.columns = [f'svid_{x}' for x in df_derived_pivot.columns]
df_derived_pivot.reset_index(inplace=True)

df_tst = df_tst.merge(df_derived_pivot, how='left',
                      on=[phone_col, 'sSinceGpsEpoch']).drop(['sSinceGpsEpoch'], axis=1)
print(df_tst.shape)
df_tst.head()

In [None]:
# Summary statistics of the assembled test features.
df_tst.describe()

## Raw Data Aggregation - To Be Updated

# Model Training

In [None]:
# Connect to the TPU cluster and create the distribution strategy for model building.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# `tf.distribute.experimental.TPUStrategy` is deprecated in favour of the stable class.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
# Every test column except the two key columns is a model input; the targets are the residuals.
key_cols = [phone_col, ts_col]
feature_cols = [name for name in df_tst.columns if name not in key_cols]
target_cols = ['d_lat', 'd_lon']
input_dim, output_dim = len(feature_cols), len(target_cols)

In [None]:
# Standardise inputs and targets. Missing feature values are replaced by 0 before scaling.
scaler = StandardScaler()
label_scaler = StandardScaler()
# The feature scaler is fitted on the training and test rows together.
scaler.fit(pd.concat([df[feature_cols], df_tst[feature_cols]], axis=0).fillna(0).values)
X = scaler.transform(df[feature_cols].fillna(0).values)
X_tst = scaler.transform(df_tst[feature_cols].fillna(0).values)
# Targets are scaled too; predictions are mapped back with label_scaler.inverse_transform.
Y = label_scaler.fit_transform(df[target_cols].values)
print(X.shape, Y.shape, X_tst.shape)

In [None]:
def build_model():
    """Build and compile the residual-regression network.

    Three identical stages (Dense -> BatchNormalization -> Dense -> Dropout,
    128 units, ReLU, dropout 0.3) are stacked; the output of the first stage
    is added to the output of the second before the third. A linear Dense
    layer with ``output_dim`` units produces the prediction. The model is
    compiled with Adam at ``lrate`` and mean squared error loss.
    """
    def dense_stage(tensor):
        # Dense -> BatchNorm -> Dense -> Dropout
        tensor = keras.layers.Dense(128, activation='relu')(tensor)
        tensor = keras.layers.BatchNormalization()(tensor)
        tensor = keras.layers.Dense(128, activation='relu')(tensor)
        return keras.layers.Dropout(.3)(tensor)

    inputs = keras.layers.Input((input_dim,))
    stage_1 = dense_stage(inputs)
    stage_2 = dense_stage(stage_1)

    # Skip connection around the second stage.
    merged = keras.layers.Add()([stage_2, stage_1])
    stage_3 = dense_stage(merged)

    outputs = keras.layers.Dense(output_dim, activation='linear')(stage_3)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(lrate), loss='mean_squared_error')
    return model

In [None]:
# Build one model under the TPU strategy and print its layer summary.
with tpu_strategy.scope():
    model = build_model()
    model.summary()

In [None]:
def scheduler(epoch, lr, warmup=5):
    """Learning-rate schedule: geometric warm-up followed by exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    lr : float
        Learning rate of the previous epoch.
    warmup : int, optional
        Number of initial epochs in which the rate is multiplied by 1.5.

    Returns
    -------
    float
        ``lr * 1.5`` during warm-up, ``lr * exp(-0.1)`` afterwards. A plain
        Python float is returned rather than a tensor, which is the type
        ``LearningRateScheduler`` expects from the schedule function.
    """
    if epoch < warmup:
        return float(lr * 1.5)
    return float(lr * np.exp(-.1))

cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows, both in degrees (unscaled).
P = np.zeros_like(Y, dtype=float)
P_tst = np.zeros((X_tst.shape[0], output_dim), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X), 1):
    print(f'Training for CV #{i}')
    # Fresh callbacks per fold so no early-stopping state carries over.
    es = keras.callbacks.EarlyStopping(patience=n_stop, restore_best_weights=True)
    lr = keras.callbacks.LearningRateScheduler(scheduler)
    # The model has to be created inside the strategy scope to be placed on the TPU.
    with tpu_strategy.scope():
        model = build_model()
    history = model.fit(X[i_trn], Y[i_trn], validation_data=(X[i_val], Y[i_val]),
                        epochs=epochs, batch_size=batch_size, callbacks=[es, lr], verbose=0)
    P[i_val] = label_scaler.inverse_transform(model.predict(X[i_val]))
    P_tst += label_scaler.inverse_transform(model.predict(X_tst)) / n_fold

    # Per-row distances for this fold; averaging them first would make both
    # percentiles collapse to the same number.
    distance_i = calc_haversine(df.latDeg_gt.values[i_val],
                                df.lngDeg_gt.values[i_val],
                                P[i_val, 0] + df.latDeg.values[i_val],
                                P[i_val, 1] + df.lngDeg.values[i_val])
    print(f'CV #{i}: {np.percentile(distance_i, [50, 95])}')

In [None]:
# Print the mean predicted residuals and save both prediction arrays as comma-separated text.
print(P.mean(axis=0), P_tst.mean(axis=0))
np.savetxt(predict_val_file, P, delimiter=',', fmt='%.6f')
np.savetxt(predict_tst_file, P_tst, delimiter=',', fmt='%.6f')

In [None]:
# Distance between the ground truth and the corrected estimate over all out-of-fold rows (50th / 95th percentile).
distance = calc_haversine(df.latDeg_gt, df.lngDeg_gt, P[:, 0] + df.latDeg, P[:, 1] + df.lngDeg)
print(f'CV All: {np.percentile(distance, [50, 95])}')

In [None]:
# `P` is positionally aligned with the current row order of `df`, so the
# predictions are added *before* sorting; only the copy is reordered.
df_smoothed = df.copy()
df_smoothed[lat_col] = df[lat_col] + P[:, 0]
df_smoothed[lon_col] = df[lon_col] + P[:, 1]
# The smoother processes each phone's rows in frame order, so sort by phone and time.
df_smoothed.sort_values([phone_col, ts_col], inplace=True)
df_smoothed = apply_kf_smoothing(df_smoothed)
distance = calc_haversine(df_smoothed.latDeg_gt, df_smoothed.lngDeg_gt, df_smoothed.latDeg, df_smoothed.lngDeg)
print(f'CV All (smoothed): {np.percentile(distance, [50, 95])}')

In [None]:
# Learning rate per epoch; `history` holds only the most recently trained fold.
plt.plot(history.history['lr'])

# Submission File

In [None]:
# There is no ground truth for the test rows: this measures how far the
# network moves each baseline estimate, so it is labelled as a shift, not a CV score.
distance_tst = calc_haversine(df_tst.latDeg, df_tst.lngDeg, P_tst[:, 0] + df_tst.latDeg, P_tst[:, 1] + df_tst.lngDeg)
print(f'Test shift from baseline: {np.percentile(distance_tst, [50, 95])}')

In [None]:
# `P_tst` is positionally aligned with the current row order of `df_tst`, so
# the predictions are added *before* sorting; only the copy is reordered.
df_tst_smoothed = df_tst.copy()
df_tst_smoothed[lat_col] = df_tst[lat_col] + P_tst[:, 0]
df_tst_smoothed[lon_col] = df_tst[lon_col] + P_tst[:, 1]
# The smoother processes each phone's rows in frame order, so sort by phone and time.
df_tst_smoothed.sort_values([phone_col, ts_col], inplace=True)
df_tst_smoothed = apply_kf_smoothing(df_tst_smoothed)
# Both operands are Series, so rows are matched by index label here.
distance_tst = calc_haversine(df_tst.latDeg, df_tst.lngDeg, df_tst_smoothed.latDeg, df_tst_smoothed.lngDeg)
print(f'Test shift from baseline (smoothed): {np.percentile(distance_tst, [50, 95])}')

In [None]:
# Write the smoothed test coordinates, keyed by phone and timestamp, as the submission file.
df_tst_smoothed[[phone_col, ts_col, lat_col, lon_col]].to_csv(submission_file, index=False)