Notebook for the GoogleDecimeterChallenge https://www.kaggle.com/competitions/smartphone-decimeter-2023


In [2]:
#reading data
import numpy as np 
import pandas as pd
from tqdm import tqdm, trange
import os

INPUT_PATH = 'sdc2023/'



In [3]:


test_input_data = []
test_gt_data = []

#iterate over all data files and store them in the respective arrays

#load test data
test_files = os.listdir(INPUT_PATH + "test")

for folder in test_files:
    smartphones = os.listdir(INPUT_PATH + "test/"+folder)
    for smartphone in smartphones:
        file =  "/device_gnss.csv"
        #store data in list while dropping first and 41st column (string data)
        test_input_data.append(pd.read_csv(INPUT_PATH + "test/" +folder+"/"+smartphone + file, usecols=[i for i in range(58) if i not in [0,40]], dtype=np.float32).to_numpy(dtype=np.float32).tolist())


In [4]:
#load train data
MAX_SAMPLES = 400
train_files = os.listdir(INPUT_PATH + "train")
sample_count=0

train_input_data = []
train_gt_data = []

for folder in train_files:
    smartphones = os.listdir(INPUT_PATH + "train/"+folder)
    for smartphone in smartphones:
        files = os.listdir(INPUT_PATH + "train/"+folder+"/"+smartphone)
        for file in files:
            if file.endswith(".csv"):
                if sample_count >= MAX_SAMPLES:
                    break   
                if "gnss" in file:
                    #store data in list while dropping first and 41st column (string data), as well as (porbably mostly) empty columns
                    #also all data points with the same timestep are seen as features of one timestep of a sample
                    train_input_data.append(pd.read_csv(INPUT_PATH + "train/" +folder+"/"+ smartphone+ "/" + file, usecols=[i for i in range(58) if i not in [0,2,3,4,22,23,24,26,28,19,30,31,32,33,34,35,40]], dtype=float).to_numpy(dtype=float).tolist())
                elif "ground_truth" in file:
                    sample_count +=1
                    #store data in list while dropping first and 2nd column (string data),(porbably mostly) empty columns
                    train_gt_data.append(pd.read_csv(INPUT_PATH + "train/"+folder+"/" + smartphone+ "/" + file,  usecols=[i for i in range(9) if i not in [0,1]], dtype=float).to_numpy(dtype=float).tolist())



In [None]:
import tensorflow as tf

#move all features with the same timestamp to one line 

for sample in range(len(train_input_data)):
    matches = 0
    for step in range(len(train_input_data[sample])):
        if step != 0:
            if train_input_data[sample][step-matches][0] == train_input_data[sample][step-1-matches][0]:
                train_input_data[sample][step-1-matches] = np.concatenate((train_input_data[sample][step-1-matches], train_input_data[sample][step-matches]))
                train_input_data[sample].pop(step-matches)
                matches +=1


#pad input data
max_timesteps = max([len(sample) for sample in train_input_data])
max_features = max([len(feature) for sample in train_input_data for feature in sample])
for sample in train_input_data:
    sample.extend([[-1]*max_features]*(max_timesteps-len(sample)))
train_input_data_padded = [tf.keras.preprocessing.sequence.pad_sequences(sample,value=-1, padding="post", dtype=np.int32, maxlen=max_features) for sample in train_input_data]
train_input_data_padded = np.array(train_input_data_padded)

del train_input_data

#pad ground truth data
max_timesteps_gt = max([len(sample) for sample in train_gt_data])
max_features_gt = max([len(feature) for sample in train_gt_data for feature in sample])
for sample in train_gt_data:
    sample.extend([[-1]*max_features_gt]*(max_timesteps_gt-len(sample)))
#convert to right data format
train_gt_data = np.array(train_gt_data)
train_gt_data_padded = [tf.keras.preprocessing.sequence.pad_sequences(sample, value=-1,padding="post", dtype=np.int32, maxlen=max_features_gt) for sample in train_gt_data]
train_gt_data_padded = np.array(train_gt_data_padded)
del train_gt_data


  trunc = np.asarray(trunc, dtype=dtype)


In [None]:
from sklearn.preprocessing import MinMaxScaler

#normalize data
scaler = MinMaxScaler()
for sample in train_input_data_padded:
    sample = scaler.fit_transform(sample)


In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.backend import clear_session

clear_session()

#create model
model = Sequential()
model.add(Masking(mask_value=-1, input_shape=(train_input_data_padded.shape[1], train_input_data_padded.shape[2])))
model.add(LSTM(60, return_sequences=True))
model.add(Dense(7))

model.compile(optimizer='adam', loss='mean_squared_error')


  super().__init__(**kwargs)


In [None]:

print(train_input_data_padded.shape)
print(train_gt_data_padded.shape)

(10, 2147, 1722)
(10, 2147, 7)


In [None]:

# train the model
model.fit(train_input_data_padded, train_gt_data_padded, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 639ms/step - loss: 658649177760202752.0000 - val_loss: 658811355725299712.0000
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 549ms/step - loss: 658529262273298432.0000 - val_loss: 658811355725299712.0000
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 554ms/step - loss: 658469819925921792.0000 - val_loss: 658811355725299712.0000
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 554ms/step - loss: 658509058747138048.0000 - val_loss: 658811355725299712.0000
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 551ms/step - loss: 658615848813985792.0000 - val_loss: 658811355725299712.0000
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 558ms/step - loss: 658602723393929216.0000 - val_loss: 658811355725299712.0000
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 608ms/step - lo

<keras.src.callbacks.history.History at 0x20af2df8e50>