In [29]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import utm
import random
import matplotlib as mpl
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

from utils import DataLoader, TimeKeeper

In [2]:
plt.style.use("dark_background")

%load_ext autoreload
%autoreload 2

In [3]:
label_dic = {1: 'Still', 2: 'Walking', 3: 'Run', 4: 'Bike', 5: 'Car', 6: 'Bus', 7: 'Train', 8: 'Subway'}
cmap = mpl.cm.get_cmap('viridis', 8)
label_cols = cmap(np.linspace(0, 1, 8))
label_cols = [mpl.colors.to_hex(i) for i in label_cols]
# label_cols

In [4]:
# data = DataLoader.SHLDataLoader('data/train/', ratio = 0.1)

In [5]:
# data.load_all(detail = True)

In [6]:
# save
# file = open('data/mid_res/data_detail.pickle', 'wb')
# pickle.dump(data, file)
# file.close()

In [81]:
# load
with open('data/mid_res/data_detail_full.pickle', 'rb') as file:
    data = pickle.load(file)

# Feature Extraction

In [82]:
# unit: 1s
data.loc['time'] = data.loc.apply(lambda x: x['time'].astype('int').round(-3), axis = 1)
data.gps['time'] = data.gps['time'].astype('int').round(-3)
data.wifi['time'] = data.wifi['time'].astype('int').round(-3)
data.cells['time'] = data.cells['time'].astype('int').round(-3)
# data type
data.gps['number'] = data.gps['number'].apply(pd.to_numeric)
data.wifi['number'] = data.wifi['number'].apply(pd.to_numeric)
data.cells['number'] = data.cells['number'].apply(pd.to_numeric)


In [83]:
data.loc = data.loc.groupby(['time'], as_index = False).mean()[data.loc.columns.to_list()]
data.gps = data.gps.groupby(['time'], as_index = False).mean()[data.gps.columns.to_list()]
data.wifi = data.wifi.groupby(['time'], as_index = False).mean()[data.wifi.columns.to_list()]
data.cells = data.cells.groupby(['time'], as_index = False).mean()[data.cells.columns.to_list()]

In [84]:
data.df = pd.merge(data.label, data.loc, on = ['time'], how = 'left')
data.df = pd.merge(data.df, data.gps.rename({"number": "num_gps"}, axis = 1), on = ['time'], how = 'left')
data.df = pd.merge(data.df, data.wifi.rename({"number": "num_wifi"}, axis = 1), on = ['time'], how = 'left')
data.df = pd.merge(data.df, data.cells.rename({"number": "num_cells"}, axis = 1), on = ['time'], how = 'left')

# data.df = pd.merge(data.loc, data.gps.rename({"number": "num_gps"}, axis = 1), on = ['time'], how = 'outer')
# data.df = pd.merge(data.df, data.wifi.rename({"number": "num_wifi"}, axis = 1), on = ['time'], how = 'outer')
# data.df = pd.merge(data.df, data.cells.rename({"number": "num_cells"}, axis = 1), on = ['time'], how = 'outer')
# data.df = pd.merge(data.df, data.label, on = ['time'], how = 'right')

## loc speed

In [85]:
# (latitude, longitude) -> east
def gps2utm_east(x):
    try:
        return utm.from_latlon(x['latitude'], x['longitude'])[0]
    except:
        return np.nan
# (latitude, longitude) -> north
def gps2utm_north(x):
    try:
        return utm.from_latlon(x['latitude'], x['longitude'])[1]
    except:
        return np.nan

In [86]:
# prepare
data.df['time_dlt'] = data.df['time'].diff().fillna(method = 'bfill')
data.df['valid_dlt'] = data.df.apply(lambda x: int(x['time_dlt'] <= 10000), axis = 1)
# utm loc
data.df['east'] = data.df.apply(lambda x: gps2utm_east(x), axis = 1)
data.df['north'] = data.df.apply(lambda x: gps2utm_north(x), axis = 1)
data.df['east_dlt'] = data.df['east'].diff(1)
data.df['north_dlt'] = data.df['north'].diff(1)
# speed
data.df['east_speed'] = data.df.apply(lambda x: x['east_dlt']/x['time_dlt']*1000 if x['valid_dlt'] == 1 else np.nan, axis = 1)
data.df['north_speed'] = data.df.apply(lambda x: x['north_dlt']/x['time_dlt']*1000 if x['valid_dlt'] == 1 else np.nan, axis = 1)
data.df['east_speed'] = data.df['east_speed'].apply(lambda x: x if np.abs(x) < 300 else np.nan)
data.df['north_speed'] = data.df['north_speed'].apply(lambda x: x if np.abs(x) < 300 else np.nan)
data.df['speed'] = data.df.apply(lambda x: np.sqrt(x['east_speed']**2 + x['north_speed']**2), axis = 1)
data.df['speed_dif'] = data.df.apply(lambda x: np.abs(x['east_speed'] - x['north_speed']), axis = 1)
# acc 
data.df['acc'] = data.df.apply(lambda x: x['speed']/x['time_dlt'] if x['valid_dlt'] == 1 else np.nan, axis = 1)

In [87]:
def plot_label_each(df, col_name, this_label):
    # print("------- Plotting {} with label {} -------".format(col_name, label_dic[this_label]))
    label_index = np.where(df.label == this_label)[0]
    p = plt.scatter(df.index[label_index], df[col_name][label_index], c = label_cols[this_label - 1], label = label_dic[this_label])
    return p

def plot_label(df, col_name):
    plt.figure(figsize = [12, 8])
    for this_label in np.unique(list(df.label)):
        plot_label_each(df, col_name, this_label)
    plt.legend(loc = 'best')
    

In [88]:
# plot_label(data.df, 'speed')

In [89]:
# plot_label(data.df, 'north_speed')

In [92]:
# plot_label(data.df, 'east_speed')

## WiFi

In [93]:
data.wifi_detail['time'] = data.wifi_detail['time'].astype("int").round(-3)
data.wifi_detail['wifi_rssi'] = data.wifi_detail['rssi'].apply(pd.to_numeric)
data.wifi_detail['wifi_freq'] = data.wifi_detail['freq'].apply(pd.to_numeric)
data.wifi_detail['wifi_freq'] = data.wifi_detail['wifi_freq'].apply(lambda x: 5 if x > 3000 else 2.4)

In [94]:
tmp_wifi_mode = data.wifi_detail[['time', 'wifi_rssi']].groupby(['time'], as_index = False).agg(lambda x: Counter(x).most_common()[0][0]).add_suffix("_mode")
tmp_wifi_mean = data.wifi_detail[['time', 'wifi_rssi']].groupby(['time'], as_index = False).mean().add_suffix("_mean")
tmp_wifi_min = data.wifi_detail[['time', 'wifi_rssi']].groupby(['time'], as_index = False).min().add_suffix("_min")
tmp_wifi_max = data.wifi_detail[['time', 'wifi_rssi']].groupby(['time'], as_index = False).max().add_suffix("_max")
tmp_wifi_std = data.wifi_detail[['time', 'wifi_rssi']].groupby(['time'], as_index = False).std().add_suffix("_std")

In [95]:
tmp_wifi = pd.merge(tmp_wifi_mode.rename({"time_mode": "time"}, axis = 1), tmp_wifi_mean.rename({"time_mean": "time"}, axis = 1), on = ['time'])
tmp_wifi = pd.merge(tmp_wifi, tmp_wifi_min.rename({"time_min": "time"}, axis = 1), on = ['time'])
tmp_wifi = pd.merge(tmp_wifi, tmp_wifi_max.rename({"time_max": "time"}, axis = 1), on = ['time'])
tmp_wifi = pd.merge(tmp_wifi, tmp_wifi_std.rename({"time_std": "time"}, axis = 1), on = ['time'])

data.df = pd.merge(data.df, tmp_wifi, on = ['time'], how = 'left')

In [96]:
# plot_label(data.df, 'wifi_rssi_mean')

## Cells

In [97]:
data.cells_detail['time'] = data.cells_detail['time'].astype("int").round(-3)
data.cells_detail['isRegistered'] = data.cells_detail['isRegistered'].apply(pd.to_numeric)
data.cells_detail['asuLevel'] = data.cells_detail['asuLevel'].apply(pd.to_numeric)
data.cells_detail['dbm'] = data.cells_detail['dbm'].apply(pd.to_numeric)
data.cells_detail['level'] = data.cells_detail['level'].apply(pd.to_numeric)

In [98]:
tmp_cells_mode = data.cells_detail[['time', 'ctype']].groupby(['time'], as_index = False).agg(lambda x: Counter(x).most_common()[0][0]).add_prefix("cells_").add_suffix("_mode")
tmp_cells_mean = data.cells_detail[['time', 'isRegistered', 'asuLevel', 'dbm', 'level']].groupby(['time'], as_index = False).mean().add_prefix("cells_").add_suffix("_mean")
tmp_cells_min = data.cells_detail[['time', 'asuLevel', 'dbm', 'level']].groupby(['time'], as_index = False).min().add_prefix("cells_").add_suffix("_min")
tmp_cells_max = data.cells_detail[['time', 'asuLevel', 'dbm', 'level']].groupby(['time'], as_index = False).max().add_prefix("cells_").add_suffix("_max")
tmp_cells_std = data.cells_detail[['time', 'asuLevel', 'dbm']].groupby(['time'], as_index = False).std().add_prefix("cells_").add_suffix("_std")

In [99]:
tmp_cells = pd.merge(tmp_cells_mode.rename({"cells_time_mode": "time"}, axis = 1), tmp_cells_mean.rename({"cells_time_mean": "time"}, axis = 1), on = ['time'])
tmp_cells = pd.merge(tmp_cells, tmp_cells_min.rename({"cells_time_min": "time"}, axis = 1), on = ['time'])
tmp_cells = pd.merge(tmp_cells, tmp_cells_max.rename({"cells_time_max": "time"}, axis = 1), on = ['time'])
tmp_cells = pd.merge(tmp_cells, tmp_cells_std.rename({"cells_time_std": "time"}, axis = 1), on = ['time'])

data.df = pd.merge(data.df, tmp_cells, on = ['time'], how = 'left')

In [100]:
# plot_label(data.df, 'cells_dbm_mean')

## GPS

In [101]:
data.gps_detail['time'] = data.gps_detail['time'].astype("int").round(-3)
data.gps_detail['gps_snr'] = data.gps_detail['snr'].apply(pd.to_numeric)

In [102]:
tmp_gps_mean = data.gps_detail[['time', 'gps_snr']].groupby(['time'], as_index = False).mean().add_suffix("_mean")
tmp_gps_min = data.gps_detail[['time', 'gps_snr']].groupby(['time'], as_index = False).min().add_suffix("_min")
tmp_gps_max = data.gps_detail[['time', 'gps_snr']].groupby(['time'], as_index = False).max().add_suffix("_max")
tmp_gps_std = data.gps_detail[['time', 'gps_snr']].groupby(['time'], as_index = False).std().add_suffix("_std")

In [103]:
tmp_gps = pd.merge(tmp_gps_mean.rename({"time_mean": "time"}, axis = 1), tmp_gps_min.rename({"time_min": "time"}, axis = 1), on = ['time'])
tmp_gps = pd.merge(tmp_gps, tmp_gps_max.rename({"time_max": "time"}, axis = 1), on = ['time'])
tmp_gps = pd.merge(tmp_gps, tmp_gps_std.rename({"time_std": "time"}, axis = 1), on = ['time'])

In [104]:
data.df = pd.merge(data.df, tmp_gps, on = ['time'], how = 'left')

In [107]:
data.df.to_csv('data/mid_res/data_df_full.csv', header = True, index = False)
# data.df = pd.read_csv('data/mid_res/data_df_full.csv')

## One Hot

In [108]:
def get_one_hot(df, class_col_name):
    X = df[class_col_name].values.reshape(-1, 1)
    enc = OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(X)
    col_names = ["{}_{}".format(class_col_name, i) for i in enc.categories_[0].tolist()]
    return pd.DataFrame(enc.transform(X), columns = col_names)

In [109]:
data.df_hot = data.df.copy()
# discrete v
dis_cols = ['cells_ctype_mode']
for col in dis_cols:
    data.df_hot[col].fillna("miss", inplace = True)
    df_hot = get_one_hot(data.df_hot, col)
    data.df_hot = pd.concat([data.df_hot, df_hot])
    data.df_hot.drop(col, axis = 1, inplace = True)

# continuous v
data.df_hot.fillna(0, inplace = True)

In [110]:
data.df_hot

Unnamed: 0,time,label,accuracy,latitude,longitude,altitude,num_gps,num_wifi,num_cells,time_dlt,...,cells_asuLevel_std,cells_dbm_std,gps_snr_mean,gps_snr_min,gps_snr_max,gps_snr_std,cells_ctype_mode_GSM,cells_ctype_mode_LTE,cells_ctype_mode_WCDMA,cells_ctype_mode_miss
0,1.490432e+12,4.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,1000.0,...,0.0,0.0,22.50,16.0,29.0,5.686241,0.0,0.0,0.0,0.0
1,1.490432e+12,4.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,1000.0,...,0.0,0.0,22.25,15.0,29.0,6.075909,0.0,0.0,0.0,0.0
2,1.490432e+12,4.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,1000.0,...,0.0,0.0,22.25,15.0,29.0,6.075909,0.0,0.0,0.0,0.0
3,1.490432e+12,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1000.0,...,0.0,0.0,22.00,15.0,28.0,5.715476,0.0,0.0,0.0,0.0
4,1.490432e+12,4.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,1000.0,...,0.0,0.0,21.75,15.0,28.0,5.852350,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980522,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,1.0,0.0,0.0
980523,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,1.0,0.0,0.0
980524,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,1.0,0.0,0.0
980525,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,1.0,0.0,0.0


In [77]:
col_drop = ['latitude',
 'longitude',
 'altitude',
 'time_dlt',
 'valid_dlt',
 'east',
 'north',
 'east_dlt',
 'north_dlt',
 'east_speed',
 'north_speed',
 ]

In [111]:
# data.df_hot.drop(col_drop, axis = 1).to_csv('data/mid_res/data_df_hot.csv', header = True, index = False)
data.df_hot.to_csv('data/mid_res/data_df_hot_full.csv', header = True, index = False)
# data.df_hot = pd.read_csv('data/mid_res/data_df_hot.csv')

## General

In [53]:
# tmp_df = data.df[['latitude', 'num_gps', 'num_cells', 'num_wifi', 'label']]
# plt.figure(figsize = [12, 8])
# p = plt.scatter(tmp_df.index, tmp_df.isnull().sum(axis = 1), c = tmp_df.label, alpha = 0.7, s = 0.7)
# plt.colorbar(p)

In [None]:
# data.df['latitude'] = data.df['latitude'].diff().fillna(method = 'bfill')
# data.df['longitude'] = data.df['longitude'].diff().fillna(method = 'bfill')
# data.df['altitude'] = data.df['altitude'].diff().fillna(method = 'bfill')

# data.df.replace(np.nan, 0, inplace = True)

In [81]:
data.df

Unnamed: 0,time,accuracy,num_gps,num_wifi,num_cells,label,speed,speed_dif,acc,wifi_rssi_mode,...,cells_level_min,cells_asuLevel_max,cells_dbm_max,cells_level_max,cells_asuLevel_std,cells_dbm_std,gps_snr_mean,gps_snr_min,gps_snr_max,gps_snr_std
0,1490431583000,,4.0,6.0,,4,,,,-48.0,...,,,,,,,22.500000,16.0,29.0,5.686241
1,1490431584000,,4.0,6.0,,4,,,,-48.0,...,,,,,,,22.250000,15.0,29.0,6.075909
2,1490431585000,,4.0,6.0,,4,,,,-48.0,...,,,,,,,22.250000,15.0,29.0,6.075909
3,1490431586000,,4.0,,,4,,,,,...,,,,,,,22.000000,15.0,28.0,5.715476
4,1490431587000,,4.0,6.0,,4,,,,-48.0,...,,,,,,,21.750000,15.0,28.0,5.852350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98047,1490861603000,16.0,14.0,8.0,,6,8.961595,0.276732,0.008962,-64.0,...,,,,,,,22.071429,12.0,30.0,4.968583
98048,1490861604000,16.0,14.0,8.0,,6,10.524812,0.288508,0.010525,-64.0,...,,,,,,,22.357143,12.0,30.0,5.569244
98049,1490861605000,16.0,14.0,8.0,,6,11.910520,0.106698,0.011911,-64.0,...,,,,,,,22.857143,11.0,32.0,6.249396
98050,1490861606000,16.0,14.0,8.0,,6,12.564799,0.269258,0.012565,-64.0,...,,,,,,,22.785714,11.0,32.0,6.459085


In [None]:
# plt.figure(figsize = [20, 10])
# p = plt.scatter(data.df.index, data.df['num_wifi'], c = data.df['label'], alpha = 0.4)
# plt.colorbar(p)

In [66]:
col_x = ['time',
 'label',
 'accuracy',
 'num_gps',
 'num_wifi',
 'num_cells',
 'speed',
 'acc',
 'wifi_rssi_mode',
 'wifi_rssi_mean',
 'wifi_rssi_min',
 'wifi_rssi_max',
 'wifi_rssi_std',
#  'cells_ctype_mode',
 'cells_isRegistered_mean',
 'cells_asuLevel_mean',
 'cells_dbm_mean',
 'cells_level_mean',
 'cells_asuLevel_min',
 'cells_dbm_min',
 'cells_level_min',
 'cells_asuLevel_max',
 'cells_dbm_max',
 'cells_level_max',
 'cells_asuLevel_std',
 'cells_dbm_std',
 'gps_snr',
 'gps_snr_mean',
 'gps_snr_min',
 'gps_snr_max',
 'gps_snr_std']

In [80]:
# data.df[col_x].to_csv('data/mid_res/data_df.csv', header = True, index = False)
# data.df = pd.read_csv('data/mid_res/data_df.csv')

## Plot (supplement)

In [None]:
plt.figure(figsize = [12, 8])
sns.violin(x = 'label', y = np.log(data.df.speed + 1), data = data.df)

In [29]:
label_dic

{1: 'Still',
 2: 'Walking',
 3: 'Run',
 4: 'Bike',
 5: 'Car',
 6: 'Bus',
 7: 'Train',
 8: 'Subway'}

# Validate

In [None]:
val = DataLoader.SHLDataLoader('data/validate/', ratio = None)

In [None]:
val.load_all(detail = False)

In [None]:
# unit: 1s
val.loc['time'] = val.loc.apply(lambda x: x['time'].astype('int').round(-3), axis = 1)
val.gps['time'] = val.gps['time'].astype('int').round(-3)
val.wifi['time'] = val.wifi['time'].astype('int').round(-3)
val.cells['time'] = val.cells['time'].astype('int').round(-3)
# data type
val.gps['number'] = val.gps['number'].apply(pd.to_numeric)
val.wifi['number'] = val.wifi['number'].apply(pd.to_numeric)
val.cells['number'] = val.cells['number'].apply(pd.to_numeric)

In [None]:
val.loc = val.loc.groupby(['time'], as_index = False).mean()[val.loc.columns.to_list()]
val.gps = val.gps.groupby(['time'], as_index = False).mean()[val.gps.columns.to_list()]
val.wifi = val.wifi.groupby(['time'], as_index = False).mean()[val.wifi.columns.to_list()]
val.cells = val.cells.groupby(['time'], as_index = False).mean()[val.cells.columns.to_list()]

In [None]:
val.df = pd.merge(val.loc, val.gps.rename({"number": "num_gps"}, axis = 1), on = ['time'], how = 'outer')
val.df = pd.merge(val.df, val.wifi.rename({"number": "num_wifi"}, axis = 1), on = ['time'], how = 'outer')
val.df = pd.merge(val.df, val.cells.rename({"number": "num_cells"}, axis = 1), on = ['time'], how = 'outer')
val.df = pd.merge(val.df, val.label, on = ['time'], how = 'right')

In [None]:
val.df['latitude'] = val.df['latitude'].diff().fillna(method = 'bfill')
val.df['longitude'] = val.df['longitude'].diff().fillna(method = 'bfill')
val.df['altitude'] = val.df['altitude'].diff().fillna(method = 'bfill')

val.df.replace(np.nan, 0, inplace = True)

In [None]:
pred_val = clf.predict(val.df[['latitude', 'longitude', 'altitude', 'num_gps', 'num_wifi']])

In [None]:
confusion_matrix(pred_val, val.df['label'])

In [None]:
precision_score(pred_val, val.df['label'], average ="micro")