In [7]:
import pandas as pd
import numpy as np
import re
import time

In [52]:
detail_num = 1000

In [105]:
class TimeKeeper():
    def __init__(self):
        self.current_time = time.time()

    def update_time(self):
        self.current_time = time.time()
    
    def set_time(self, customized_time):
        self.current_time = customized_time

    def get_time_current(self):
        return self.current_time
    
    def get_time_elapse(self):
        return time.time() - self.current_time

    def get_update_time(self, round_digit = 2):
        time_elapse = self.get_time_elapse()
        self.update_time()
        return round(time_elapse, 2)

In [106]:
def read_shl_data(root_path, detail_num = 1000):
    timer = TimeKeeper()
    # ------------------------------------- Label -------------------------------------
    label_names = ['time', 'label']
    label = pd.read_table('data/train/Label.txt', header = None, names = label_names, sep = "\t")
    print("Label 读取完成，共 {} 条数据，用时 {}s".format(label.shape[0], timer.get_update_time()))
    # ----------------------------------- Location ------------------------------------
    loc_names = ['time', 'ign1', 'ign2', 'accuracy', 'latitude', 'longitude', 'altitude']
    loc = pd.read_table(root_path + 'Location.txt', header = None, names = loc_names, sep = " ").drop(['ign1', 'ign2'], axis = 1)
    print("Location 读取完成，共 {} 条数据，用时 {}s".format(loc.shape[0], timer.get_update_time()))
    # ------------------------------------- Wifi --------------------------------------
    wifi = pd.read_table(root_path + 'Wifi.txt', header = None)
    wifi['time'] = wifi.apply(lambda x: x[0].split(";")[0], axis = 1)
    wifi['number'] = wifi.apply(lambda x: x[0].split(";")[3], axis = 1)
    print("Wifi 读取完成，共 {} 条数据，用时 {}s".format(wifi.shape[0], timer.get_update_time()))
    # detail info
    wifi_detail = wifi.iloc[:detail_num,:].apply(lambda x: wifi_detail_transformer(x[0].split(";")[4:], x[0].split(";")[0]), axis = 1)
    wifi_detail = pd.concat(list(wifi_detail)).reset_index(drop = True)
    print("\t-- Wifi 详细信息提取完成，共 {} 条数据，提取了 Wifi 中前 {} 行，用时 {}s".format(wifi_detail.shape[0], detail_num, timer.get_update_time()))
    # delete raw data
    wifi.drop([0], axis = 1, inplace = True)
    # -------------------------------------- GPS --------------------------------------
    gps = pd.read_table(root_path + 'GPS.txt', header = None)
    gps['time'] = gps.apply(lambda x: x[0].split(" ")[0], axis = 1)
    gps['number'] = gps.apply(lambda x: x[0].split(" ")[-1], axis = 1)
    print("GPS 读取完成，共 {} 条数据，用时 {}s".format(gps.shape[0], timer.get_update_time()))
    # detail info
    gps_detail = gps.iloc[:detail_num,:].apply(lambda x: gps_detail_transformer(x[0].split(" ")[3:-1], x[0].split(";")[0]), axis = 1)
    gps_detail = pd.concat(list(gps_detail)).reset_index(drop = True)
    print("\t-- GPS 详细信息提取完成，共 {} 条数据，提取了 GPS 中前 {} 行，用时 {}s".format(gps_detail.shape[0], detail_num, timer.get_update_time()))
    # delete raw data
    gps.drop([0], axis = 1, inplace = True)
    # -------------------------------------- Cells --------------------------------------
    cells = pd.read_table(root_path + 'Cells.txt', header = None)
    cells['time'] = cells.apply(lambda x: x[0].split(" ")[0], axis = 1)
    cells['number'] = cells.apply(lambda x: x[0].split(" ")[3], axis = 1)
    cells.drop([0], axis = 1, inplace = True)
    print("Cells 读取完成，共 {} 条数据，用时 {}s".format(gps.shape[0], timer.get_update_time()))

    return {"label": label, "loc": loc, "wifi": wifi, "wifi_detail": wifi_detail,
            "gps": gps, "gps_detail": gps_detail, "cells": cells}

In [107]:
data = read_shl_data('data/train/')

Label 读取完成，共 980527 条数据，用时 0.13s
Location 读取完成，共 911109 条数据，用时 0.55s
Wifi 读取完成，共 1459351 条数据，用时 34.89s
	-- Wifi 详细信息提取完成，共 6752 条数据，提取了 Wifi 中前 1000 行，用时 0.68s
GPS 读取完成，共 1322749 条数据，用时 26.88s
	-- GPS 详细信息提取完成，共 5516 条数据，提取了 GPS 中前 1000 行，用时 0.62s
Cells 读取完成，共 1322749 条数据，用时 25.1s


In [3]:
from utils import data_loader

data = data_loader.SHLDataLoader('data/train/')

In [4]:
data.load()

Label 读取完成，共 980527 条数据，用时 0.15s
Location 读取完成，共 911109 条数据，用时 0.56s
Wifi 读取完成，共 1459351 条数据，用时 34.69s
	-- Wifi 详细信息提取完成，共 1000 条数据，提取了 Wifi 中前 1000 行，用时 0.63s
GPS 读取完成，共 1322749 条数据，用时 25.9s
	-- GPS 详细信息提取完成，共 1000 条数据，提取了 GPS 中前 1000 行，用时 0.62s
Cells 读取完成，共 1324881 条数据，用时 24.63s


## Location

In [None]:
loc_names = ['time', 'ign1', 'ign2', 'accuracy', 'latitude', 'longitude', 'altitude']
loc = pd.read_table('data/train/Location.txt', header = None, names = loc_names, sep = " ").drop(['ign1', 'ign2'], axis = 1)
loc.head()

## Label

In [4]:
label_names = ['time', 'label']
label = pd.read_table('data/train/Label.txt', header = None, names = label_names, sep = "\t")
# label.groupby('label').count().apply(lambda x: x/label.shape[0], axis = 1)
label.head()

Unnamed: 0,time,label
0,1490431583000,4
1,1490431584000,4
2,1490431585000,4
3,1490431586000,4
4,1490431587000,4


## Wifi

In [5]:
def wifi_detail_transformer(wifi_list, this_time):
    res = np.array(wifi_list).reshape(-1, 5)
    res = pd.DataFrame(res, columns = ('bssid', 'ssid', 'rssi', 'freq', 'cap'))
    res.insert(loc = 0, column = 'time', value = [this_time] * res.shape[0])
    return res

In [56]:
time_recorder = []
time_recorder.append(time.time())
wifi = pd.read_table('data/train/Wifi.txt', header = None)
wifi['time'] = wifi.apply(lambda x: x[0].split(";")[0], axis = 1)
wifi['number'] = wifi.apply(lambda x: x[0].split(";")[3], axis = 1)
time_recorder.append(time.time())
# detail info
wifi_detail = wifi.iloc[:detail_num,:].apply(lambda x: wifi_detail_transformer(x[0].split(";")[4:], x[0].split(";")[0]), axis = 1)
wifi_detail = pd.concat(list(wifi_detail)).reset_index(drop = True)
time_recorder.append(time.time())
# delete raw data
wifi.drop([0], axis = 1, inplace = True)
print(pd.Series(time_recorder).diff(1))

0          NaN
1    33.852244
2     0.663604
dtype: float64


## GPS

In [57]:
def gps_detail_transformer(gps_list, this_time):
    res = np.array(gps_list).reshape(-1, 4)
    res = pd.DataFrame(res, columns = ('id', 'snr', 'azimuth', 'elevation'))
    res.insert(loc = 0, column = 'time', value = [this_time] * res.shape[0])
    # print(res)
    return res

In [58]:
time_recorder = []
time_recorder.append(time.time())
gps = pd.read_table('data/train/GPS.txt', header = None)
gps['time'] = gps.apply(lambda x: x[0].split(" ")[0], axis = 1)
gps['number'] = gps.apply(lambda x: x[0].split(" ")[-1], axis = 1)
time_recorder.append(time.time())
# detail info
gps_detail = gps.iloc[:detail_num,:].apply(lambda x: gps_detail_transformer(x[0].split(" ")[3:-1], x[0].split(";")[0]), axis = 1)
gps_detail = pd.concat(list(gps_detail)).reset_index(drop = True)
time_recorder.append(time.time())
time_recorder.append(time.time())
# delete raw data
gps.drop([0], axis = 1, inplace = True)

In [59]:
print(pd.Series(time_recorder).diff(1))

0          NaN
1    26.050008
2     0.588471
3     0.000024
dtype: float64


## Cells

In [68]:
cells = pd.read_table('data/train/Cells.txt', header = None)
cells['time'] = cells.apply(lambda x: x[0].split(" ")[0], axis = 1)
cells['number'] = cells.apply(lambda x: x[0].split(" ")[3], axis = 1)

# cells.drop([0], axis = 1, inplace = True)

In [70]:
cells.head()

Unnamed: 0,0,time,number
0,1490430923343 53750757282 242658854 3 LTE 1 12...,1490430923343,3
1,1490430923349 53756677074 242658854 3 LTE 1 12...,1490430923349,3
2,1490430947385 77792426028 242658854 13 LTE 1 1...,1490430947385,13
3,1490430953392 83799049986 242658854 3 LTE 1 12...,1490430953392,3
4,1490430958903 89310166652 242658854 3 LTE 1 12...,1490430958903,3


In [None]:
def cells_detail_transformer(cells_list, this_time):
    res = np.array(gps_list).reshape(-1, 4)
    res = pd.DataFrame(res, columns = ('id', 'snr', 'azimuth', 'elevation'))
    res.insert(loc = 0, column = 'time', value = [this_time] * res.shape[0])
    # print(res)
    return res

In [78]:
ite_names = 'LTE; Signal level; Signal strength; Signal level; 28-bit Cell Identity; 3-digit Mobile Country Code; 2 or 3-digit Mobile Network Code; Physical Cell Id; 16-bit Tracking Area Code'.split("; ")
gsm_names = 'GSM; Signal level calculated based on 3GPP RSRP; Signal strength; Signal level; 16-bit GSM Cell Identity described in TS 27.007; 16-bit Location Area Code; 3-digit Mobile Country Code; 2 or 3-digit Mobile Network Code'.split("; ")
wcdma_names = 'WCDMA; isRegistered; cid; lac; MCC; MNC; PSC; asuLevel; dBm; level'.split("; ")

In [82]:
ite_names

['LTE',
 'Signal level',
 'Signal strength',
 'Signal level',
 '28-bit Cell Identity',
 '3-digit Mobile Country Code',
 '2 or 3-digit Mobile Network Code',
 'Physical Cell Id',
 '16-bit Tracking Area Code']

In [81]:
cells.iloc[0:10,0].apply(lambda x: len(x.split(" ")[4:]))

0     30
1     30
2    124
3     30
4     30
5     30
6    134
7     40
8     40
9     20
Name: 0, dtype: int64

In [66]:
list(cells.iloc[0:10,0])

['1490430923343 53750757282 242658854 3 LTE 1 128000386 234 10 20 144 25 -115 2 LTE 0 2147483647 234 10 398 2147483647 22 -118 1 LTE 0 2147483647 234 10 4 2147483647 16 -124 1',
 '1490430923349 53756677074 242658854 3 LTE 1 128000386 234 10 20 144 25 -115 2 LTE 0 2147483647 234 10 398 2147483647 22 -118 1 LTE 0 2147483647 234 10 4 2147483647 16 -124 1',
 '1490430947385 77792426028 242658854 13 LTE 1 128000386 234 10 20 144 20 -120 1 GSM 0 2147483647 2147483647 234 10 10 -93 3 GSM 0 2147483647 2147483647 234 10 6 -101 2 GSM 0 2147483647 2147483647 234 10 5 -103 2 GSM 0 2147483647 2147483647 234 10 2 -109 0 GSM 0 2147483647 2147483647 234 10 2 -109 0 GSM 0 2147483647 2147483647 234 10 2 -109 0 WCDMA 0 2147483647 2147483647 234 10 250 3 -107 1 WCDMA 0 2147483647 2147483647 234 10 14 3 -107 1 WCDMA 0 2147483647 2147483647 234 10 158 0 -113 0 LTE 0 2147483647 234 10 398 2147483647 20 -120 1 LTE 0 2147483647 234 10 4 2147483647 19 -121 1 LTE 0 2147483647 234 10 20 2147483647 19 -121 1',
 '14

In [87]:
cells.iloc[4,0]

NameError: name 'ells' is not defined