In [6]:
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt

from utils import DataLoader, TimeKeeper

In [2]:
# Number of leading raw rows that the Wifi / GPS / Cells cells below expand
# into their per-entry "detail" tables (used as `.iloc[:detail_num]`).
detail_num = 1000

In [3]:
# Instantiate the project-local SHLDataLoader (from utils.DataLoader), pointed at the training directory.
data = DataLoader.SHLDataLoader('data/train/')

In [4]:
# Run the loader. Per its recorded log it reads Label, Location, Wifi, GPS and Cells
# and extracts detail rows for the last three (about 80 s in total on the recorded run).
data.load()

Label 读取完成，共 980527 条数据，用时 0.15s
Location 读取完成，共 911109 条数据，用时 0.56s
Wifi 读取完成，共 1459351 条数据，用时 31.56s
	-- Wifi 详细信息提取完成，共 1000 条数据，提取了 Wifi 中前 1000 行，用时 0.59s
GPS 读取完成，共 1322749 条数据，用时 23.71s
	-- GPS 详细信息提取完成，共 1000 条数据，提取了 GPS 中前 1000 行，用时 0.68s
Cells 读取完成，共 1324881 条数据，用时 22.38s
	-- Cells 详细信息提取完成，共 1000 条数据，提取了 Cells 中前 1000 行，用时 1.52s


## Location

In [None]:
# Column layout used for Location.txt (space-separated, no header row).
# The two 'ign*' columns are parsed only so the remaining fields line up, then dropped.
loc_names = ['time', 'ign1', 'ign2', 'accuracy', 'latitude', 'longitude', 'altitude']
loc = (
    pd.read_table('data/train/Location.txt', sep = " ", header = None, names = loc_names)
    .drop(columns = ['ign1', 'ign2'])
)
loc.head()

## Label

In [4]:
# Label.txt is read as tab-separated text without a header row: one (time, label) pair per line.
label_names = ['time', 'label']
label = pd.read_table(
    'data/train/Label.txt',
    sep = "\t",
    header = None,
    names = label_names,
)
# Optional look at the share of each label (left disabled):
# label.groupby('label').count().apply(lambda x: x/label.shape[0], axis = 1)
label.head()

Unnamed: 0,time,label
0,1490431583000,4
1,1490431584000,4
2,1490431585000,4
3,1490431586000,4
4,1490431587000,4


## Wifi

In [5]:
def wifi_detail_transformer(wifi_list, this_time):
    """Turn the flat field list of one Wifi record into a table with one row per entry.

    Parameters
    ----------
    wifi_list : sequence
        Flat sequence of values; every consecutive group of five becomes one
        row with the columns bssid, ssid, rssi, freq, cap.
    this_time : object
        Value written into the 'time' column of every produced row.

    Returns
    -------
    pandas.DataFrame
        Columns 'time', 'bssid', 'ssid', 'rssi', 'freq', 'cap'.

    Raises
    ------
    ValueError
        If the number of values is not a multiple of five (raised by the reshape).
    """
    field_names = ('bssid', 'ssid', 'rssi', 'freq', 'cap')
    # Fold the flat list into rows of len(field_names) values each.
    grouped = np.array(wifi_list).reshape(-1, len(field_names))
    table = pd.DataFrame(grouped, columns = field_names)
    # Stamp every row with the record's time as the leading column.
    table.insert(loc = 0, column = 'time', value = [this_time] * len(table))
    return table

In [56]:
# Wall-clock checkpoints: before reading, after the summary columns, after the detail table.
time_recorder = [time.time()]
wifi = pd.read_table('data/train/Wifi.txt', header = None)
# Each raw line is ';'-separated: field 0 goes to 'time', field 3 to 'number'.
wifi['time'] = wifi[0].map(lambda line: line.split(";")[0])
wifi['number'] = wifi[0].map(lambda line: line.split(";")[3])
time_recorder.append(time.time())
# detail info: expand only the first `detail_num` raw lines; fields from index 4 on
# are handed to wifi_detail_transformer together with the line's timestamp.
wifi_detail = [
    wifi_detail_transformer(fields[4:], fields[0])
    for fields in (line.split(";") for line in wifi[0].iloc[:detail_num])
]
wifi_detail = pd.concat(wifi_detail).reset_index(drop = True)
time_recorder.append(time.time())
# delete raw data
wifi.drop(columns = [0], inplace = True)
print(pd.Series(time_recorder).diff(1))

0          NaN
1    33.852244
2     0.663604
dtype: float64


## GPS

In [57]:
def gps_detail_transformer(gps_list, this_time):
    """Turn the flat field list of one GPS record into a table with one row per entry.

    Parameters
    ----------
    gps_list : sequence
        Flat sequence of values; every consecutive group of four becomes one
        row with the columns id, snr, azimuth, elevation.
    this_time : object
        Value written into the 'time' column of every produced row.

    Returns
    -------
    pandas.DataFrame
        Columns 'time', 'id', 'snr', 'azimuth', 'elevation'.

    Raises
    ------
    ValueError
        If the number of values is not a multiple of four (raised by the reshape).
    """
    field_names = ('id', 'snr', 'azimuth', 'elevation')
    # Fold the flat list into rows of len(field_names) values each.
    grouped = np.array(gps_list).reshape(-1, len(field_names))
    table = pd.DataFrame(grouped, columns = field_names)
    # Stamp every row with the record's time as the leading column.
    table.insert(loc = 0, column = 'time', value = [this_time] * len(table))
    return table

In [58]:
# Wall-clock checkpoints used by the timing printout in the following cell.
time_recorder = []
time_recorder.append(time.time())
gps = pd.read_table('data/train/GPS.txt', header = None)
# Raw GPS lines are split on single spaces: the first token goes to 'time',
# the last token to 'number'.
gps['time'] = gps[0].map(lambda line: line.split(" ")[0])
gps['number'] = gps[0].map(lambda line: line.split(" ")[-1])
time_recorder.append(time.time())
# detail info: expand only the first `detail_num` raw lines. Tokens 3..-1 are the
# per-entry fields handed to gps_detail_transformer.
# The timestamp is token 0 of the SAME space-split. Splitting on ";" here (as the
# previous version did) returns the entire raw line, so gps_detail['time'] ended up
# holding whole records instead of timestamps.
gps_detail = [
    gps_detail_transformer(tokens[3:-1], tokens[0])
    for tokens in (line.split(" ") for line in gps[0].iloc[:detail_num])
]
gps_detail = pd.concat(gps_detail).reset_index(drop = True)
time_recorder.append(time.time())
# NOTE(review): this second consecutive checkpoint looks accidental (it only measures
# ~0 s); it is kept so the timing printout in the next cell still has four rows.
time_recorder.append(time.time())
# delete raw data
gps.drop(columns = [0], inplace = True)

In [59]:
# Seconds elapsed between consecutive checkpoints stored in time_recorder
# (the first entry has no predecessor, hence NaN).
print(pd.Series(time_recorder).diff(1))

0          NaN
1    26.050008
2     0.588471
3     0.000024
dtype: float64


## Cells

In [160]:
def cells_detail_transformer(cells_list, this_time):
    """Expand one raw Cells line into a table with one row per LTE/GSM/WCDMA entry.

    Parameters
    ----------
    cells_list : str
        One raw, space-separated line. Every entry starts with the token
        ``LTE``, ``GSM`` or ``WCDMA`` (surrounded by spaces) followed by its
        fields; anything before the first such token is ignored.
    this_time : object
        Value written into the 'time' column of every produced row.

    Returns
    -------
    pandas.DataFrame
        Columns 'time', 'ctype', 'registered', 'id', 'lac', 'mmc', 'mnc',
        'dbm', 'level1', 'level2'; all parsed fields stay strings.

    Raises
    ------
    ValueError
        If a matched type token is not one of the three known types.
    IndexError
        If an entry has fewer fields than its type's layout requires.
    """
    type_pattern = " LTE | GSM | WCDMA "
    # For each cell type: positions of its raw fields, listed in the order of the
    # shared output columns (registered, id, lac, mmc, mnc, dbm, level1, level2).
    field_positions = {
        'LTE': (0, 1, 4, 2, 3, 7, 6, 8),
        'GSM': (0, 1, 2, 3, 4, 6, 5, 7),
        'WCDMA': (0, 1, 2, 3, 4, 7, 6, 8),
    }

    # The same pattern yields the type tokens (findall) and the field chunks
    # between them (split); the chunk before the first token is dropped.
    type_tokens = re.findall(type_pattern, cells_list)
    field_chunks = re.split(type_pattern, cells_list)[1:]

    rows = []
    for token, chunk in zip(type_tokens, field_chunks):
        kind = token.strip()
        if kind not in field_positions:
            raise ValueError("Unrecognized cell type {}".format(token))
        fields = chunk.split(" ")
        rows.append([kind] + [fields[pos] for pos in field_positions[kind]])

    column_names = ('ctype', 'registered', 'id', 'lac', 'mmc', 'mnc', 'dbm', 'level1', 'level2')
    table = pd.DataFrame(rows, columns = column_names)
    table.insert(loc = 0, column = 'time', value = [this_time] * len(type_tokens))

    return table

In [164]:
cells = pd.read_table('data/train/Cells.txt', header = None)
# Raw Cells lines are split on single spaces: token 0 goes to 'time', token 3 to 'number'.
cells['time'] = cells[0].map(lambda line: line.split(" ")[0])
cells['number'] = cells[0].map(lambda line: line.split(" ")[3])
# Expand only the first `detail_num` raw lines into one row per cell entry.
cells_detail = [
    cells_detail_transformer(line, line.split(" ")[0])
    for line in cells[0].iloc[:detail_num]
]
cells_detail = pd.concat(cells_detail).reset_index(drop = True)
# The raw text column is removed once both tables have been built.
cells.drop(columns = [0], inplace = True)

In [165]:
# Preview of the per-record summary table (columns 'time' and 'number').
cells.head()

Unnamed: 0,time,number
0,1490430923343,3
1,1490430923349,3
2,1490430947385,13
3,1490430953392,3
4,1490430958903,3


In [166]:
# Preview of the expanded table: one row per cell entry, keyed by the record's 'time'.
cells_detail.head()

Unnamed: 0,time,ctype,registered,id,lac,mmc,mnc,dbm,level1,level2
0,1490430923343,LTE,1,128000386,20,234,10,-115,25,2
1,1490430923343,LTE,0,2147483647,398,234,10,-118,22,1
2,1490430923343,LTE,0,2147483647,4,234,10,-124,16,1
3,1490430923349,LTE,1,128000386,20,234,10,-115,25,2
4,1490430923349,LTE,0,2147483647,398,234,10,-118,22,1


In [15]:
# Human-readable field descriptions for each cell type, split into lists on "; ".
# They are reference notes only; cells_detail_transformer does not read them.
# NOTE(review): `ite_names` is presumably a typo for `lte_names` (kept because a later cell displays it).
# NOTE(review): the LTE and GSM descriptions do not obviously match the field order that
# cells_detail_transformer assumes — confirm against the dataset documentation.
ite_names = 'LTE; Signal level; Signal strength; Signal level; 28-bit Cell Identity; 3-digit Mobile Country Code; 2 or 3-digit Mobile Network Code; Physical Cell Id; 16-bit Tracking Area Code'.split("; ")
gsm_names = 'GSM; Signal level calculated based on 3GPP RSRP; Signal strength; Signal level; 16-bit GSM Cell Identity described in TS 27.007; 16-bit Location Area Code; 3-digit Mobile Country Code; 2 or 3-digit Mobile Network Code'.split("; ")
wcdma_names = 'WCDMA; isRegistered; cid; lac; MCC; MNC; PSC; asuLevel; dBm; level'.split("; ")

In [17]:
# Show the LTE field descriptions as a list.
ite_names

['LTE',
 'Signal level',
 'Signal strength',
 'Signal level',
 '28-bit Cell Identity',
 '3-digit Mobile Country Code',
 '2 or 3-digit Mobile Network Code',
 'Physical Cell Id',
 '16-bit Tracking Area Code']

In [140]:
# Grab two raw Cells lines to exercise cells_detail_transformer by hand.
# `cells` drops its raw text column right after parsing, so on a fresh top-to-bottom run
# `cells.iloc[0, 0]` would return the 'time' string rather than the raw line.
# Re-read just the first three raw lines from the file instead (cheap thanks to nrows).
cells_raw_sample = pd.read_table('data/train/Cells.txt', header = None, nrows = 3)
t = cells_raw_sample.iloc[0, 0]
t2 = cells_raw_sample.iloc[2, 0]

In [141]:
# Show the first sample line (row 0).
t

'1490430923343 53750757282 242658854 3 LTE 1 128000386 234 10 20 144 25 -115 2 LTE 0 2147483647 234 10 398 2147483647 22 -118 1 LTE 0 2147483647 234 10 4 2147483647 16 -124 1'

In [142]:
# Show the second sample line (row 2).
t2

'1490430947385 77792426028 242658854 13 LTE 1 128000386 234 10 20 144 20 -120 1 GSM 0 2147483647 2147483647 234 10 10 -93 3 GSM 0 2147483647 2147483647 234 10 6 -101 2 GSM 0 2147483647 2147483647 234 10 5 -103 2 GSM 0 2147483647 2147483647 234 10 2 -109 0 GSM 0 2147483647 2147483647 234 10 2 -109 0 GSM 0 2147483647 2147483647 234 10 2 -109 0 WCDMA 0 2147483647 2147483647 234 10 250 3 -107 1 WCDMA 0 2147483647 2147483647 234 10 14 3 -107 1 WCDMA 0 2147483647 2147483647 234 10 158 0 -113 0 LTE 0 2147483647 234 10 398 2147483647 20 -120 1 LTE 0 2147483647 234 10 4 2147483647 19 -121 1 LTE 0 2147483647 234 10 20 2147483647 19 -121 1'

In [145]:
# Manual check of the parser on the single sample line t2, with a dummy time value of 1.
# NOTE(review): this rebinds `cells_detail`, replacing the table built from the first
# `detail_num` rows earlier in the notebook — consider a separate name for the check.
cells_detail = cells_detail_transformer(t2, 1)

In [146]:
# Display the table produced by the manual parser check.
cells_detail

Unnamed: 0,time,ctype,registered,id,lac,mmc,mnc,dbm,level1,level2
0,1,LTE,1,128000386,20,234,10,-120,20,1
1,1,GSM,0,2147483647,2147483647,234,10,-93,10,3
2,1,GSM,0,2147483647,2147483647,234,10,-101,6,2
3,1,GSM,0,2147483647,2147483647,234,10,-103,5,2
4,1,GSM,0,2147483647,2147483647,234,10,-109,2,0
5,1,GSM,0,2147483647,2147483647,234,10,-109,2,0
6,1,GSM,0,2147483647,2147483647,234,10,-109,2,0
7,1,WCDMA,0,2147483647,2147483647,234,10,-107,3,1
8,1,WCDMA,0,2147483647,2147483647,234,10,-107,3,1
9,1,WCDMA,0,2147483647,2147483647,234,10,-113,0,0
