In [1]:
import ast
import os

import numpy as np
import pandas as pd

# 1. Data preprocessing
The aim is to convert the rows of each '.dat' file into a DataFrame with columns ['rxaddr', 'rssi'], and to store the DataFrame of the i-th file as the i-th element of a list.
(1) Read the raw files. Each row in a raw file looks like this:
[{'txAddr': '12:3B:6A:1B:9A:D2 ', 'rxAddr': '02', 'rssi': '-74', 'ts': 1571373024}]
(2) Before the files are read, the file list is sorted by name so that the order of the resulting list is deterministic.
(3)The input is the work directory where all '.dat' files are stored.

In [2]:
"""define a function dat2df"""
def _parse_dat_row(line):
    """Parse one raw text row of a '.dat' file into a tuple of record dicts."""
    # Same normalisation as before: drop the list brackets and all spaces so
    # that a row such as "[{'rxAddr': '02', 'rssi': '-74', ...}]" becomes a
    # plain dict literal.
    text = line.replace('[', '').replace(']', '').replace(' ', '')
    if not text:
        return ()
    # literal_eval only accepts Python literals, so file content can never be
    # executed as code (the previous implementation used eval()).
    parsed = ast.literal_eval(text)
    # A row holding several dicts parses to a tuple; a single dict is wrapped.
    return parsed if isinstance(parsed, tuple) else (parsed,)


def dat2df(workpath):
    """Load every '.dat' file found in ``workpath`` into a list of DataFrames.

    :param workpath: directory that contains the '.dat' files.
    :return: list ``hkstp``; ``hkstp[i]`` is the DataFrame of the i-th '.dat'
             file (files sorted by name) with two columns:
             'rxaddr' (the 'rxAddr' value exactly as stored in the file) and
             'rssi' (the 'rssi' value converted to int).

    Files whose extension is not exactly '.dat' are ignored.
    """
    # Sort by name so the position in the returned list is deterministic.
    dat_files = sorted(
        name for name in os.listdir(workpath)
        if os.path.splitext(name)[1] == '.dat'
    )

    hkstp = []
    for name in dat_files:
        # Join with workpath: reading the bare file name only works when the
        # data directory happens to be the current working directory.
        raw = pd.read_table(os.path.join(workpath, name), header=None)

        rxaddr = []
        rssi = []
        # Column 0 holds the whole text row (the rows contain no tab separator).
        for line in raw[0]:
            for record in _parse_dat_row(line):
                rxaddr.append(record['rxAddr'])
                rssi.append(int(record['rssi']))

        hkstp.append(pd.DataFrame({'rxaddr': rxaddr, 'rssi': rssi}))

    return hkstp

In [3]:
# Read every '.dat' file from the notebook's current working directory.
path = os.getcwd()
fingerprint_dataframe = dat2df(path)
# dat2df returns a plain Python list holding one DataFrame per file.
type(fingerprint_dataframe)

list

In [4]:
# Sanity check: the length should equal the number of '.dat' files read.
len(fingerprint_dataframe)

96

In [5]:
# First element of the list; per the author's note it corresponds to file hkstp_fp_01_1.
fingerprint_dataframe[0]

Unnamed: 0,rxaddr,rssi
0,2,-74
1,2,-74
2,2,-69
3,1,-76
4,2,-69
5,2,-71
6,2,-70
7,1,-70
8,1,-69
9,1,-69


# 2. Decomposition of the fixed positions (24 in this data set) into 4 directions
Each fixed position was recorded in 4 directions, so the file list is split into 4 lists containing the dir1, dir2, dir3 and dir4 recordings respectively.

In [6]:
"""define the decomposition function"""
def decomposition(hkstp):
    """Split the per-file list into four per-direction lists.

    Elements 0, 4, 8, ... go to the first list, elements 1, 5, 9, ... to the
    second, and so on. This assumes the input is ordered position by position
    with the four directions of a position stored consecutively
    (hkstp_fp_i_1 .. hkstp_fp_i_4) -- TODO confirm against the file naming.

    :param hkstp: list as returned by dat2df()
    :return: tuple (direction_1, direction_2, direction_3, direction_4) of lists
    """
    total = len(hkstp)

    def every_fourth(offset):
        # Collect hkstp[offset], hkstp[offset + 4], ... as a new list.
        return [hkstp[position] for position in range(offset, total, 4)]

    return every_fourth(0), every_fourth(1), every_fourth(2), every_fourth(3)

In [7]:
# Split the per-file frames into the four direction lists.
dir_1, dir_2, dir_3, dir_4 = decomposition(fingerprint_dataframe)
# Each list should have length 24 (96 / 4); print one length per line.
print(*(len(group) for group in (dir_1, dir_2, dir_3, dir_4)), sep="\n")

24
24
24
24


In [8]:
# First frame of direction 1; decomposition() takes it from fingerprint_dataframe[0].
dir_1[0]

Unnamed: 0,rxaddr,rssi
0,2,-74
1,2,-74
2,2,-69
3,1,-76
4,2,-69
5,2,-71
6,2,-70
7,1,-70
8,1,-69
9,1,-69


# 3. Creating a feature vector
After reviewing several fingerprinting algorithms, I chose a feature vector (radio map) made of the average RSSI of each sensor. In our case there are 23 sensors, so the vector is [s1_average, s2_average, ..., s23_average]. To allow for future changes in the setup, the number of sensors is an input of the function, so the vector has shape 1 x n.
More importantly, this function is also used to extract sample data.
The radio map is stored as a numpy array to make later computations convenient.

In [9]:
"""convert string in the data to int"""
def str2int(x):
    """Return ``x`` converted with the built-in ``int``."""
    converted = int(x)
    return converted


"""define the radio map(feature vector)"""
def feature_vector(n, data_direction):
    """Build the radio map (mean RSSI per sensor) for one direction.

    :param n: number of sensors, an integer (n = 23 in this notebook); it sets
              the number of columns of the result.
    :param data_direction: sequence of DataFrames, each with the columns
              'rxaddr' (1-based sensor address, int or numeric string) and
              'rssi' (numeric).
    :return: np.ndarray of shape (m, n) with m = len(data_direction).
             Row i, column j-1 holds the mean RSSI that sensor j reported in
             data_direction[i]; sensors without any reading keep the value 0.
    :raises IndexError: if a sensor address lies outside 1..n.

    The input DataFrames are left untouched: the address conversion works on a
    derived Series instead of overwriting the 'rxaddr' column of the caller's
    frames.
    """
    feature_vec = np.zeros([len(data_direction), n])  # (m, n)

    for row, frame in enumerate(data_direction):
        if len(frame) == 0:
            # No readings at all for this position: leave the row at 0.
            continue

        # Convert addresses such as '02' to 2 without mutating the frame.
        sensor_ids = frame['rxaddr'].map(int)
        # One groupby per frame yields both the sensor ids (index) and means.
        mean_rssi = frame['rssi'].groupby(sensor_ids).mean()

        ids = mean_rssi.index.to_numpy()
        # Guard the 1-based -> 0-based shift: an address <= 0 would otherwise
        # wrap around through negative indexing and overwrite the last columns.
        if ids.min() < 1 or ids.max() > n:
            raise IndexError(
                "sensor address out of range 1..%d in element %d" % (n, row)
            )
        feature_vec[row, ids - 1] = mean_rssi.to_numpy()

    # feature_vec = (m, n), m is the number of frames in the given direction,
    # n is the number of sensors.
    return feature_vec

In [12]:
# Build one radio map per direction with n = 23 sensors. Each result has shape
# (m, n): m is the number of frames in the given direction list, n is the
# number of sensors.
radio_map_1 = feature_vector(23, dir_1)
radio_map_2 = feature_vector(23, dir_2)
radio_map_3 = feature_vector(23, dir_3)
radio_map_4 = feature_vector(23, dir_4)
# Confirm the container type and the (m, n) shape of the first radio map.
print(type(radio_map_1))
print(radio_map_1.shape)

<class 'numpy.ndarray'>
(24, 23)


In [13]:
# Row 0 of the radio map: the first element (about -74.31) is the mean RSSI of
# sensor 1, the second element that of sensor 2, and so on; 0 means the sensor
# reported no reading.
radio_map_1[0]

array([-74.30769231, -70.11111111,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

# 4. Calculate distance using p-norm
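Given a difference vector $x$, the weighted p-norm distance is computed as:
$$\lVert x \rVert_p = \left( \sum_i \frac{1}{w_i}\,\lvert x_i \rvert^{p} \right)^{1/p}$$
Here $x_i = y_i - a_i$, where $y_i$ is the feature-vector entry of a fixed (reference) position and $a_i$ is the corresponding entry of the position to be classified. A weight $w_i$ can be attached to every $x_i$. Note that the implementation below multiplies the difference by `w` before taking the norm, so the two forms coincide when all weights are 1.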

In [14]:
"""define the p-norm computation function"""

def p_norm_distance(sample_direction, feature_direction, w=None, p=2):
    """
    Compute the weighted p-norm distance between every sample and every
    reference fingerprint, and pick the closest reference for each sample.

    :param sample_direction: samples to classify, array of shape (k, n)
                             (k samples, n sensors); a single sample of shape
                             (n,) is treated as (1, n). Expected to come from
                             dat2df() + feature_vector().
    :param feature_direction: reference radio map from feature_vector(),
                              shape (m, n) (in this notebook (24, 23)).
    :param w: weights broadcastable against the sensor axis, e.g. shape (n,);
              default None means all ones.
    :param p: order of the norm, default p=2 (Euclidean distance). Passed to
              np.linalg.norm as ``ord``.
    :return: (norm_result, class_index, min_value)
             norm_result -- (k, m) array, entry [j, i] is the distance between
                            sample j and reference position i
             class_index -- (k,) array, 1-based index of the closest reference
                            position for each sample
             min_value   -- (k,) array, the corresponding smallest distance

    The distance is computed as
        distance = (sum_j |w_j * (a_j - y_j)|^p)^(1/p)
    where y_j is the average rssi of the reference from sensor j and a_j is
    the average rssi of the new data from sensor j.
    """
    samples = np.atleast_2d(np.asarray(sample_direction, dtype=float))
    references = np.atleast_2d(np.asarray(feature_direction, dtype=float))
    if w is None:
        w = np.ones(references.shape[1])

    # Broadcast to (k, m, n): one weighted difference vector per
    # (sample, reference) pair.
    weighted = w * (samples[:, np.newaxis, :] - references[np.newaxis, :, :])

    # ord=p makes the requested order effective (it was previously ignored and
    # the 2-norm was always used); axis=2 reduces over the sensor axis.
    norm_result = np.linalg.norm(weighted, ord=p, axis=2)

    class_index = np.argmin(norm_result, axis=1) + 1  # position # = index + 1
    min_value = np.amin(norm_result, axis=1)

    return norm_result, class_index, min_value

In [15]:
# Uniform weights: w is 1 for all 23 sensors.
w = np.ones([23])
# Smoke test only: radio_map_3 plays the role of the positions to be classified
# and radio_map_1 the fixed (reference) positions. p is left at its default of 2
# (2-norm, i.e. Euclidean distance). In real use, always compare data recorded
# in the same direction; otherwise the result is meaningless.
Euclidean_dis, class_index, min_value = p_norm_distance(radio_map_3, radio_map_1, w)
Euclidean_dis

array([[  8.54417836,   2.74992112,  85.77352712, 127.44826331,
        147.58692289, 143.25082086, 158.32078553, 183.04049169,
        178.26320316, 201.28016715, 197.429003  , 188.54300911,
        198.47527188, 183.04856295, 162.57309964, 185.80053601,
        187.52423535, 164.10393608, 156.53942276, 183.53997583,
        178.99978727, 200.42961157, 178.42667817, 198.02238267],
       [ 88.77948515,  86.75904524,   2.48754294,  90.08085428,
        125.01868613, 124.65371003, 146.49256545, 170.42345945,
        205.6958196 , 225.93518161, 222.51114989, 214.66594253,
        223.44000122, 209.85656292, 192.2576743 , 212.2612536 ,
        213.77169935, 193.55387477, 187.18330983, 210.28533788,
        206.33449649, 225.17777375, 205.83750894, 223.03780988],
       [183.26001363, 189.32420469, 175.2894019 , 156.36579136,
        128.47935295, 127.96557319,  92.12556212,  10.23343958,
        115.08955674, 143.11426275, 183.26865622, 172.84390796,
        185.08444641, 210.71800264, 19

In [16]:
# Classification result: for each sample, the 1-based index of the closest reference position.
class_index

array([ 2,  3,  8,  8,  8,  8,  8, 10,  9, 12, 11, 12, 11, 15, 15, 16, 18,
       18, 21, 21, 23, 24, 24, 24])

In [17]:
# Distance from each sample to its closest reference position.
min_value

array([ 2.74992112,  2.48754294, 10.23343958,  7.0752482 , 14.9109174 ,
        6.94400346,  7.3417028 , 74.53048477, 76.58447624, 89.8981359 ,
       86.02496102, 14.65070176, 86.35610119,  4.74711207,  7.53961197,
        3.06357257,  4.04318578,  4.04684253, 89.2307648 , 83.33797503,
       86.01355876, 95.19920356, 89.7782247 , 89.32088875])