In [1]:
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd

In [None]:
#---------------------------Data Analysis--------------------------------------
#ONLY RUN THIS CELL WHEN NEEDED! It only has to be re-run when the raw scan files change.
#Each data file is named "label_attempt.json" (e.g. "0_1.json") and contains a list of
#dictionaries of the form {"MAC": mac, "RSSI": rssi}.
#This cell collects the UNION of the MAC addresses seen across all files (a MAC only has to
#appear in one file to be included) and saves it, one address per line, to mac_set.txt.
num_labels = 25
num_attempts = 10
mac_set = set()
for label in range(num_labels):
    for attempt in range(num_attempts):
        file_name = str(label) + "_" + str(attempt) + ".json"
        try:
            with open(file_name, "r") as f:
                data = json.load(f)
            #collect this file's addresses first so a malformed file contributes nothing at all
            file_macs = {item["MAC"] for item in data}
        except (OSError, ValueError, KeyError, TypeError) as err:
            #missing/unreadable file (OSError), invalid JSON (ValueError) or unexpected entry
            #layout (KeyError/TypeError): report it and keep scanning, the same way the
            #preprocessing cell skips files it cannot use
            print("Error in file: ", file_name, "-", err)
            continue
        mac_set.update(file_macs)
#sort so the saved order (and therefore the feature-column order built from it) is the same
#on every run; iterating a set of strings directly gives a hash-dependent order
mac_set = sorted(mac_set)
print("Number of MAC addresses: ", len(mac_set))
#save mac_set to a file, unless nothing was found (e.g. the cell was run from the wrong
#directory), in which case an existing mac_set.txt is left untouched
if mac_set:
    with open("mac_set.txt", "w") as f:
        f.writelines(mac + "\n" for mac in mac_set)
else:
    print("No MAC addresses found; mac_set.txt was not written.")

In [None]:
#---------------------------Data Preprocessing--------------------------------------
#Dataset dimensions. Defined here as well so this cell runs on a fresh kernel even when the
#optional Data Analysis cell is skipped; keep the values in sync with that cell.
num_labels = 25
num_attempts = 10

#read mac_set.txt and create a list (one MAC address per line, in feature-column order)
mac_set = []
with open("mac_set.txt", "r") as f:
    for line in f:
        mac_set.append(line.strip())

#MAC -> column index, so each lookup is a dict access instead of a linear list scan;
#setdefault keeps the first occurrence, matching what list.index() would return
mac_index = {}
for position, mac in enumerate(mac_set):
    mac_index.setdefault(mac, position)

#data training set: one (label, vector) pair per usable file
data_set = []
#read through all files
for label in range(num_labels):
    for attempt in range(num_attempts):
        file_name = str(label) + "_" + str(attempt) + ".json"
        try:
            with open(file_name, "r") as f:
                data = json.load(f) #list of dictionary with mac, rssi
            #start from a 0 vector with length of mac_set; MACs absent from this file stay 0
            vector = np.zeros(len(mac_set))
            #for each mac in data, write its RSSI into that mac's column
            for item in data:
                vector[mac_index[item["MAC"]]] = item["RSSI"]
        except Exception as err:
            #best effort: a missing/malformed file or a MAC that is not in mac_set.txt drops
            #the whole file. Catch Exception (not a bare except) so Ctrl-C still interrupts
            #the loop, and print the cause so the failure can be diagnosed.
            print("Error in file: ", file_name, "-", repr(err))
            continue
        #data point = (label, vector)
        data_set.append((label, vector))

#data_set to dataframe
df = pd.DataFrame(data_set, columns=["label", "vector"])
#print dataframe
print(df)

In [None]:
#---------------------------Data Visualization--------------------------------------
#Each vector is len(mac_set) long, which is far too many dimensions to plot directly, so
#standardize the vectors and project them onto their first two principal components (PCA)
#to get a 2-D view of the data.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#standardize the data (each column rescaled to zero mean / unit variance)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df["vector"].values.tolist())

#PCA down to two components
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

#plot: one point per data file, coloured by its label
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(pca_data[:, 0], pca_data[:, 1], c=df["label"], cmap="plasma")
ax.set_xlabel("First Principal Component")
ax.set_ylabel("Second Principal Component")
ax.set_title("PCA projection of RSSI vectors")
#colorbar so the point colours can be mapped back to label values
fig.colorbar(points, ax=ax, label="Label")
plt.show()