In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

In [2]:
data = pd.read_csv("data.csv", index_col=0)
min_value = data.iloc[:, 5:].min().min()
data = data.fillna(min_value - 1)
data.iloc[:, 5:] = data.iloc[:, 5:] + (min_value - 1)*(-1)

In [3]:
name_mac_relation = pd.read_csv("mac_name_relation.csv", index_col=0)
good_aps = name_mac_relation[name_mac_relation['ap_name'].isin( ["Guest-CentraleSupelec", "eduroam", 'stop&go', 'CD91', 'fabrique2024'])]["ap_mac"].to_list()
columns_to_maintain  = good_aps + data.columns[:5].to_list()
data = data[data.columns.intersection(columns_to_maintain)]

In [10]:
AP_columns = list(data.columns[5:])
data = data[data["room_part"] != 5]

### Tloc Function

In [15]:
from scipy.special import gamma
from scipy.special import hyp2f1

import scipy

class TLoc:

    def __init__(self, train_data: pd.DataFrame):
        # self.non_null_minimum_percentage = 0.1
        self.train_data = train_data
        # self.aps = self.get_aps_with_non_zero_minimum_percentage(self.train_data)
        self.aps = list(self.train_data.columns[1:])
        if len(self.aps) == 1:
            self.max_power = int(self.train_data[self.aps].max())
        else:
            self.max_power = int(self.train_data[self.aps].max().max())


        self.spaces = list(self.train_data["room"].unique())

        self.power_probability_masks = {}
        self.power_prior_probability_distribution = {}
        self.eps = 1e-5


    def get_aps_with_non_zero_minimum_percentage(self, data):
        percentage_of_non_zeros = (data.iloc[:, 5:] != 0).sum() / data.shape[0]
        return data.columns[5:][percentage_of_non_zeros >= self.non_null_minimum_percentage]
    


    def get_mu_and_phi_estimation(self, data, router):
        mu = []
        phi = []
        data_of_router = data[["room", router]]
        for space in self.spaces:
            data_of_router_in_space = data_of_router[data_of_router["room"] == space]

            data_of_router_in_space_without_zero_values = data_of_router_in_space[data_of_router_in_space[router] != 0]
            if len(data_of_router_in_space_without_zero_values) == 0:
                mu.append(0.0)
            else:
                mu.append(data_of_router_in_space_without_zero_values[router].mean())
            phi.append(1 - data_of_router_in_space_without_zero_values.shape[0] / data_of_router_in_space.shape[0])

        return mu, phi



    def train(self):
        for router in self.aps:


            self.power_probability_masks[router] = {}
            self.power_prior_probability_distribution[router] = {}

            mu, phi = self.get_mu_and_phi_estimation(self.train_data, router)


            total_num_samples_in_router = self.train_data[router].shape[0]
            for power in range(0, self.max_power):
                self.power_probability_masks[router][
                    power] = self.approximate_position_density_function_given_router(power, np.array(mu),
                                                                                    np.array(phi))
                num_samples_with_value_power_in_router = (self.train_data[router] == power).sum()

                self.power_prior_probability_distribution[router][
                    power] = num_samples_with_value_power_in_router / total_num_samples_in_router

                    

    def cumulative_distribution_function_of_t_student(self, x, v):

        return 0.5 + x * gamma((v + 1) / 2) * hyp2f1(1 / 2, (v + 1) / 2, 3 / 2, -(x ** 2) / v) / (
                np.sqrt(v * np.pi) * gamma(v / 2))
    
    def cumulative_distribution_function_of_power(self, power, mu, phi, sigma, v):

        cdf = phi * np.heaviside(power, 1) + (1 - phi) * self.cumulative_distribution_function_of_t_student(
            (power - mu) / sigma, v)
        
        return cdf
    

    def approximate_position_density_function_given_router(self, power, mu, phi, sigma=5, num_samples_per_ap=30, t_score_alpha=0.05):

        v = np.ceil(num_samples_per_ap * (1 - phi) - 1)
        v = np.where(v <= 0, 1, v)

        t_score = scipy.stats.t.ppf(0.5 + t_score_alpha, v)

        density_function = self.cumulative_distribution_function_of_power(power + t_score * sigma, mu, phi, sigma, v) - self.cumulative_distribution_function_of_power(
                    power - t_score * sigma, mu, phi, sigma, v)  # power, mu, phi, sigma, v

        return density_function
    

    def pred(self, X_test):

        y_pred = []
        min_prob = self.eps * np.ones(len(self.spaces))

        for _, test_sample in X_test.iterrows():

            distribution_xy_given_bf = np.ones(len(self.spaces))

            for router in self.aps:

                    power = int(test_sample[router])

                    try:
                        prob_p_given_xybfr = self.power_probability_masks[router][power]
                    except KeyError:
                        # print(f"Error predicting router {router}, power {power}")
                        continue
                    


                    prob_p_given_xybfr = np.maximum(prob_p_given_xybfr, min_prob)
                    #prob_p_given_xybfr = prob_p_given_xybfr / prob_p_given_xybfr.sum()
                    #prob_xy_given_pbfr = prob_p_given_xybfr / (
                    #                tloc.eps + tloc.power_prior_probability_distribution[router][power])


                    distribution_xy_given_bf = distribution_xy_given_bf * prob_p_given_xybfr




            room_pred = self.spaces[distribution_xy_given_bf.argmax()]
            y_pred.append(room_pred)

        y_pred = np.array(y_pred)
        ground_truth = np.array(list(X_test["room"]))
        ac = np.sum(y_pred == ground_truth)/len(ground_truth)
        return ac, y_pred, ground_truth







# Forward Feature Selection

In [16]:
from sklearn.model_selection import train_test_split

train_data = data[data["device_id"] != "G"]


val_data ,test_data = train_test_split(data[data["device_id"] == "G" ], test_size=0.5, random_state=42)

In [19]:
best_features_for_guilherme = ['room',
 '94:d4:69:fd:b1:e6:',
 '94:d4:69:f9:7e:47:',
 '94:d4:69:f9:7f:c9:',
 '94:d4:69:f9:7d:6f:',
 '94:d4:69:f9:7d:c4:',
 '94:d4:69:fd:ac:e8:',
 '94:d4:69:f9:5b:e0:',
 '94:d4:69:fd:b1:07:',
 '94:d4:69:f6:c5:63:',
 '94:d4:69:fa:99:00:',
 '94:d4:69:fd:ae:c3:',
 '94:d4:69:fd:b1:0f:',
 '70:f3:5a:96:66:e4:',
 '94:d4:69:f6:e4:e8:',
 '94:d4:69:fd:b1:06:',
 '94:d4:69:f9:7f:c7:',
 '94:d4:69:f6:c5:6b:',
 'a4:88:73:4e:40:c0:',
 '94:d4:69:f6:b3:e0:',
 '94:d4:69:f7:90:88:']

In [None]:
best_features = ["room",
   ]


remaining_features = AP_columns




for aff in range(20):
    best_acc = -1
    best_feature = None
    best_feature_index = None

    for i, feature in enumerate(remaining_features):
        features_to_use = best_features + [feature]
        X_train_subset = train_data[features_to_use]
        X_test_subset = val_data[features_to_use]
        model = TLoc(X_train_subset)
        
        model.train()
        acc, a, b = model.pred(X_test_subset)


        if acc > best_acc:
            print(best_acc)
            best_acc = acc
            best_feature = feature
            best_feature_index = i

    print(aff)
    print(best_feature)

    best_features.append(best_feature)
    remaining_features = np.delete(remaining_features, best_feature_index)


In [20]:
model = TLoc(train_data[best_features_for_guilherme])

In [21]:
model.pred(test_data)

(0.07928679817905918,
 array(['LC410', 'LC410', 'LC410', ..., 'LC410', 'LC410', 'LC410'],
       dtype='<U5'),
 array(['LC410', 'LC413', 'LC448', ..., 'LC414', 'LC414', 'LC414'],
       dtype='<U5'))

In [22]:
test_data["room"].unique()

array(['LC410', 'LC413', 'LC448', 'LC455', 'LC426', 'LC443', 'LC416',
       'LC414', 'LC415', 'LC412', 'LC424', 'LC437', 'LC442', 'LC417'],
      dtype=object)

In [23]:
val_data["room"].unique()

array(['LC416', 'LC412', 'LC414', 'LC442', 'LC426', 'LC437', 'LC455',
       'LC410', 'LC424', 'LC415', 'LC443', 'LC417', 'LC448', 'LC413'],
      dtype=object)