In [204]:
import pandas as pd
import numpy as np
import os
import re

In [38]:
def room_division(dataRoom):
    """
    room_division
    -------------
    take all the data and divide the rooms of all PoliTo
    ### Output:
        - the dataframe with the room divided
    """
    Ap = "AP-AULA"
    dataRoom = dataRoom[dataRoom["name_ap"].notnull()]
    dataRoom = dataRoom[dataRoom["name_ap"].str.contains(Ap)]
    rooms = pd.DataFrame()
    rooms[["AP", "Room", "APnum", "NaN"]] = dataRoom["name_ap"].str.split(
        "-", expand=True
    )
    dataRoom = pd.concat([dataRoom, rooms], axis=1)
    dataRoom = dataRoom.drop(["AP", "NaN"], axis=1)
    return dataRoom

In [201]:
dataset_dict = {
    "aula": [],
    "n_devices": [],
    "n_users": [],
    "snr_mean": [],
    "snr_std": [],
    "rssi_mean": [],
    "rssi_std": [],
    "ch_util_2_4_mean": [],
    "ch_util_5_mean": [],
    "noise_2_4_mean": [],
    "noise_5_mean": [],
    "n_people": []
}

path = "ml_data/"
regex_aula = "(AULA.+)_"
regex_n_people = "_([0-9]+)\."


for i, file in enumerate(os.listdir(path)):
    df = pd.read_csv(path + file)
    df.drop(columns=["Unnamed: 0", "Timestamp_y", "ch_2_4", "ch_5", "class", "domain", "client_type", "code_ap"], inplace=True)
    df = room_division(df)

    aula = re.search(regex_aula, file).group(1)
    df_aula = df[df["Room"] == aula]

    if len(df_aula) == 0:
        print(f"{aula} is empty. Skipping the acquisition!")
        continue
    dataset_dict["aula"].append(aula)

    # Feature extrapolation
    df_dist_ap = df_aula.drop_duplicates(subset="APnum")
    n_devices = df_dist_ap["n_clients_2_4"].sum() + df_dist_ap["n_clients_5"].sum()
    dataset_dict["n_devices"].append(n_devices)

    df_dist_users = df_aula.drop_duplicates(subset="user_masked")
    n_users = len(df_dist_users[df_dist_users["snr"] > 20])
    dataset_dict["n_users"].append(n_users)

    snr_mean = df_aula["snr"].mean()
    dataset_dict["snr_mean"].append(snr_mean)
    snr_std = df_aula["snr"].std()
    dataset_dict["snr_std"].append(snr_std)

    rssi_mean = df_aula["rssi"].mean()
    dataset_dict["rssi_mean"].append(rssi_mean)
    rssi_std = df_aula["rssi"].std()
    dataset_dict["rssi_std"].append(rssi_std)

    ch_util_2_4_mean = df_dist_ap["ch_utilization_2_4"].mean()
    dataset_dict["ch_util_2_4_mean"].append(ch_util_2_4_mean)
    ch_util_5_mean = df_dist_ap["ch_utilization_5"].mean()
    dataset_dict["ch_util_5_mean"].append(ch_util_5_mean)

    noise_2_4_mean = df_dist_ap["noise_2_4"].mean()
    dataset_dict["noise_2_4_mean"].append(noise_2_4_mean)
    noise_5_mean = df_dist_ap["noise_5"].mean()
    dataset_dict["noise_5_mean"].append(noise_5_mean)


    x = re.search(regex_n_people, file)
    n_people = int(x.group(1))
    dataset_dict["n_people"].append(n_people)

In [235]:
dataset = pd.DataFrame.from_dict(dataset_dict)
dataset

Unnamed: 0,aula,n_devices,n_users,snr_mean,snr_std,rssi_mean,rssi_std,ch_util_2_4_mean,ch_util_5_mean,noise_2_4_mean,noise_5_mean,n_people
0,AULA3P,122,87,40.188034,6.49792,-51.393162,5.997064,32.5,27.5,-84.5,-92.25,111
1,AULAR2,183,112,36.335196,7.32162,-53.027933,6.09331,37.75,5.5,-81.0,-90.5,150
2,AULA3P,53,37,40.326923,9.791395,-52.403846,9.25512,38.5,1.0,-84.75,-92.25,54
3,AULAR2B,116,74,41.044248,6.377121,-52.0,6.212028,43.25,9.75,-81.333333,-92.75,75
4,AULA2P,127,67,35.377049,12.997983,-57.647541,11.972366,44.5,7.25,-83.25,-94.0,77
5,AULAR2,190,114,35.765027,7.300044,-53.338798,6.012347,32.25,13.0,-87.75,-89.0,149
6,AULA1P,219,138,40.885845,6.43462,-49.52968,5.846883,52.75,17.0,-70.75,-91.5,182
7,AULAR3,201,116,37.247423,7.811949,-54.273196,6.253091,39.75,14.25,-81.0,-90.0,132
8,AULAR4B,117,75,42.119658,6.328864,-51.888889,6.561089,40.0,28.75,-88.0,-92.75,83
9,AULA1P,217,132,39.637681,7.182715,-50.2657,6.368302,53.5,11.75,-80.5,-89.5,172


#### Comparison with baseline

In [236]:
from sklearn.metrics import r2_score
r2_score(dataset["n_people"], dataset["n_users"])

0.6276289002122808

In [213]:
y = dataset["n_people"].to_numpy() + (np.random.rand(len(dataset)) * 0.001)
dataset.drop(["aula", "n_people"], axis=1, inplace=True)
X = dataset.to_numpy() + (np.random.rand(len(dataset), len(dataset.columns)) * 0.001)

### TODO
- Choice of regression model at this [link](https://medium.com/@dnyaneshwalwadkar/regression-a-to-z-choosing-the-correct-type-of-regression-analysis-4cfb29ae5a1)
- Cross validation [here](https://scikit-learn.org/stable/modules/cross_validation.html)

In [215]:
from sklearn.model_selection import train_test_split

#### Static split

In [239]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)


Train shape:  (22, 10)
Test shape:  (6, 10)


#### Linear regression

In [282]:
from sklearn.linear_model import LinearRegression

In [283]:
lin_reg_model = LinearRegression()
fitted = lin_reg_model.fit(X_train, y_train)

In [284]:
print("R2 train: ", fitted.score(X_train, y_train))
print("R2 test: ", fitted.score(X_test, y_test))

R2 train:  0.9760253943660981
R2 test:  0.4954207645187989


#### Cross validation

In [285]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

In [295]:
cv = ShuffleSplit(n_splits=3, test_size=0.1)
scores = cross_val_score(lin_reg_model, X, y, cv=cv)
scores

array([0.85358618, 0.87616351, 0.71187658])

In [298]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_test, fitted.predict(X_test))

0.18912284263493642

### Gaussian process regression

In [237]:
from sklearn.gaussian_process import GaussianProcessRegressor

In [300]:
gaus_reg_model = GaussianProcessRegressor()
fitted = gaus_reg_model.fit(X_train, y_train)
fitted.score(X_test, y_test)
fitted.predict(X_train)

array([ 72.00031367,  87.00096552,  82.00037546, 172.00034287,
        77.00084616,  85.00046408, 149.00058376,  83.00042904,
       182.00080001,  73.00073026,   6.00004168, 123.00052723,
        54.00012859,  71.00006909,  89.0000398 ,  82.00024771,
        73.0005123 , 192.00016408,  76.00056466, 132.00078714,
       119.00031075, 100.00031115])

In [301]:
scores = cross_val_score(gaus_reg_model, X, y, cv=cv)
scores

array([-25.40549513,  -6.65103206, -25.87028725])

In [303]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_train, fitted.predict(X_train))

1.0000001155400613e-10