In [9]:
from pathlib import Path

import pandas as pd
import xgboost as xgb

from modeling_old import clear_column_names, regressor

In [10]:
def load_data(path="./df_3ap_final.csv"):
    """Read a feature CSV and split it into a feature frame and a target series.

    The file is parsed with its first two rows as a two-level column header.
    The ``("throughput", "_")`` column is taken as the target; every other
    column is kept as a feature.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(features, target)`` tuple. The feature frame's columns are
        replaced by whatever ``clear_column_names`` returns for it
        (presumably flattened single-level names -- confirm in modeling_old).
    """
    target_key = ("throughput", "_")

    frame = pd.read_csv(path, header=[0, 1], index_col=None)

    target = frame[target_key]
    features = frame.drop(columns=[target_key])

    # Column labels are rewritten only after the target has been split off.
    features.columns = clear_column_names(features)

    return features, target

In [11]:
# Load the data the model is fitted on (default path) and the separate test file.
X, y = load_data()
X_test, y_test = load_data("./df_1_3ap_test_final.csv")
# Display both shapes: a different column count means the two feature sets
# are not aligned yet.
X.shape, X_test.shape

((691, 352), (105, 349))

In [12]:
def align_train_test(X_train, X_test):
    """Return a new test frame with exactly the columns of ``X_train``.

    Columns present in ``X_train`` but absent from ``X_test`` are added and
    filled with ``False``. Columns present only in ``X_test`` are dropped.
    The result follows the column order of ``X_train``.

    The caller's ``X_test`` is left untouched: a new frame is built in a
    single ``reindex`` call rather than by inserting columns one by one into
    the input.

    Args:
        X_train: Frame whose columns define the expected feature layout.
        X_test: Frame to conform to that layout.

    Returns:
        A new DataFrame with ``X_train.columns`` as its columns.
    """
    missing_cols = set(X_train.columns) - set(X_test.columns)
    print(f"Missing columns in test set: {missing_cols}")
    # fill_value only applies to the newly created columns; existing values
    # (including any NaNs already in X_test) are carried over unchanged.
    return X_test.reindex(columns=X_train.columns, fill_value=False)


# Rebind X_test to a frame whose columns match X (same labels, same order).
X_test = align_train_test(X, X_test)
# Both shapes should now report the same number of columns.
X.shape, X_test.shape

Missing columns in test set: {'mcs_nss_4_1', 'mcs_nss_9_1', 'mcs_nss_2_2'}


((691, 352), (105, 352))

In [13]:
# Inspect the test target: every value displayed below is NaN, so there appears
# to be no ground truth in the test file to score predictions against.
y_test

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
100   NaN
101   NaN
102   NaN
103   NaN
104   NaN
Name: (throughput, _), Length: 105, dtype: float64

In [14]:
# Run the project helper on the full feature/target data with the "xgboost" option.
# NOTE(review): judging by the output it reports feature importances, per-split
# train/test MSE and SHAP values -- confirm against modeling_old.regressor.
regressor(X, y, "xgboost")


Feature Importances:
                                             Feature  Importance
86        sta_to_ap_2_max_ant_rssi_in_nav_ed_percent    0.182993
340                                     mcs_nss_11_2    0.049350
46            sta_to_ap_1_max_ant_rssi_la_ed_percent    0.041655
111                 sta_to_ap_2_mean_ant_rssi_median    0.035187
189  sta_from_ap_1_max_ant_rssi_wavelet_coefficients    0.031624
..                                               ...         ...
198                 sta_from_ap_1_mean_ant_rssi_mean    0.000000
199                  sta_from_ap_1_mean_ant_rssi_var    0.000000
200             sta_from_ap_1_mean_ant_rssi_kurtosis    0.000000
205              sta_from_ap_1_mean_ant_rssi_entropy    0.000000
186   sta_from_ap_1_max_ant_rssi_approximate_entropy    0.000000

[352 rows x 2 columns]
{'mse': {'train': [213.2136344280594, 178.9522307287519, 167.7227687022988, 187.28973074701204, 183.63871253500207], 'test': [619.2022435828975, 732.5509585332553, 673.826828

{'shap_values':                                          Feature  SHAP Value  SHAP Sign
 86    sta_to_ap_2_max_ant_rssi_in_nav_ed_percent    0.868919        1.0
 45    sta_to_ap_1_max_ant_rssi_in_nav_ed_percent   -0.365517       -1.0
 6                sta_to_ap_0_max_ant_rssi_length   -0.361500       -1.0
 315        ap_from_ap_B_mean_rssi_le_nav_percent   -0.309879       -1.0
 94                 sta_to_ap_2_max_ant_rssi_mean    0.155364        1.0
 ..                                           ...         ...        ...
 240             sta_from_ap_2_mean_ant_rssi_mean    0.000000        0.0
 239              sta_from_ap_2_mean_ant_rssi_iqr    0.000000        0.0
 238            sta_from_ap_2_mean_ant_rssi_range    0.000000        0.0
 237           sta_from_ap_2_mean_ant_rssi_median    0.000000        0.0
 145  sta_from_ap_0_max_ant_rssi_grouping_entropy    0.000000        0.0
 
 [352 rows x 3 columns],
 'feature_importances':                                              Feature  Impo

In [15]:
# Fit one XGBoost regressor on all of (X, y) and predict the aligned test set.
# `xgb` comes from the notebook's import cell so that every dependency is
# declared in one place.
# NOTE(review): the hyperparameters are hard-coded here; confirm they match
# the configuration evaluated by `regressor` above.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,  # fixed seed so reruns are reproducible
)

model.fit(X, y)
# X_test must carry the same columns as X (see align_train_test).
y_test_pred = model.predict(X_test)

In [16]:
# Save predictions as a single-column CSV with a "throughput" header.
predictions_path = Path("./results/y_1_3ap_test_pred.csv")
# to_csv does not create missing directories; make sure ./results exists so the
# cell also works on a fresh checkout.
predictions_path.parent.mkdir(parents=True, exist_ok=True)

# The row index is not written, so rows are identified by position only.
y_test_pred = pd.Series(y_test_pred, name="throughput")
y_test_pred.to_csv(predictions_path, index=False)