In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import csv
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import datetime as dt
import xgboost as xgb
import statsmodels.api as sm

In [3]:
def json_to_csv(json_file, csv_file):
    headers = ["timestamp", "ssid", "bssid", "rssi", "crowd_count"]

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        try:
            with open(json_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
                for entry in data:
                    # converting crowd_count to integer if necessary
                    entry['crowd_count'] = int(entry.get('crowd_count', 0))
                    writer.writerow(entry)
        except json.JSONDecodeError as e:
            print(f"Error loading JSON data from {json_file}: {e}")
        except ValueError as e:
            print(f"Error processing entry in {json_file}: {e}")

json_to_csv("rssi_data_roastd.json", "rssi_data_roastd.csv")
print("Data successfully written to rssi_data_roastd.csv")

Error processing entry in rssi_hiep.json: dict contains fields not in fieldnames: '2024-09-30 14:49:44'
Data successfully written to rssi_data_roastd.csv


#### Reading csv files to Dataframes

In [17]:
def load_data(file_paths):
    all_data = []
    for path in file_paths:
        df = pd.read_csv(path)
    return df
    
file_paths = ['rssi_data_PB.csv', 'rssi_data_stbkscc.csv', 'rssi_data_ccl.csv', 'rssi_data_roastd.csv'] 
data = load_data(file_paths)
print(data.head())

             timestamp         ssid               bssid  rssi  crowd_count
0  2024-11-09 11:11:36    Roastd-JC  38:3f:b3:45:d8:a0:   -49           20
1  2024-11-09 11:11:36  newportwifi  82:8a:20:25:91:ec:   -92           20
2  2024-11-09 11:11:36  newportwifi  82:8a:20:25:94:12:   -72           20
3  2024-11-09 11:11:36  newportwifi  7e:8a:20:24:91:ec:   -68           20
4  2024-11-09 11:11:36  newportwifi  7e:8a:20:24:94:12:   -59           20


In [19]:
# Step 2: Preprocess Data
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek

label_encoders = {
    'ssid': LabelEncoder().fit(data['ssid']),
    'bssid': LabelEncoder().fit(data['bssid'])
}
data['ssid'] = label_encoders['ssid'].transform(data['ssid'])
data['bssid'] = label_encoders['bssid'].transform(data['bssid'])

# Using XGBoost

In [38]:
import xgboost as xgb

In [40]:
def load_and_preprocess_data():
    data = load_data(file_paths)
    
    # Converting timestamp to datetime and extract useful features
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data['hour'] = data['timestamp'].dt.hour
    data['day_of_week'] = data['timestamp'].dt.dayofweek
    
    # Encoding categorical features
    label_encoders = {}
    for col in ['ssid', 'bssid']:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
    
    # Droping the original timestamp column
    data = data.drop(columns=['timestamp'])
    
    # Spliting into features and target
    X = data.drop(columns=['crowd_count'])
    y = data['crowd_count']
    
    # Scaling numerical features
    scaler = StandardScaler()
    X[['rssi', 'hour', 'day_of_week']] = scaler.fit_transform(X[['rssi', 'hour', 'day_of_week']])
    
    return X, y, label_encoders

X, y, label_encoders = load_and_preprocess_data()

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training an XGBoost Regressor with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

y_pred = grid_search.best_estimator_.predict(X_test)

In [43]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

print("Best parameters found: ", grid_search.best_params_)

Mean Absolute Error (MAE): 2.230936465847925
Mean Squared Error (MSE): 8.166370484204776
Root Mean Squared Error (RMSE): 2.857686211641295
R-squared (R2): 0.20334172248840332
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300}


In [50]:
data = load_data(file_paths)

data['timestamp'] = pd.to_datetime(data['timestamp'])
data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['is_weekend'] = data['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

label_encoders = {}
for col in ['ssid', 'bssid']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

X = data[['rssi', 'hour', 'day_of_week', 'is_weekend', 'ssid', 'bssid']]
y = data['crowd_count']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [62]:
data.head()

Unnamed: 0_level_0,ssid,bssid,rssi,crowd_count,hour,day_of_week,is_weekend
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-11-09 11:11:36,55,23,-49,20,11,5,1
2024-11-09 11:11:36,138,127,-92,20,11,5,1
2024-11-09 11:11:36,138,128,-72,20,11,5,1
2024-11-09 11:11:36,138,119,-68,20,11,5,1
2024-11-09 11:11:36,138,120,-59,20,11,5,1


In [52]:
xgb_model = xgb.XGBRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=1.0; total time=   0.4s
[CV] END 

  _data = np.array(data, dtype=dtype, copy=copy,


In [54]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")
print(f"Best parameters found: {grid_search.best_params_}")

Mean Absolute Error (MAE): 2.2328830834319593
Mean Squared Error (MSE): 8.167220548254056
Root Mean Squared Error (RMSE): 2.8578349406944508
R-squared (R2): 0.20325875282287598
Best parameters found: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100, 'subsample': 1.0}
