# XGBoost Regression for Latitude and Longitude Prediction
This notebook demonstrates how to use XGBoost to predict 'longitude' and 'latitude' from 'hour' and 'speed' columns in Master5G.csv.

In [1]:
# Import required libraries
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

import joblib

In [2]:
# Load the dataset
df = pd.read_csv('../clean_data/Master5G.csv')
df.head()

  df = pd.read_csv('../clean_data/Master5G.csv')


Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,Retransmissions,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id
0,2022-07-03 19:43:37,Mon,2022.0,7.0,4.0,5.0,43.0,37.0,AEST,-37.737985,...,0.0,0.00708,MBytes,RX-C,0.988281,MBytes,8.29,Mbits/sec,0.0,square_94489280583
1,2022-07-03 19:43:38,Mon,2022.0,7.0,4.0,5.0,43.0,38.0,AEST,-37.738127,...,3.0,0.010645,MBytes,RX-C,0.972656,MBytes,8.16,Mbits/sec,0.0,square_94489280583
2,2022-07-03 19:43:39,Mon,2022.0,7.0,4.0,5.0,43.0,39.0,AEST,-37.738249,...,0.0,0.010645,MBytes,RX-C,0.957031,MBytes,8.03,Mbits/sec,0.0,square_94489280583
3,2022-07-03 19:43:40,Mon,2022.0,7.0,4.0,5.0,43.0,40.0,AEST,-37.738362,...,0.0,0.00708,MBytes,RX-C,0.953125,MBytes,7.99,Mbits/sec,0.014404,square_94489280583
4,2022-07-03 19:43:41,Mon,2022.0,7.0,4.0,5.0,43.0,41.0,AEST,-37.738491,...,6.0,0.007588,MBytes,RX-C,0.927734,MBytes,7.78,Mbits/sec,0.0,square_94489280583


In [10]:
# Add a new column 'svr_mean' as the mean of 'svr1', 'svr2', 'svr3', 'svr4'
df['svr_mean'] = df[['svr1', 'svr2', 'svr3', 'svr4']].mean(axis=1)
df.head()

Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id,svr_mean
0,2022-07-03 19:43:37,Mon,2022.0,7.0,4.0,5.0,43.0,37.0,AEST,-37.737985,...,0.00708,MBytes,RX-C,0.988281,MBytes,8.29,Mbits/sec,0.0,square_94489280583,31.675
1,2022-07-03 19:43:38,Mon,2022.0,7.0,4.0,5.0,43.0,38.0,AEST,-37.738127,...,0.010645,MBytes,RX-C,0.972656,MBytes,8.16,Mbits/sec,0.0,square_94489280583,35.1
2,2022-07-03 19:43:39,Mon,2022.0,7.0,4.0,5.0,43.0,39.0,AEST,-37.738249,...,0.010645,MBytes,RX-C,0.957031,MBytes,8.03,Mbits/sec,0.0,square_94489280583,35.15
3,2022-07-03 19:43:40,Mon,2022.0,7.0,4.0,5.0,43.0,40.0,AEST,-37.738362,...,0.00708,MBytes,RX-C,0.953125,MBytes,7.99,Mbits/sec,0.014404,square_94489280583,27.125
4,2022-07-03 19:43:41,Mon,2022.0,7.0,4.0,5.0,43.0,41.0,AEST,-37.738491,...,0.007588,MBytes,RX-C,0.927734,MBytes,7.78,Mbits/sec,0.0,square_94489280583,24.125


In [11]:
# Prepare features and targets
X = df[['hour', 'speed', 'min', 'svr_mean']]
y_long = df['longitude']
y_lat = df['latitude']

In [12]:
# Split data into train and test sets
X_train, X_test, y_long_train, y_long_test = train_test_split(X, y_long, test_size=0.2, random_state=42)
_, _, y_lat_train, y_lat_test = train_test_split(X, y_lat, test_size=0.2, random_state=42)

In [13]:
# Train XGBoost regressors
xgb_long = XGBRegressor()
xgb_lat = XGBRegressor()
xgb_long.fit(X_train, y_long_train)
xgb_lat.fit(X_train, y_lat_train)

In [14]:
# Predict and evaluate
y_long_pred = xgb_long.predict(X_test)
y_lat_pred = xgb_lat.predict(X_test)
mse_long = mean_squared_error(y_long_test, y_long_pred)
mse_lat = mean_squared_error(y_lat_test, y_lat_pred)
print(f'Longitude MSE: {mse_long}')
print(f'Latitude MSE: {mse_lat}')

Longitude MSE: 0.0007662692106359211
Latitude MSE: 0.0008865733892465584


In [15]:
# Save the trained models as .pt files
# Convert XGBoost models to their underlying booster and save as bytes
xgb_long.save_model('xgb_longitude_model.pt')
xgb_lat.save_model('xgb_latitude_model.pt')
print('Models saved as xgb_longitude_model.pt and xgb_latitude_model.pt')

Models saved as xgb_longitude_model.pt and xgb_latitude_model.pt


  self.get_booster().save_model(fname)


In [16]:
# Save (value, prediction) pairs for longitude and latitude to CSV
results_df = pd.DataFrame({
    'longitude_true': np.array(y_long_test),
    'longitude_pred': y_long_pred,
    'latitude_true': np.array(y_lat_test),
    'latitude_pred': y_lat_pred
})
results_df.to_csv('latlon_predictions_vs_true.csv', index=False)
results_df.head()

Unnamed: 0,longitude_true,longitude_pred,latitude_true,latitude_pred
0,144.846505,144.814117,-37.742313,-37.74881
1,144.813258,144.798798,-37.704514,-37.761131
2,144.78655,144.796707,-37.8208,-37.761673
3,144.811472,144.793839,-37.760298,-37.761715
4,144.765277,144.794708,-37.742556,-37.771053


# Hyper-parameter Tuning for XGBoost
We use RandomizedSearchCV to find the best hyper-parameters for the XGBoost regressors.

In [17]:
# Define parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Longitude
rf_long = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=20,
    cv=2,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)
rf_long.fit(X_train, y_long_train)
print('Best params for longitude (RF):', rf_long.best_params_)
print('Best score (MSE) for longitude (RF):', -rf_long.best_score_)

print("Halfway")

# Latitude
rf_lat = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=20,
    cv=2,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)
rf_lat.fit(X_train, y_lat_train)
print('Best params for latitude (RF):', rf_lat.best_params_)
print('Best score (MSE) for latitude (RF):', -rf_lat.best_score_)


Fitting 2 folds for each of 20 candidates, totalling 40 fits


22 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validat

Best params for longitude (RF): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 9}
Best score (MSE) for longitude (RF): 0.0008034389140799297
Halfway
Fitting 2 folds for each of 20 candidates, totalling 40 fits


22 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/michael/workspace/github.com/nonexstnt/COS40007-Design-Project/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    valida

Best params for latitude (RF): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 9}
Best score (MSE) for latitude (RF): 0.000923167054748341


# Train XGBoost with Best Parameters
Now we train the final models using the best parameters found.

In [18]:
xgb_long_best = XGBRegressor(**rf_long.best_params_)
xgb_lat_best = XGBRegressor(**rf_lat.best_params_)
xgb_long_best.fit(X_train, y_long_train)
xgb_lat_best.fit(X_train, y_lat_train)

# Predict and evaluate
y_long_pred_best = xgb_long_best.predict(X_test)
y_lat_pred_best = xgb_lat_best.predict(X_test)
mse_long_best = mean_squared_error(y_long_test, y_long_pred_best)
mse_lat_best = mean_squared_error(y_lat_test, y_lat_pred_best)
print(f'Longitude MSE (best): {mse_long_best}')
print(f'Latitude MSE (best): {mse_lat_best}')

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Longitude MSE (best): 0.0007285248736488201
Latitude MSE (best): 0.0008630018417371702


In [19]:
# Save the best-tuned XGBoost models
xgb_long_best.save_model('xgb_longitude_model_best.pt')
xgb_lat_best.save_model('xgb_latitude_model_best.pt')
print('Best-tuned models saved as xgb_longitude_model_best.pt and xgb_latitude_model_best.pt')

# Compare true vs predicted for best-tuned models and save to CSV
results_best_df = pd.DataFrame({
    'longitude_true': np.array(y_long_test),
    'longitude_pred_best': y_long_pred_best,
    'latitude_true': np.array(y_lat_test),
    'latitude_pred_best': y_lat_pred_best
})
results_best_df.to_csv('latlon_predictions_vs_true_best.csv', index=False)
results_best_df.head()

Best-tuned models saved as xgb_longitude_model_best.pt and xgb_latitude_model_best.pt


  self.get_booster().save_model(fname)


Unnamed: 0,longitude_true,longitude_pred_best,latitude_true,latitude_pred_best
0,144.846505,144.829712,-37.742313,-37.740967
1,144.813258,144.7995,-37.704514,-37.760456
2,144.78655,144.79921,-37.8208,-37.762333
3,144.811472,144.792023,-37.760298,-37.762768
4,144.765277,144.813141,-37.742556,-37.780411


# SVM

In [20]:
# Train SVM models for longitude and latitude
svm_long = SVR()
svm_lat = SVR()

svm_long.fit(X_train, y_long_train)
svm_lat.fit(X_train, y_lat_train)

In [21]:
# Predict and evaluate SVM results
y_long_pred_svm = svm_long.predict(X_test)
y_lat_pred_svm = svm_lat.predict(X_test)
mse_long_svm = mean_squared_error(y_long_test, y_long_pred_svm)
mse_lat_svm = mean_squared_error(y_lat_test, y_lat_pred_svm)
print(f'Longitude MSE (SVM): {mse_long_svm}')
print(f'Latitude MSE (SVM): {mse_lat_svm}')

Longitude MSE (SVM): 0.0014587360452676436
Latitude MSE (SVM): 0.001703118182074884


In [22]:
# Save (value, prediction) pairs for SVM longitude and latitude to CSV
results_svm_df = pd.DataFrame({
    'longitude_true': np.array(y_long_test),
    'longitude_pred_svm': y_long_pred_svm,
    'latitude_true': np.array(y_lat_test),
    'latitude_pred_svm': y_lat_pred_svm
})
results_svm_df.to_csv('latlon_predictions_vs_true_svm.csv', index=False)
results_svm_df.head()

Unnamed: 0,longitude_true,longitude_pred_svm,latitude_true,latitude_pred_svm
0,144.846505,144.818171,-37.742313,-37.730019
1,144.813258,144.818224,-37.704514,-37.730019
2,144.78655,144.818198,-37.8208,-37.730019
3,144.811472,144.8182,-37.760298,-37.730019
4,144.765277,144.818184,-37.742556,-37.730019


In [23]:
# Save the trained SVM models for longitude and latitude
joblib.dump(svm_long, 'svm_longitude_model.pt')
joblib.dump(svm_lat, 'svm_latitude_model.pt')
print('SVM models saved as svm_longitude_model.pt and svm_latitude_model.pt')

SVM models saved as svm_longitude_model.pt and svm_latitude_model.pt


# Hyper-parameter Tuning for SVM
We use RandomizedSearchCV to find the best hyper-parameters for the SVM regressors.

In [24]:
# Define parameter grid for SVM
svm_param_grid = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly']
}

# Longitude
grid_search_long_svm = GridSearchCV(
    SVR(),
    param_grid=svm_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1
)
grid_search_long_svm.fit(X_train, y_long_train)
print('Best params for longitude (SVM):', grid_search_long_svm.best_params_)
print('Best score (MSE) for longitude (SVM):', -grid_search_long_svm.best_score_)

# Latitude
grid_search_lat_svm = GridSearchCV(
    SVR(),
    param_grid=svm_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1
)
grid_search_lat_svm.fit(X_train, y_lat_train)
print('Best params for latitude (SVM):', grid_search_lat_svm.best_params_)
print('Best score (MSE) for latitude (SVM):', -grid_search_lat_svm.best_score_)

Fitting 3 folds for each of 75 candidates, totalling 225 fits
Best params for longitude (SVM): {'C': 0.01, 'gamma': 'scale', 'kernel': 'poly'}
Best score (MSE) for longitude (SVM): 0.0014244609155584468
Fitting 3 folds for each of 75 candidates, totalling 225 fits
Best params for longitude (SVM): {'C': 0.01, 'gamma': 'scale', 'kernel': 'poly'}
Best score (MSE) for longitude (SVM): 0.0014244609155584468
Fitting 3 folds for each of 75 candidates, totalling 225 fits
Best params for latitude (SVM): {'C': 1e-05, 'gamma': 'scale', 'kernel': 'rbf'}
Best score (MSE) for latitude (SVM): 0.0016980815695335086
Best params for latitude (SVM): {'C': 1e-05, 'gamma': 'scale', 'kernel': 'rbf'}
Best score (MSE) for latitude (SVM): 0.0016980815695335086


# Train SVM with Best Parameters
Now we train the final SVM models using the best parameters found.

In [25]:
svm_long_best = SVR(**grid_search_long_svm.best_params_)
svm_lat_best = SVR(**grid_search_lat_svm.best_params_)
svm_long_best.fit(X_train, y_long_train)
svm_lat_best.fit(X_train, y_lat_train)

# Predict and evaluate
y_long_pred_svm_best = svm_long_best.predict(X_test)
y_lat_pred_svm_best = svm_lat_best.predict(X_test)
mse_long_svm_best = mean_squared_error(y_long_test, y_long_pred_svm_best)
mse_lat_svm_best = mean_squared_error(y_lat_test, y_lat_pred_svm_best)
print(f'Longitude MSE (SVM best): {mse_long_svm_best}')
print(f'Latitude MSE (SVM best): {mse_lat_svm_best}')

Longitude MSE (SVM best): 0.001425107057490679
Latitude MSE (SVM best): 0.001703118182074884


In [26]:
# Save the best-tuned SVM predictions
results_svm_best_df = pd.DataFrame({
    'longitude_true': np.array(y_long_test),
    'longitude_pred_svm_best': y_long_pred_svm_best,
    'latitude_true': np.array(y_lat_test),
    'latitude_pred_svm_best': y_lat_pred_svm_best
})
results_svm_best_df.to_csv('latlon_predictions_vs_true_svm_best.csv', index=False)
results_svm_best_df.head()

Unnamed: 0,longitude_true,longitude_pred_svm_best,latitude_true,latitude_pred_svm_best
0,144.846505,144.817077,-37.742313,-37.730019
1,144.813258,144.817041,-37.704514,-37.730019
2,144.78655,144.81704,-37.8208,-37.730019
3,144.811472,144.817041,-37.760298,-37.730019
4,144.765277,144.81704,-37.742556,-37.730019


In [27]:
# Save the trained SVM models for longitude and latitude
joblib.dump(svm_long_best, 'svm_longitude_model_best.pt')
joblib.dump(svm_lat_best, 'svm_latitude_model_best.pt')
print('SVM models saved as svm_longitude_model_best.pt and svm_latitude_model_best.pt')

SVM models saved as svm_longitude_model_best.pt and svm_latitude_model_best.pt
