In [1]:
# 1. Imports and find all CSVs
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# Path to CSV Files
data_dir = 'data'  
file_paths = glob.glob(os.path.join(data_dir, '*-combined-kml.csv'))
print(f'Found {len(file_paths)} files:')
for fp in file_paths[:5]:
    print(' ', os.path.basename(fp))


Found 136 files:
  2022-07-04-garbo02-combined-kml.csv
  2022-07-08-garbo04-combined-kml.csv
  2022-07-21-garbo06-combined-kml.csv
  2022-07-06-garbo08-combined-kml.csv
  2022-07-19-garbo08-combined-kml.csv


In [2]:
# 2. Load just the first 3 files for initial check
sample_paths = file_paths[:3]
dfs = [pd.read_csv(p) for p in sample_paths]

for i, df in enumerate(dfs):
    print(f'\n--- File {i+1}: {os.path.basename(sample_paths[i])} ---')
    display(df.head())
    print(df.info())
    print(df.describe().T)


--- File 1: 2022-07-04-garbo02-combined-kml.csv ---


Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,Retransmissions,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id
0,1656876656,,,,,,,,,,...,,,,,,,,,,
1,1656876656,,,,,,,,,,...,,,,,,,,,,
2,1656876656,,,,,,,,,,...,,,,,,,,,,
3,1656876657,Mon,2022.0,7.0,4.0,5.0,30.0,57.0,AEST,99.0,...,,,,,,,,,0.0,
4,1656876657,Mon,2022.0,7.0,4.0,5.0,30.0,57.0,AEST,99.0,...,,,,,,,,,0.0,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33031 entries, 0 to 33030
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              33031 non-null  int64  
 1   Day               31603 non-null  object 
 2   Year              31603 non-null  float64
 3   Month             31603 non-null  float64
 4   Date              31603 non-null  float64
 5   hour              31603 non-null  float64
 6   min               31603 non-null  float64
 7   sec               31603 non-null  float64
 8   timezone          31603 non-null  object 
 9   latitude          31603 non-null  float64
 10  longitude         31603 non-null  float64
 11  speed             31603 non-null  float64
 12  truck             31603 non-null  object 
 13  svr1              28937 non-null  float64
 14  svr2              28937 non-null  float64
 15  svr3              28937 non-null  float64
 16  svr4              28937 non-null  float6

Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,Retransmissions,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id
0,1657220675,,,,,,,,,,...,,,,,,,,,,
1,1657220675,,,,,,,,,,...,,,,,,,,,,
2,1657220675,,,,,,,,,,...,,,,,,,,,,
3,1657220676,,,,,,,,,,...,,,,,,,,,0.0,
4,1657220676,,,,,,,,,,...,,,,,,,,,0.0,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26103 entries, 0 to 26102
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              26103 non-null  int64  
 1   Day               23277 non-null  object 
 2   Year              23277 non-null  float64
 3   Month             23277 non-null  float64
 4   Date              23277 non-null  float64
 5   hour              23277 non-null  float64
 6   min               23277 non-null  float64
 7   sec               23277 non-null  float64
 8   timezone          23277 non-null  object 
 9   latitude          23277 non-null  float64
 10  longitude         23277 non-null  float64
 11  speed             23277 non-null  float64
 12  truck             23277 non-null  object 
 13  svr1              22493 non-null  float64
 14  svr2              22493 non-null  float64
 15  svr3              22493 non-null  float64
 16  svr4              22493 non-null  float6

Unnamed: 0,time,Day,Year,Month,Date,hour,min,sec,timezone,latitude,...,Retransmissions,CWnd,cwnd_unit,Role-RX,Transfer size-RX,Transfer unit-RX,Bitrate-RX,bitrate_unit-RX,send_data,square_id
0,1658343035,Thu,2022.0,7.0,21.0,4.0,50.0,35.0,AEST,99.0,...,,,,,,,,,0.0,
1,1658343036,Thu,2022.0,7.0,21.0,4.0,50.0,36.0,AEST,99.0,...,,,,,,,,,0.0,
2,1658343037,Thu,2022.0,7.0,21.0,4.0,50.0,37.0,AEST,99.0,...,,,,,,,,,0.0,
3,1658343038,Thu,2022.0,7.0,21.0,4.0,50.0,38.0,AEST,99.0,...,,,,,,,,,0.0,
4,1658343039,Thu,2022.0,7.0,21.0,4.0,50.0,39.0,AEST,99.0,...,,,,,,,,,0.0,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23836 entries, 0 to 23835
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              23836 non-null  int64  
 1   Day               23012 non-null  object 
 2   Year              23012 non-null  float64
 3   Month             23012 non-null  float64
 4   Date              23012 non-null  float64
 5   hour              23012 non-null  float64
 6   min               23012 non-null  float64
 7   sec               23012 non-null  float64
 8   timezone          23012 non-null  object 
 9   latitude          23012 non-null  float64
 10  longitude         23012 non-null  float64
 11  speed             23012 non-null  float64
 12  truck             23012 non-null  object 
 13  svr1              20633 non-null  float64
 14  svr2              20633 non-null  float64
 15  svr3              20633 non-null  float64
 16  svr4              20633 non-null  float6

In [3]:
# 3. Combine samples (all files) and basic cleaning
df = pd.concat(dfs, ignore_index=True)

# convert UNIX timestamp
if 'time' in df.columns:
    df['datetime'] = pd.to_datetime(df['time'], unit='s')

# mark invalid GPS as NaN and drop them
for c in ['latitude','longitude']:
    df.loc[df[c]==99999, c] = np.nan
df.dropna(subset=['latitude','longitude'], inplace=True)
df.reset_index(drop=True, inplace=True)

print(f'After cleaning: {df.shape[0]} rows')

After cleaning: 77892 rows


In [4]:
# 5. Prepare features & find best K via silhouette score
feat_cols = ['svr1','svr2','svr3','svr4']  # adjust/add throughput cols
X = df[feat_cols].dropna()
scaler = StandardScaler().fit(X)
Xs = scaler.transform(X)

scores = []
for k in range(2,7):
    km = KMeans(n_clusters=k, random_state=42).fit(Xs)
    scores.append((k, silhouette_score(Xs, km.labels_)))
print('silhouette scores:', scores)

# pick optimal k (e.g. k=3) and assign clusters back to df
k_opt = max(scores, key=lambda x: x[1])[0]
km = KMeans(n_clusters=k_opt, random_state=42).fit(Xs)
df.loc[X.index, 'cluster'] = km.labels_
print(f'Applied KMeans with k={k_opt}')


silhouette scores: [(2, np.float64(0.8033231080515175)), (3, np.float64(0.7384243366863146)), (4, np.float64(0.7216086902347246)), (5, np.float64(0.703343358043569)), (6, np.float64(0.6870407611566827))]
Applied KMeans with k=2


In [5]:
# — pick the busiest zone
zone = df['square_id'].value_counts().idxmax()
print("Using zone:", zone)

dfz = df[df['square_id']==zone].sort_values('datetime')
start, end = dfz['datetime'].min(), dfz['datetime'].max()
span_hrs = (end - start).total_seconds() / 3600
print(f"Time span for {zone}: {span_hrs:.1f} hours ({start} → {end})")

# — choose freq: hourly if that gives ≥10 bins, else minute
freq = 'H'
n_bins = len(pd.date_range(start, end, freq=freq))
if n_bins < 10:
    freq = 'T'
print(f"Resampling at '{freq}' gives {n_bins if freq=='H' else len(pd.date_range(start, end, freq=freq))} bins")

# — build the series
ts = (
    dfz
    .set_index('datetime')['svr1']
    .resample(freq)
    .mean()
    .interpolate()
)
print(f"Final ts length: {len(ts)}")

if len(ts) < 10:
    raise RuntimeError(f"Still not enough data (got {len(ts)} points). Try combining multiple zones or a coarser time span.")

# — fit ARIMA(1,1,1)
model = sm.tsa.ARIMA(ts, order=(1,1,1)).fit()
print(model.summary())

# — forecast next 3 steps (hours or minutes)
pred = model.forecast(steps=3)
print("\nNext 3‑step forecast:")
print(pred)


Using zone: square_60129542182
Time span for square_60129542182: 4.0 hours (2022-07-07 20:26:38 → 2022-07-08 00:25:02)
Resampling at 'T' gives 239 bins
Final ts length: 240
                               SARIMAX Results                                
Dep. Variable:                   svr1   No. Observations:                  240
Model:                 ARIMA(1, 1, 1)   Log Likelihood               -1545.539
Date:                Mon, 21 Apr 2025   AIC                           3097.078
Time:                        15:18:32   BIC                           3107.507
Sample:                    07-07-2022   HQIC                          3101.281
                         - 07-08-2022                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6855      0.070     

In [8]:
# 7. Fixed LSTM‑based next‑hour prediction with safe dynamic n_steps

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 7.1 Make sure your series has at least 2 points
ts_len = len(ts)
if ts_len < 2:
    raise ValueError(f"Time series too short ({ts_len} points). Need at least 2 for any LSTM.")

# 7.2 Choose n_steps so that you get >=2 sequences
max_steps = 24
# can't use more lags than ts_len - 1
n_steps = min(max_steps, ts_len - 1)
# ensure at least 1 lag
n_steps = max(1, n_steps)
print(f'Using n_steps = {n_steps} for series length = {ts_len}')

# 7.3 Sequence builder
def make_seqs(x, n_steps):
    X, y = [], []
    for i in range(len(x) - n_steps):
        X.append(x[i : i + n_steps])
        y.append(x[i + n_steps])
    return np.array(X), np.array(y)

X_seq, y_seq = make_seqs(ts.values, n_steps)
n_samples = X_seq.shape[0]
print(f'Created {n_samples} sequences.')

# 7.4 Require at least two sequences to split
if n_samples < 2:
    raise ValueError(f"Only {n_samples} sequence(s) available; need ≥2 for train/test split.")

# 7.5 Train/test split (time‑ordered; no shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_seq, test_size=0.2, shuffle=False
)

# 7.6 Reshape to [samples, timesteps, features]
X_train = X_train.reshape((-1, n_steps, 1))
X_test  = X_test.reshape((-1, n_steps, 1))

# 7.7 Build & fit LSTM
model = Sequential([
    LSTM(50, input_shape=(n_steps, 1)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# 7.8 Evaluate
preds = model.predict(X_test)
print('Test MSE:', mean_squared_error(y_test, preds))


Using n_steps = 24 for series length = 240
Created 216 sequences.
Epoch 1/10


E0000 00:00:1745213066.149926  242121 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2025-04-21 15:24:26.150554: W tensorflow/core/framework/op_kernel.cc:1857] OP_REQUIRES failed at cudnn_rnn_ops.cc:1769 : INVALID_ARGUMENT: Dnn is not supported


InvalidArgumentError: Graph execution error:

Detected at node sequential_2_1/lstm_2_1/CudnnRNNV3 defined at (most recent call last):
  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/admin/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/admin/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/admin/anaconda3/envs/AI_Eng/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/admin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_231864/313619368.py", line 53, in <module>

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 57, in train_step

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/layers/layer.py", line 910, in __call__

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/ops/operation.py", line 58, in __call__

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/models/sequential.py", line 221, in call

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/models/functional.py", line 183, in call

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/models/functional.py", line 643, in call

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/layers/layer.py", line 910, in __call__

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/ops/operation.py", line 58, in __call__

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/layers/rnn/lstm.py", line 584, in call

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/layers/rnn/rnn.py", line 408, in call

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/layers/rnn/lstm.py", line 551, in inner_loop

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/rnn.py", line 841, in lstm

  File "/home/admin/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/rnn.py", line 933, in _cudnn_lstm

Dnn is not supported
	 [[{{node sequential_2_1/lstm_2_1/CudnnRNNV3}}]] [Op:__inference_multi_step_on_iterator_4575]

In [20]:
import tensorflow as tf

# 1. Make sure TF can see your GPU
gpus = tf.config.list_physical_devices('GPU')
assert gpus, "No GPU found – check your CUDA_VISIBLE_DEVICES and driver install!"
print("GPUs detected:", gpus)

# 2. Enable memory growth so TF doesn’t pre‑allocate all GPU RAM
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# 3. Turn on device placement logging
tf.debugging.set_log_device_placement(True)

# 4. (Optionally) wrap your model training in a GPU device context
with tf.device('/GPU:0'):
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(50, input_shape=(n_steps, 1)), 
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=3, batch_size=32)


GPUs detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


RuntimeError: Physical devices cannot be modified after being initialized